學(xué)院網(wǎng)站建設(shè)的目的及定位全網(wǎng)營銷系統(tǒng)怎么樣
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
from numpy import fft
import pandas as pd
一、光谱分析
- 将时间序列分解为许多正弦或余弦函数的总和
- 这些函数的系数应该具有不相关的值
- 对正弦函数进行回归
光谱分析应用场景
基于光谱的拟合
基于光谱的拟合是一种常见的分析方法,它通过将实际观测到的光谱数据与已知的光谱模型进行比较和匹配,来获得对未知样品的估计或预测。该方法可以用于光谱分析、化学定量分析、物质识别等领域。
示例
# Fourier extrapolation algorithm
def fourierExtrapolation(x, n_predict, n_harm=5):
    """Extend a 1-D series using a linear trend plus low-frequency harmonics.

    The slope of a least-squares line is removed from ``x``, the remainder is
    transformed with the FFT, and the ``1 + 2 * n_harm`` bins with the smallest
    absolute frequency are summed back as cosines over the extended time axis.
    The removed slope is then added back.

    Args:
        x: 1-D array-like of observations.
        n_predict: Number of points to generate past the end of ``x``.
        n_harm: Number of harmonics kept in the model (default 5, the value
            that was previously hard-coded).

    Returns:
        A float array of length ``len(x) + n_predict``: the fitted values for
        the observed range followed by the extrapolated values.
    """
    x = np.asarray(x, dtype=float)
    n = x.size
    t = np.arange(0, n)
    p = np.polyfit(t, x, 1)         # linear trend of x; p[0] is the slope
    x_notrend = x - p[0] * t        # remove the slope (the offset stays in the DC bin)
    x_freqdom = fft.fft(x_notrend)  # detrended series in the frequency domain
    f = fft.fftfreq(n)              # frequency of each FFT bin
    # Order the bins from the lowest to the highest absolute frequency so that
    # the slice below keeps the DC bin plus the first n_harm +/- pairs.
    indexes = sorted(range(n), key=lambda i: np.absolute(f[i]))

    t = np.arange(0, n + n_predict)
    restored_sig = np.zeros(t.size)
    for i in indexes[:1 + n_harm * 2]:
        ampli = np.absolute(x_freqdom[i]) / n  # amplitude
        phase = np.angle(x_freqdom[i])         # phase
        restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)
    return restored_sig + p[0] * t


# Predict future values of a time series by fitting periodic functions
# (Fourier-transform based).
# Original time-series data used for the first extrapolation demo.
x = np.array([669, 592, 664, 1005, 699, 401, 646, 472, 598, 681, 1126, 1260, 562, 491, 714, 530, 521, 687, 776, 802, 499, 536, 871, 801, 965, 768, 381, 497, 458, 699, 549, 427, 358, 219, 635, 756, 775, 969, 598, 630, 649, 722, 835, 812, 724, 966, 778, 584, 697, 737, 777, 1059, 1218, 848, 713, 884, 879, 1056, 1273, 1848, 780, 1206, 1404, 1444, 1412, 1493, 1576, 1178, 836, 1087, 1101, 1082, 775, 698, 620, 651, 731, 906, 958, 1039, 1105, 620, 576, 707, 888, 1052, 1072, 1357, 768, 986, 816, 889, 973, 983, 1351, 1266, 1053, 1879, 2085, 2419, 1880, 2045, 2212, 1491, 1378, 1524, 1231, 1577, 2459, 1848, 1506, 1589, 1386, 1111, 1180, 1075, 1595, 1309, 2092, 1846, 2321, 2036, 3587, 1637, 1416, 1432, 1110, 1135, 1233, 1439, 894, 628, 967, 1176, 1069, 1193, 1771, 1199, 888, 1155, 1254, 1403, 1502, 1692, 1187, 1110, 1382, 1808, 2039, 1810, 1819, 1408, 803, 1568, 1227, 1270, 1268, 1535, 873, 1006, 1328, 1733, 1352, 1906, 2029, 1734, 1314, 1810, 1540, 1958, 1420, 1530, 1126, 721, 771, 874, 997, 1186, 1415, 973, 1146, 1147, 1079, 3854, 3407, 2257, 1200, 734, 1051, 1030, 1370, 2422, 1531, 1062, 530, 1030, 1061, 1249, 2080, 2251, 1190, 756, 1161, 1053, 1063, 932, 1604, 1130, 744, 930, 948, 1107, 1161, 1194, 1366, 1155, 785, 602, 903, 1142, 1410, 1256, 742, 985, 1037, 1067, 1196, 1412, 1127, 779, 911, 989, 946, 888, 1349, 1124, 761, 994, 1068, 971, 1157, 1558, 1223, 782, 2790, 1835, 1444, 1098, 1399, 1255, 950, 1110, 1345, 1224, 1092, 1446, 1210, 1122, 1259, 1181, 1035, 1325, 1481, 1278, 769, 911, 876, 877, 950, 1383, 980, 705, 888, 877, 638, 1065, 1142, 1090, 1316, 1270, 1048, 1256, 1009, 1175, 1176, 870, 856, 860])
# How many points past the end of the series to forecast.
n_predict = 100

# Fit the observed series and extend it by n_predict points.
extrapolation = fourierExtrapolation(x, n_predict)

# Two curves on one axes: the observed series x (thick blue) and the
# extrapolated result (red).
plt.plot(np.arange(x.size), x, 'b', label='x', linewidth=3)
plt.plot(
    np.arange(extrapolation.size),
    extrapolation,
    'r',
    label='extrapolation',
)
# Legend so the two curves can be told apart.
plt.legend()
# Forecast the airline-passenger series with Fourier extrapolation and plot
# the observed data next to the forecast.

# CSV file holding the airline-passenger time series.
air_passengers = pd.read_csv('/home/mw/input/demo2813/AirPassengers.csv')

# Passenger counts copied out of the frame into a NumPy array.
x = np.array(air_passengers['#Passengers'].values)

# Number of points to extrapolate.
n_predict = 300
extrapolation = fourierExtrapolation(x, n_predict)

# Observed series in blue (thick), extrapolation in red.
plt.plot(np.arange(x.size), x, 'b', label='x', linewidth=3)
plt.plot(
    np.arange(extrapolation.size),
    extrapolation,
    'r',
    label='extrapolation',
)
# Legend distinguishing the observed data from the extrapolation.
plt.legend()
!pip install pandas-datareader -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install tqdm -i https://pypi.tuna.tsinghua.edu.cn/simple
二、聚类和分类
距离度量
在机器学习和数据挖掘中,分类和聚类是两种常见的任务。虽然它们的目标和方法有所不同,但两者都经常涉及到数据点之间的距离度量。距离度量标准的选择对于分类和聚类的效果至关重要,因为它决定了数据点之间的相似性或差异性的计算方式。
应用
基于DTW的聚类
基于DTW的最近邻分类法
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
from pandas_datareader.data import DataReader
from datetime import datetime
from scipy.cluster.hierarchy import dendrogram, linkage
from pandas_datareader.data import DataReader
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from math import sqrt
from scipy.spatial.distance import squareform
from tqdm import tqdm#讀取文件
# Load the 50words test split.
# NOTE(review): the header row shown by words.head() looks like a data row, so
# the file may have no header line (header=None) - confirm before changing,
# because the recorded shapes below depend on the current behaviour.
words = pd.read_csv('/home/mw/input/demo2813/50words_TEST.csv')

# Every column except the first one, as a NumPy matrix.
# `.ix` and `.as_matrix()` raised DeprecationWarning / FutureWarning in the
# original notebook (which recommended `.iloc` and `.values`), so the
# positional indexer and `.values` are used instead.
test = words.iloc[:, 1:].values

test.shape
# (454, 270)


def a(x, y):
    """Return the sum of the first elements of ``x`` and ``y``."""
    return x[0] + y[0]


# Next: a function computing the dynamic time warping (DTW) distance between
# two sequences.
# DTW measures the similarity of two sequences and tolerates warping and
# shifts along the time axis.
def DTWDistance(s1, s2):
    """Return the dynamic time warping (DTW) distance between two sequences.

    The local cost is the squared difference of the paired values; the result
    is the square root of the cheapest accumulated cost over all monotone
    alignments of ``s1`` and ``s2``.

    Args:
        s1: First 1-D sequence of numbers.
        s2: Second 1-D sequence of numbers (may differ in length from ``s1``).

    Returns:
        The DTW distance as a float. Two empty sequences give 0.0; exactly one
        empty sequence gives ``inf``.
    """
    # Work in float so unsigned / small integer dtypes cannot wrap around when
    # values are subtracted and squared.
    s1 = np.asarray(s1, dtype=float)
    s2 = np.asarray(s2, dtype=float)
    n, m = len(s1), len(s2)

    # DTW[i, j] is the cheapest accumulated cost of aligning s1[:i] with s2[:j].
    DTW = np.full((n + 1, m + 1), np.inf)
    DTW[0, 0] = 0.0

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            dist = (s1[i - 1] - s2[j - 1]) ** 2
            DTW[i, j] = dist + min(DTW[i - 1, j], DTW[i, j - 1], DTW[i - 1, j - 1])

    return np.sqrt(DTW[n, m])


# Use the DTW distance to compute the distance between every pair of samples
# in the test set.
# size = test.shape[0]
# distance_matrix = np.zeros((size, size))
#
# for i in tqdm(range(size), desc="計算DTW距離"):
#     for j in range(i, size):
#         distance_matrix[i, j] = DTWDistance(test[i], test[j])
#         distance_matrix[j, i] = distance_matrix[i, j]
#
# # Number of rows and columns of distance_matrix
# distance_matrix.shape
#
# # Hierarchical clustering of the distance matrix with Ward's method.
# # NOTE(review): scipy's linkage() expects a condensed distance matrix (or raw
# # observations); a square matrix is presumably treated as observations.
# # squareform() is imported above and would do the conversion - confirm.
# z = linkage(distance_matrix, 'ward')
# z
# np.savetxt('linkage_matrix.txt', z)

# ----------------------------------------------------------------------------
# The commented-out section ends here.
# The code below reads the data that was pre-computed and stored in the
# project directory.
# ----------------------------------------------------------------------------
from scipy.cluster.hierarchy import dendrogram

# Load the linkage matrix (pre-stored data).
z = np.loadtxt('linkage_matrix.txt')
dendrogram(z)
plt.title('層次聚類樹狀圖')
plt.xlabel('樣本索引')
plt.ylabel('聚類距離')
plt.show()

# Show the first few rows of the data.
words.head()
4 | -0.89094 | -0.86099 | -0.82438 | -0.78214 | -0.73573 | -0.68691 | -0.63754 | -0.58937 | -0.54342 | ... | -0.86309 | -0.86791 | -0.87271 | -0.87846 | -0.88592 | -0.89619 | -0.90783 | -0.91942 | -0.93018 | -0.93939 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12 | -0.78346 | -0.68562 | -0.58409 | -0.47946 | -0.37398 | -0.27008 | -0.17225 | -0.087463 | -0.019191 | ... | -0.88318 | -0.89189 | -0.90290 | -0.91427 | -0.92668 | -0.93966 | -0.95244 | -0.96623 | -0.9805 | -0.99178 |
1 | 13 | -1.32560 | -1.28430 | -1.21970 | -1.15670 | -1.09980 | -1.04960 | -1.01550 | -0.996720 | -0.985040 | ... | -0.83499 | -0.86204 | -0.88559 | -0.90454 | -0.93353 | -0.99135 | -1.06910 | -1.13680 | -1.1980 | -1.27000 |
2 | 23 | -1.09370 | -1.04200 | -0.99840 | -0.95997 | -0.93997 | -0.93764 | -0.92649 | -0.857090 | -0.693320 | ... | -0.72810 | -0.74512 | -0.76376 | -0.78068 | -0.80593 | -0.84350 | -0.89531 | -0.96052 | -1.0509 | -1.12830 |
3 | 4 | -0.90138 | -0.85228 | -0.80196 | -0.74932 | -0.69298 | -0.63316 | -0.57038 | -0.506920 | -0.446040 | ... | -0.95452 | -0.97322 | -0.98984 | -1.00520 | -1.01880 | -1.02960 | -1.03700 | -1.04110 | -1.0418 | -1.04030 |
4 | 13 | -1.24470 | -1.22000 | -1.16940 | -1.09130 | -0.98968 | -0.86828 | -0.73462 | -0.595370 | -0.457100 | ... | -0.59899 | -0.69078 | -0.78410 | -0.87322 | -0.95100 | -1.01550 | -1.07050 | -1.12200 | -1.1728 | -1.21670 |
# Create a new column named 'type' holding a copy of the first column of
# `words`. The previous code read `.ix[:, 1]`, i.e. the second column, so
# 'type' held a series value instead of the first column's integers and the
# filter below kept every row (recorded shape was (454, 272)).
# `.ix` is also deprecated; `.iloc` is the positional replacement.
words['type'] = words.iloc[:, 0]

# Keep only the rows whose 'type' value is below 5.
w = words[words['type'] < 5]

# Number of rows and columns of the filtered frame.
w.shape

# Plot the first row of w from the second column on. Rows are addressed by
# position because the filter removes index labels, and the trailing 'type'
# column is left out so it is not drawn as a data point.
w.iloc[0, 1:-1].plot()
# Plot the third row of w the same way.
w.iloc[2, 1:-1].plot()