深圳制作网站培训学校陕西seo快速排名
Deep Learning
- 1. 回归算法思路
- 2. 代码
- 2.1 基础操作
- 2.2 定义相关函数
- 2.3.1 定义图像绘制函数
- 2.3.2 数据集加载及预处理
- 2.3.3 构造数据加载器
- 2.3.4 构建前馈神经网络(Feedforward Neural Network)模型
- 2.3.5 神经网络的训练过程
- 2.3.6 模型评估
- 2.3.7 模型测试
- 2.3.8 模型初始化
- 2.3 模型运行
1. 回归算法思路
基于3层神经网络的回归优化
2. 代码
2.1 基础操作
切换文件路径,并创建新的文件夹:
%cd /content/drive/MyDrive
#change directory to google drive
#!mkdir ML2023
#make a directory named ML2023
%cd ./ML2023
#change directory to ML2023
/content/drive/MyDrive
/content/drive/MyDrive/ML2023
查看当前路径下的文件:
!ls
covid.test.csv covid.train.csv models
显示当前文件路径:
!pwd #output the current directory
/content/drive/MyDrive/ML2023
文件下载:
# Download Data
tr_path = 'covid.train.csv' # path to training data
tt_path = 'covid.test.csv' # path to testing data!gdown --id '19CCyCgJrUxtvgZF53vnctJiOJ23T5mqF' --output covid.train.csv
!gdown --id '1CE240jLm2npU-tdz81-oVKEF3T2yfT1O' --output covid.test.csv
Downloading…
From: https://drive.google.com/uc?id=19CCyCgJrUxtvgZF53vnctJiOJ23T5mqF
To: /content/covid.train.csv
100% 2.00M/2.00M [00:00<00:00, 31.7MB/s]
Downloading…
From: https://drive.google.com/uc?id=1CE240jLm2npU-tdz81-oVKEF3T2yfT1O
To: /content/covid.test.csv
100% 651k/651k [00:00<00:00, 10.2MB/s]
导入所需要的相关包:
# Import Some Packages
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader# For data preprocess
import numpy as np
import csv
import os# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure# For feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
2.2 定义相关函数
2.3.1 定义图像绘制函数
def get_device():
    """Return 'cuda' if a CUDA device is available, otherwise 'cpu'."""
    return 'cuda' if torch.cuda.is_available() else 'cpu'


def plot_learning_curve(loss_record, title=''):
    """Plot the learning curve of the DNN (train and dev loss).

    Args:
        loss_record: dict with the keys 'train' and 'dev', each holding a
            sequence of loss values. The dev curve is skipped when
            ``loss_record['dev']`` is empty.
        title: text appended to the figure title.
    """
    train_loss = loss_record['train']
    dev_loss = loss_record['dev']

    # One x position per recorded training loss value.
    x_1 = range(len(train_loss))

    figure(figsize=(6, 4))
    plt.plot(x_1, train_loss, c='tab:red', label='train')
    if len(dev_loss) != 0:
        # Spread the dev points evenly over the training-step axis. The step
        # is clamped to 1 (a slice step of 0 raises ValueError) and the x
        # values are cut to the number of dev points so both sequences match
        # when the train length is not an exact multiple of the dev length.
        step = max(len(train_loss) // len(dev_loss), 1)
        x_2 = x_1[::step][:len(dev_loss)]
        plt.plot(x_2, dev_loss, c='tab:cyan', label='dev')
    plt.ylim(0.0, 20.0)  # restrict the y axis to the range [0, 20]
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')  # NOTE(review): original author asks "RMSE?"
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()

# Scatter plot of the prediction results
def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    """Scatter-plot the DNN's predictions against the ground-truth values.

    Args:
        dv_set: iterable yielding ``(x, y)`` batches of the dev (validation)
            set; only read when predictions have to be computed here.
        model: network called as ``model(x)`` to produce predictions.
        device: device the batches are moved to before the forward pass.
        lim: upper limit of both axes (default 35).
        preds: optional precomputed predictions.
        targets: optional ground-truth values matching ``preds``.

    If either ``preds`` or ``targets`` is None, both are recomputed from
    ``dv_set`` with the model in eval mode.
    """
    if preds is None or targets is None:
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:  # x: input features, y: ground-truth targets
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    # Diagonal reference line: points on it are perfect predictions.
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()
2.3.2 数据集加载及预处理
class COVID19Dataset(Dataset):
    """Dataset for loading and preprocessing the COVID19 CSV files.

    The header row and the first column of the CSV are dropped. In 'train'
    and 'dev' mode the last remaining column is the regression target and
    rows whose index is a multiple of 5 form the dev split (all other rows
    form the train split). In 'test' mode there is no target.

    Args:
        path: path of the CSV file to read.
        mode: one of 'train', 'dev' or 'test'.
        target_only: when False, feature columns 0..92 are used; when True,
            only the hard-coded 14-column subset listed in ``__init__``.

    Raises:
        ValueError: if ``mode`` is not 'train', 'dev' or 'test'.
    """

    def __init__(self, path, mode='train', target_only=False):
        if mode not in ('train', 'dev', 'test'):
            # Without this check an unknown mode would fail later with a
            # NameError on the undefined split indices.
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Read data into a numpy array; newline='' is what the csv module
        # expects for files handed to csv.reader.
        with open(path, 'r', newline='') as fp:
            data = list(csv.reader(fp))
        # Drop the header row and the first column.
        data = np.array(data[1:])[:, 1:].astype(float)

        if not target_only:
            # All feature columns; the column after them is the target.
            feats = list(range(93))
        else:
            # TODO: Using 40 states & 2 tested_positive features (indices = 57 & 75)
            # Hard-coded subset (original note: "sklean mutual info").
            feats = [40, 41, 42, 43, 57, 58, 59, 60, 61, 75, 76, 77, 78, 79]

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]

            # Per-column statistics are taken over train + dev together,
            # before the split; keepdim=True keeps them as 1 x dim tensors.
            all_rows = torch.FloatTensor(data)
            self.mean = all_rows.mean(dim=0, keepdim=True)
            self.std = all_rows.std(dim=0, keepdim=True)

            # Splitting training data into train & dev sets
            if mode == 'train':
                indices = [i for i in range(len(data)) if i % 5 != 0]
            else:
                indices = [i for i in range(len(data)) if i % 5 == 0]

            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        self.dim = self.data.shape[1]  # number of feature columns
        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        """Return ``(features, target)`` in train/dev mode, else ``features``."""
        if self.mode in ['train', 'dev']:
            return self.data[index], self.target[index]
        # Testing data has no target.
        return self.data[index]

    def __len__(self):
        """Return the number of samples in this split."""
        return len(self.data)

    def normalization(self, mean=None, std=None):
        """Standardize ``self.data`` in place and return the statistics used.

        In 'train' and 'dev' mode the dataset's own ``self.mean`` and
        ``self.std`` are used and the arguments are ignored. In 'test' mode
        the caller must pass the statistics obtained from the training data.

        Args:
            mean: per-column mean, only used in 'test' mode.
            std: per-column standard deviation, only used in 'test' mode.

        Returns:
            The ``(mean, std)`` pair that was applied.

        Raises:
            ValueError: in 'test' mode when ``mean`` or ``std`` is None.
        """
        if self.mode in ('train', 'dev'):
            mean, std = self.mean, self.std
        elif mean is None or std is None:
            raise ValueError(
                "mean and std must be provided to normalize the 'test' set")
        self.data = (self.data - mean) / std
        return mean, std
Z-Score
标准化(标准差标准化): 将数据平移并缩放为均值为0、标准差为1的分布
- 计算特征的均值(mean): $\mu = \frac{1}{N} \sum_{i=1}^{N} x_i$
- 计算特征的标准差(standard deviation): $\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^{N} (x_i - \mu)^2}$ (注意:代码中的 torch.std 默认使用除以 $N-1$ 的无偏估计,与此处除以 $N$ 的公式略有差别)
- 对每个数据点 $x_i$,应用以下标准化公式: $z_i = \frac{x_i - \mu}{\sigma}$
- 数据的均值为0,即标准化后的数据集的均值接近于0。
- 数据的标准差为1,即标准化后的数据集的标准差接近于1。
- 数据的分布形状不会改变,只是尺度和位置发生了变化。
Z-Score 标准化适用于许多统计和机器学习算法,特别是对于需要计算距离或涉及梯度下降等数值计算的算法。通过标准化,可以确保不同特征的尺度不会对模型的训练产生不适当的影响,帮助模型更快地收敛并提高性能。
Min-Max
标准化(最小-最大值缩放):
- 计算特征的最小值: $x_{\text{min}} = \min(x_1, x_2, \ldots, x_N)$
- 计算特征的最大值: $x_{\text{max}} = \max(x_1, x_2, \ldots, x_N)$
- 对每个数据点 $x_i$,应用以下标准化公式: $x_i' = \frac{x_i - x_{\text{min}}}{x_{\text{max}} - x_{\text{min}}}$
- $\sigma$ 表示标准差。
- $N$ 表示数据点的总数。
- $x_i$ 表示数据集中的第 $i$ 个数据点。
- $\mu$ 表示数据集的均值(平均值),计算方式为: $\mu = \frac{1}{N} \sum_{i=1}^{N} x_i$
2.3.3 构造数据加载器
构造数据加载器,用于训练、验证或测试机器学习模型。
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False, mean=None, std=None):
    """Generate a dataset, normalize it, and put it into a dataloader.

    Args:
        path: path of the CSV data file.
        mode: dataset mode, passed to ``COVID19Dataset``; the loader shuffles
            only when it is 'train'.
        batch_size: mini-batch size of the dataloader.
        n_jobs: number of worker processes of the dataloader.
        target_only: passed to ``COVID19Dataset`` to select the feature set.
        mean: statistics forwarded to ``dataset.normalization``.
        std: statistics forwarded to ``dataset.normalization``.

    Returns:
        ``(dataloader, mean, std)`` where ``mean`` and ``std`` are the values
        returned by ``dataset.normalization``.
    """
    # Construct dataset
    dataset = COVID19Dataset(path, mode=mode, target_only=target_only)
    mean, std = dataset.normalization(mean, std)

    # The dataloader splits the dataset into mini-batches for the model.
    dataloader = DataLoader(
        dataset, batch_size,
        # Reshuffle before every epoch, for the training set only.
        shuffle=(mode == 'train'),
        drop_last=False,
        num_workers=n_jobs,
        pin_memory=True)
    return dataloader, mean, std
2.3.4 构建前馈神经网络(Feedforward Neural Network)模型
class NeuralNet(nn.Module):
    """A simple fully-connected deep neural network for scalar regression.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        # Define neural network here
        # TODO: How to modify this model to achieve better performance?
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 1),  # single linear output unit for regression
        )

        # Mean squared error loss, averaged over the batch.
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        """Given input of size (batch_size x input_dim), compute the output.

        The trailing size-1 dimension of the network output is squeezed away.
        """
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target, l1_lambda=0.0):
        """Calculate the MSE loss plus an L1 penalty on the parameters.

        Args:
            pred: predictions of the network.
            target: ground-truth target values.
            l1_lambda: strength of the L1 regularization; 0 gives plain MSE.

        Returns:
            ``MSE(pred, target) + l1_lambda * sum(|p|)`` over every parameter
            of this module (weights and biases alike).
        """
        # TODO: you may implement L2 regularization here
        loss = self.criterion(pred, target)

        # L1 regularization over this module's own parameters; the penalty
        # must not depend on module-level `model` / `device` globals.
        l1_reg = sum(param.abs().sum() for param in self.parameters())

        # Tensor on the left so a numpy scalar l1_lambda cannot take over
        # the multiplication.
        return loss + l1_reg * l1_lambda
2.3.5 神经网络的训练过程
- 设置训练超参数和优化器: 代码从配置字典 config 中获取训练超参数,包括最大的训练周期数(n_epochs)、优化器类型(optimizer)、以及优化器的超参数(optim_hparas)。然后,通过 Python 内置的 getattr 函数从 torch.optim 中取得相应类型的优化器(如 Adam、SGD 等)。
- 初始化记录器和计数器: 代码初始化了一些变量,包括损失记录器 loss_record,用于记录训练损失(每个批次一条)和开发(验证)集损失(每个训练周期一条),以及用于早停(Early Stopping)的计数器 early_stop_cnt。
- 开始训练循环: 代码进入了一个训练循环,该循环在最大训练周期数内运行,或者在出现早停情况下提前结束训练。
- 模型训练: 在每个训练周期内,代码设置模型为训练模式(model.train()),然后迭代训练数据集中的每个批次。
- 验证集评估: 每个训练周期结束后,代码使用验证集(开发集)对模型进行评估,计算验证集上的均方误差。如果验证集上的均方误差小于之前的最小值(min_mse),则保存模型参数,并重置早停计数器 early_stop_cnt。这有助于防止过拟合,并在性能改善时保存模型。
- 早停策略: 如果连续超过 early_stop 个训练周期都没有性能改善(验证集损失不再降低),训练过程将提前结束。
- 训练结束: 训练结束后,代码打印出训练周期数,然后返回最小的验证集均方误差和损失记录器 loss_record。
def train(tr_set, dv_set, model, config, device):
    """Train the DNN with early stopping on the dev set.

    Args:
        tr_set: dataloader yielding ``(x, y)`` training batches.
        dv_set: dataloader of the dev (validation) set, passed to ``dev``.
        model: network providing ``cal_loss(pred, target, l1_lambda)``.
        config: dict read for the keys 'n_epochs', 'optimizer',
            'optim_hparas', 'l1_lambda', 'early_stop' and 'save_path'.
        device: device the batches are moved to.

    Returns:
        ``(min_mse, loss_record)``: the lowest dev loss seen and a dict with
        the per-batch training losses ('train') and per-epoch dev losses
        ('dev').
    """
    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer: look the class up by name in torch.optim.
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])

    # Start from +inf so the first evaluated epoch always writes a
    # checkpoint, whatever the scale of the loss.
    min_mse = float('inf')
    loss_record = {'train': [], 'dev': []}  # for recording training loss
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()  # set model to training mode
        for x, y in tr_set:  # iterate through the dataloader
            optimizer.zero_grad()  # set gradient to zero
            x, y = x.to(device), y.to(device)  # move data to device (cpu/cuda)
            pred = model(x)  # forward pass (compute output)
            # Training objective as computed by cal_loss with the
            # configured L1 coefficient.
            loss = model.cal_loss(pred, y, config['l1_lambda'])
            loss.backward()  # compute gradient (backpropagation)
            optimizer.step()  # update model with optimizer
            loss_record['train'].append(loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, val_loss = {:.4f})'
                  .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training once the model has gone more than
            # config['early_stop'] epochs without improving.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record
2.3.6 模型评估
def dev(dv_set, model, device, l1_lambda=0.0):
    """Evaluate the model on the dev (validation) set.

    Args:
        dv_set: dataloader yielding ``(x, y)`` batches; ``dv_set.dataset``
            must support ``len``.
        model: network providing ``cal_loss(pred, target, l1_lambda)``.
        device: device the batches are moved to.
        l1_lambda: L1 coefficient forwarded to ``model.cal_loss``. It is an
            explicit argument instead of a read from a module-level
            ``config``; with the default of 0 no L1 term is added to the
            reported validation loss.

    Returns:
        The per-sample average of the batch losses over the whole dev set.
    """
    model.eval()  # set model to evaluation mode
    total_loss = 0
    for x, y in dv_set:  # iterate through the dataloader
        x, y = x.to(device), y.to(device)  # move data to device (cpu/cuda)
        with torch.no_grad():  # disable gradient calculation
            pred = model(x)  # forward pass (compute output)
            loss = model.cal_loss(pred, y, l1_lambda)  # compute loss
        # Weight each batch by its size so a smaller last batch is not
        # over-represented in the average.
        total_loss += loss.detach().cpu().item() * len(x)
    return total_loss / len(dv_set.dataset)  # compute averaged loss
2.3.7 模型测试
def test(tt_set, model, device):
    """Run the model over the test dataloader and collect its predictions.

    Args:
        tt_set: dataloader yielding input batches ``x`` (no targets).
        model: network called as ``model(x)``.
        device: device the batches are moved to.

    Returns:
        A numpy array with the predictions of all batches concatenated
        along dimension 0.
    """
    model.eval()  # set model to evaluation mode
    preds = []
    with torch.no_grad():  # disable gradient calculation
        for x in tt_set:  # iterate through the dataloader
            x = x.to(device)  # move data to device (cpu/cuda)
            pred = model(x)  # forward pass (compute output)
            preds.append(pred.detach().cpu())  # collect prediction
    # Concatenate all predictions and convert to a numpy array.
    preds = torch.cat(preds, dim=0).numpy()
    return preds
2.3.8 模型初始化
初始化训练过程中需要的设备、目录和超参数配置,以便在训练模型之前进行必要的准备工作。
device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/
target_only = True                    # TODO: Using 40 states & 2 tested_positive features

# Seeded tiny Gaussian jitter that is added to the L1 coefficient below.
seed = 459
np.random.seed(seed)
delta = np.random.normal(loc=0, scale=0.000001)

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 3000,                # maximum number of epochs
    'batch_size': 270,               # mini-batch size for dataloader
    'optimizer': 'Adam',             # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.003,                 # learning rate of Adam
        # 'weight_decay': 1e-8       # weight decay (L2 regularization)
    },
    'l1_lambda': 1e-5 + delta,       # L1 regularization
    'early_stop': 200,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth'  # your model will be saved here
}

myseed = 42069  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)
2.3 模型运行
# Build the train, dev and test dataloaders. The (mean, std) pair returned
# by the 'train' call is forwarded to the 'dev' and 'test' calls; the extra
# return values of those two calls are discarded.
tr_set, mean, std = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set, _, _ = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only, mean=mean, std=std)
tt_set, _, _ = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only, mean=mean, std=std)
Finished reading the train set of COVID19 Dataset (2160 samples found, each dim = 14)
Finished reading the dev set of COVID19 Dataset (540 samples found, each dim = 14)
Finished reading the test set of COVID19 Dataset (893 samples found, each dim = 14)
# Train a fresh network; the input size is the feature count of the training dataset.
model = NeuralNet(tr_set.dataset.dim).to(device) # Construct model and move to device
model_loss, model_loss_record = train(tr_set, dv_set, model, config, device)
# Plot the recorded train / dev losses.
plot_learning_curve(model_loss_record, title='deep model')
# Drop the trained instance and rebuild the network from the checkpoint that
# train() wrote to config['save_path'].
del model
model = NeuralNet(tr_set.dataset.dim).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu') # Load your best model
model.load_state_dict(ckpt)
# Only plot when the dev dataloader yields at least one batch.
if len(dv_set) > 0:plot_pred(dv_set, model, device) # Show prediction on the validation set
def save_pred(preds, file):
    """Save predictions to the specified CSV file.

    Writes a header row ``id,tested_positive`` followed by one row per
    prediction, where ``id`` is the prediction's position in ``preds``.

    Args:
        preds: iterable of predicted values.
        file: path of the CSV file to write (overwritten if it exists).
    """
    print('Saving results to {}'.format(file))
    # newline='' leaves line endings to csv.writer; without it platforms
    # that translate '\n' end up with an extra '\r' in every row.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        writer.writerows(enumerate(preds))


preds = test(tt_set, model, device)  # predict COVID-19 cases with your model
save_pred(preds, 'COVID-19 pred.csv')  # save prediction file to COVID-19 pred.csv