"""Hung-yi Lee Machine Learning 2026 -- Homework 1: COVID-19 case prediction.

HW1 notes: the US states are one-hot encoded and the loss is MSE (mean
squared error).

Task description
    Given the survey/test results of a US state over the past 5 days, predict
    the percentage of new tested-positive cases on day 5.

Data
    * Training set: 118 columns = id (1) + states (37) + COVID symptoms,
      behaviour indicators, mental-health indicators and the daily
      tested-positive rate (16 features x 5 days).
    * Test set: 117 columns (the day-5 tested-positive rate is missing).

Learning objectives
    * Solve a regression problem with deep neural networks (DNN).
    * Understand basic DNN training tips.
    * Get familiar with PyTorch.

Downloading the data set (notebook shell commands)
    The Google Drive file with ID 1kLSW_-cW2Huj7bh84YTdimGBOJaODiOS is saved as
    covid.train.csv (training set) and the file with ID
    1iiI5qROrAhZn-o4FPqsE97bMzDEFvIdg is saved as covid.test.csv (test set):

        !gdown --id 1kLSW_-cW2Huj7bh84YTdimGBOJaODiOS --output covid.train.csv
        !gdown --id 1iiI5qROrAhZn-o4FPqsE97bMzDEFvIdg --output covid.test.csv
"""

# ---------------------------------------------------------------------------
# Imports
# ---------------------------------------------------------------------------
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter


# ---------------------------------------------------------------------------
# Some Utility Functions (you do not need to modify this part)
# ---------------------------------------------------------------------------
# cuDNN is NVIDIA's deep-learning acceleration library; PyTorch frequently
# calls into it when training on a GPU.
#   * cudnn.deterministic = True asks for deterministic algorithms where
#     possible.
#   * cudnn.benchmark = True lets cuDNN pick the fastest algorithm, which can
#     hurt run-to-run stability of the results, so it is switched off here.

def same_seed(seed):
    """Fix the random number generator seeds for reproducibility.

    Args:
        seed: Integer seed applied to NumPy and PyTorch (CPU and, when
            available, every CUDA device).
    """
    # Keep GPU computation as stable as possible.
    torch.backends.cudnn.deterministic = True
    # Disable cuDNN's automatic selection of the fastest algorithm.
    torch.backends.cudnn.benchmark = False
    # Fix NumPy's random seed.
    np.random.seed(seed)
    # Fix PyTorch's CPU random seed.
    torch.manual_seed(seed)
    # If a GPU is present, fix PyTorch's GPU random seeds as well.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def train_valid_split(data_set, valid_ratio, seed):
    """Split the provided training data into a training and a validation set.

    Args:
        data_set: Array-like of samples (one row per sample).
        valid_ratio: Fraction of rows that goes to the validation set; the
            validation size is ``int(valid_ratio * len(data_set))``.
        seed: Seed of the generator that drives the random split.

    Returns:
        Tuple ``(train_set, valid_set)`` of NumPy arrays.
    """
    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(
        data_set,
        [train_set_size, valid_set_size],
        generator=torch.Generator().manual_seed(seed),
    )
    # Index with the subset indices instead of iterating the Subset objects
    # row by row; the selected rows are the same.
    data = np.asarray(data_set)
    return data[train_set.indices], data[valid_set.indices]


def predict(test_loader, model, device):
    """Run the model over ``test_loader`` and collect its predictions.

    Args:
        test_loader: Iterable yielding feature batches (no targets).
        model: Trained ``nn.Module``.
        device: Device the model lives on; each batch is moved there.

    Returns:
        NumPy array with the predictions of all batches concatenated along
        dimension 0.
    """
    model.eval()  # Set your model to evaluation mode.
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)
        with torch.no_grad():
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    return preds


def save_pred(preds, file):
    """Save predictions to the specified CSV file.

    The file has the header ``id,tested_positive`` followed by one row per
    prediction, numbered from 0.

    Args:
        preds: Iterable of predictions.
        file: Path of the CSV file to write.
    """
    # newline='' is what the csv module requires so that it controls the line
    # endings itself (otherwise blank rows appear on Windows).
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])


# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
class COVID19Dataset(Dataset):
    """Dataset wrapping feature rows and, optionally, their targets.

    Args:
        x: Features.
        y: Targets; if ``None`` the dataset is used for prediction and items
            are feature tensors only.
    """

    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)


# ---------------------------------------------------------------------------
# Neural Network Model
# Try out different model architectures by modifying the class below.
# ---------------------------------------------------------------------------
class My_Model(nn.Module):
    """Small fully connected regressor: input_dim -> 16 -> 8 -> 1."""

    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1)  # (B, 1) -> (B)
        return x


# ---------------------------------------------------------------------------
# Feature Selection
# Choose the features you consider useful by modifying the function below.
# ---------------------------------------------------------------------------
def select_feat(train_data, valid_data, test_data, select_all=True):
    """Select useful features to perform regression.

    The last column of ``train_data`` / ``valid_data`` is the target; all
    other columns are candidate features. ``test_data`` has no target column.

    Args:
        train_data: 2-D array of training rows (features + target).
        valid_data: 2-D array of validation rows (features + target).
        test_data: 2-D array of test rows (features only).
        select_all: When True use every feature column, otherwise only the
            hard-coded column list below.

    Returns:
        Tuple ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = [0, 1, 2, 3, 4]  # TODO: Select suitable feature columns.

    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid


# ---------------------------------------------------------------------------
# Training Loop
# ---------------------------------------------------------------------------
def trainer(train_loader, valid_loader, model, config, device):
    """Train ``model`` with SGD, validating and checkpointing every epoch.

    After each epoch the mean validation loss is compared with the best one
    seen so far; on improvement the model's ``state_dict`` is saved to
    ``config['save_path']``. Training stops early once the validation loss has
    not improved for ``config['early_stop']`` consecutive epochs.

    Args:
        train_loader: Loader yielding ``(x, y)`` training batches.
        valid_loader: Loader yielding ``(x, y)`` validation batches.
        model: The ``nn.Module`` to train (already on ``device``).
        config: Dict providing ``learning_rate``, ``n_epochs``, ``early_stop``
            and ``save_path``.
        device: Device every batch is moved to.
    """
    criterion = nn.MSELoss(reduction='mean')  # Define your loss function, do not modify this.

    # Define your optimization algorithm.
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)

    writer = SummaryWriter()  # Writer of tensorboard.

    # Create the directory the checkpoint is saved into ('./models' with the
    # default config); exist_ok avoids a check-then-create race.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    try:
        for epoch in range(n_epochs):
            model.train()  # Set your model to train mode.
            loss_record = []

            # tqdm is a package to visualize your training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)

            for x, y in train_pbar:
                optimizer.zero_grad()               # Set gradient to zero.
                x, y = x.to(device), y.to(device)   # Move your data to device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()                     # Compute gradient (backpropagation).
                optimizer.step()                    # Update parameters.
                step += 1
                loss_record.append(loss.detach().item())

                # Display current epoch number and loss on tqdm progress bar.
                train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': loss.detach().item()})

            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval()  # Set your model to evaluation mode.
            loss_record = []
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():
                    pred = model(x)
                    loss = criterion(pred, y)

                loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record) / len(loss_record)
            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])  # Save your best model
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Flush and release the event file on every exit path.
        writer.close()


# ---------------------------------------------------------------------------
# Configurations
# `config` contains hyper-parameters for training and the path to save your
# model.
# ---------------------------------------------------------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 5201314,      # Your seed number, you can pick your lucky number. :)
    'select_all': True,   # Whether to use all features.
    'valid_ratio': 0.2,   # validation_size = train_size * valid_ratio
    'n_epochs': 3000,     # Number of epochs.
    'batch_size': 256,
    'learning_rate': 1e-5,
    'early_stop': 400,    # If model has not improved for this many consecutive epochs, stop training.
    'save_path': './models/model.ckpt',  # Your model will be saved here.
}


if __name__ == "__main__":
    # -----------------------------------------------------------------------
    # Data reading and DataLoader
    # -----------------------------------------------------------------------
    # Set seed for reproducibility
    same_seed(config['seed'])

    # train_data size: 2699 x 118 (id + 37 states + 16 features x 5 days)
    # test_data size: 1078 x 117 (without last day's positive rate)
    train_data, test_data = pd.read_csv('./covid.train.csv').values, pd.read_csv('./covid.test.csv').values
    train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

    # Print out the data size.
    print(f"""train_data size: {train_data.shape}
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}""")

    # Select features
    x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])

    # Print out the number of features.
    print(f'number of features: {x_train.shape[1]}')

    train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train), \
        COVID19Dataset(x_valid, y_valid), \
        COVID19Dataset(x_test)

    # Pytorch data loader loads pytorch dataset into batches.
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

    # -----------------------------------------------------------------------
    # Start training!
    # -----------------------------------------------------------------------
    model = My_Model(input_dim=x_train.shape[1]).to(device)  # put your model and data on the same computation device.
    trainer(train_loader, valid_loader, model, config, device)

    # -----------------------------------------------------------------------
    # Plot learning curves with tensorboard (optional) -- notebook magics:
    #   %reload_ext tensorboard
    #   %tensorboard --logdir=./runs/
    # -----------------------------------------------------------------------

    # -----------------------------------------------------------------------
    # Testing
    # The predictions of your model on testing set will be stored at pred.csv.
    # -----------------------------------------------------------------------
    model = My_Model(input_dim=x_train.shape[1]).to(device)
    # map_location lets a checkpoint trained on GPU be loaded on a CPU-only
    # machine.
    model.load_state_dict(torch.load(config['save_path'], map_location=device))
    preds = predict(test_loader, model, device)
    save_pred(preds, 'pred.csv')