AlexNet for Handwritten Digit Recognition: Code Implementation
```python
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
# Define a modified AlexNet adapted to MNIST
class AlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNet, self).__init__()
        # Feature extraction layers - smaller kernels to suit MNIST
        self.features = nn.Sequential(
            # First conv layer - reduced kernel size and stride
            # Input: 224x224 -> output: 112x112
            nn.Conv2d(1, 64, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # Output: 56x56
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Second conv layer
            # Output: 56x56
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            # Output: 28x28
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Third conv layer
            # Output: 28x28
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            # Output: 14x14
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Fourth conv layer
            # Output: 14x14
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            # Output: 7x7
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Feature-map size bookkeeping:
        #   input image          224x224
        #   conv1 (stride=2)  -> 112x112
        #   pool1 (stride=2)  ->  56x56
        #   conv2             ->  56x56
        #   pool2 (stride=2)  ->  28x28
        #   conv3             ->  28x28
        #   pool3 (stride=2)  ->  14x14
        #   conv4             ->  14x14
        #   pool4 (stride=2)  ->   7x7
        # Final feature map: 256 x 7 x 7 = 12544 values
        # Classifier layers
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(256 * 7 * 7, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, num_classes),
        )
        # Weight initialization
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
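
# A quick shape sanity check (illustrative, not part of the original script):
# a batch of 224x224 grayscale images should map to (batch, num_classes) logits.
#   model = AlexNet(num_classes=10).eval()
#   logits = model(torch.randn(2, 1, 224, 224))
#   print(logits.shape)  # torch.Size([2, 10])
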
def train_model(model, train_loader, test_loader, num_epochs=20, device='cuda'):
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                     factor=0.5, patience=2)
    # Track the training history
    train_losses = []
    test_accuracies = []
    best_accuracy = 0.0
    # Move the model to the target device
    model = model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
        # Average training loss for this epoch
        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)
        # Evaluate on the test set
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        test_accuracies.append(accuracy)
        # Step the scheduler and report any learning-rate change
        old_lr = optimizer.param_groups[0]['lr']
        scheduler.step(accuracy)
        new_lr = optimizer.param_groups[0]['lr']
        if old_lr != new_lr:
            print(f'Learning rate decreased from {old_lr} to {new_lr}')
        # Save the best model so far
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_alexnet_mnist.pth')
            print(f'New best accuracy: {accuracy:.2f}%')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Test Accuracy: {accuracy:.2f}%')
    return train_losses, test_accuracies
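
# Note on the scheduler (added explanation): ReduceLROnPlateau(mode='max')
# watches the value passed to scheduler.step() - here the test accuracy -
# and multiplies the learning rate by `factor` once it has failed to improve
# for `patience` consecutive epochs, e.g. 0.001 -> 0.0005 -> 0.00025.
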
def main():
    # Use the GPU if one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')
    # Data preprocessing and augmentation
    train_transform = transforms.Compose([
        transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.9, 1.1)),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    # Load the MNIST dataset
    train_dataset = torchvision.datasets.MNIST(root='./data', train=True,
                                               download=True, transform=train_transform)
    test_dataset = torchvision.datasets.MNIST(root='./data', train=False,
                                              download=True, transform=test_transform)
    # Build the data loaders
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)
    # Instantiate the model
    model = AlexNet(num_classes=10)
    # Train
    train_losses, test_accuracies = train_model(model, train_loader, test_loader,
                                                num_epochs=20, device=device)
    # Plot the training curves
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses)
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.subplot(1, 2, 2)
    plt.plot(test_accuracies)
    plt.title('Test Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.tight_layout()
    plt.savefig('training_results.png')
    plt.show()
if __name__ == '__main__':
    main()
```
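
After training, the best weights are written to `best_alexnet_mnist.pth`. Below is a minimal inference sketch for reusing that checkpoint; it assumes it runs in the same module as the `AlexNet` class above (the file name comes from `train_model`, everything else is illustrative):

```python
import torch
import torchvision
import torchvision.transforms as transforms

# Rebuild the architecture and load the checkpoint written by train_model().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AlexNet(num_classes=10)
model.load_state_dict(torch.load('best_alexnet_mnist.pth', map_location=device))
model.to(device).eval()

# Use the same preprocessing as the test set: resize to 224x224, normalize.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Classify a single test image (index 0 is just an example).
test_set = torchvision.datasets.MNIST(root='./data', train=False,
                                      download=True, transform=transform)
image, label = test_set[0]
with torch.no_grad():
    logits = model(image.unsqueeze(0).to(device))  # add the batch dimension
    prediction = logits.argmax(dim=1).item()
print(f'Predicted: {prediction}, ground truth: {label}')
```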