文章目录
1 搭建环境
1.1 下载 libtorch c++ 版本
由于我当前在虚拟机中运行,只使用 CPU,没有使用 GPU,因此下载 CPU 版本的 libtorch。
# Download the CPU build of LibTorch 2.0.0 (cxx11 ABI, shared libraries with dependencies).
# "%2B" in the URL is the percent-encoded form of "+".
wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip
# Extract the downloaded archive.
unzip libtorch-cxx11-abi-shared-with-deps-2.0.0+cpu.zip
1.2 下载 MNIST 手写数字数据集
# Fetch the four gzip-compressed MNIST files (training/test images and labels).
mnist_url=https://ossci-datasets.s3.amazonaws.com/mnist
for name in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte; do
    wget "${mnist_url}/${name}.gz"
done
# Decompress every archive in place, leaving the raw IDX files.
# NOTE: main.cpp loads these files from "data/MNIST/raw" relative to the
# working directory, so they must end up in that location.
gunzip t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz
1.3 目录结构
xhome@ubuntu:~/project_ai$ ls
data image libtorch mnist_demo
2 源码目录
xhome@ubuntu:~/project_ai/mnist_demo$ ls
build CMakeLists.txt src
xhome@ubuntu:~/project_ai/mnist_demo/src$ ls
dataset.h main.cpp model.cpp model.h predict.cpp
2.1 CMakeLists.txt文件
xhome@ubuntu:~/project_ai/mnist_demo$ cat CMakeLists.txt
# Build script for the MNIST demo: a training executable (mnist_demo) and an
# inference executable (mnist_predict), both linked against LibTorch and OpenCV.
cmake_minimum_required(VERSION 3.0)
project(mnist_demo)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Set the LibTorch location here (change it to your actual path)
#set(CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/libtorch")
# Or use an absolute path
set(CMAKE_PREFIX_PATH "/home/xhome/project_ai/libtorch")
find_package(Torch REQUIRED)
find_package(OpenCV REQUIRED)
# Training program
add_executable(mnist_demo
    src/main.cpp
    src/model.cpp
)
# Prediction program
add_executable(mnist_predict
    src/predict.cpp
    src/model.cpp
)
# Header search paths: both targets include headers from src/, and the
# prediction program additionally includes OpenCV headers.
target_include_directories(mnist_demo PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_include_directories(mnist_predict PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
    ${OpenCV_INCLUDE_DIRS}
)
# Link LibTorch, OpenCV and pthread into both executables
target_link_libraries(mnist_demo "${TORCH_LIBRARIES}" ${OpenCV_LIBS} pthread)
target_link_libraries(mnist_predict "${TORCH_LIBRARIES}" ${OpenCV_LIBS} pthread)
# Append the compile flags exported by the Torch package
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
2.2 训练主程序 main.cpp
xhome@ubuntu:~/project_ai/mnist_demo/src$ cat main.cpp
#include <torch/torch.h>
#include "model.h"
#include "dataset.h"
#include <iostream>
#include <iomanip>
#include <chrono>
// 训练一个epoch
// Runs one training epoch: for every batch from `data_loader` it performs a
// forward pass, computes the NLL loss, back-propagates and applies one
// optimizer step.
//
// model        - network being trained; switched to training mode here.
// data_loader  - iterable yielding batches with `.data` and `.target` tensors.
// optimizer    - optimizer that updates the model parameters.
// epoch        - epoch number, used only in the progress output.
// dataset_size - total sample count, used only in the progress output.
template<typename DataLoader>
void train(
    Net& model,
    DataLoader& data_loader,
    torch::optim::Optimizer& optimizer,
    size_t epoch,
    size_t dataset_size) {
    model.train();

    size_t seen = 0;            // samples consumed so far in this epoch
    size_t step = 0;            // index of the current batch
    float latest_loss = 0.0f;   // loss of the most recently processed batch

    for (auto& batch : data_loader) {
        auto inputs = batch.data;
        auto labels = batch.target;

        optimizer.zero_grad();
        auto loss = torch::nll_loss(model.forward(inputs), labels);
        loss.backward();
        optimizer.step();

        seen += inputs.size(0);
        latest_loss = loss.template item<float>();

        // Refresh the progress line on every 100th batch, starting with the first.
        const bool report_now = (step % 100 == 0);
        ++step;
        if (!report_now) {
            continue;
        }
        // "\r" rewinds to the start of the line so the progress is updated in place.
        std::cout << "\rTrain Epoch: " << epoch
                  << " [" << seen << "/" << dataset_size
                  << " (" << std::fixed << std::setprecision(1)
                  << (100.0 * seen / dataset_size)
                  << "%)] Loss: " << std::fixed << std::setprecision(4)
                  << latest_loss << std::flush;
    }

    // Final progress line for the epoch, terminated with a newline.
    std::cout << "\rTrain Epoch: " << epoch
              << " [" << dataset_size << "/" << dataset_size
              << " (100.0%)] Loss: " << std::fixed << std::setprecision(4)
              << latest_loss << std::endl;
}
// 测试模型
// Evaluates `model` on every batch from `data_loader` and prints the average
// negative log-likelihood loss together with the classification accuracy.
//
// model        - network to evaluate; switched to evaluation mode here.
// data_loader  - iterable yielding batches with `.data` and `.target` tensors.
// dataset_size - sample count used as the denominator of both averages.
template<typename DataLoader>
void test(
    Net& model,
    DataLoader& data_loader,
    size_t dataset_size) {
    model.eval();
    // Gradient tracking is not needed while evaluating.
    torch::NoGradGuard no_grad;

    double test_loss = 0;
    // 64-bit counter so it matches the int64_t values added to it below
    // instead of silently narrowing them.
    int64_t correct = 0;

    for (const auto& batch : data_loader) {
        auto data = batch.data;
        auto target = batch.target;
        auto output = model.forward(data);
        // Sum (rather than average) the per-sample losses so that dividing by
        // dataset_size afterwards yields the mean over the whole set.
        test_loss += torch::nll_loss(
            output,
            target,
            /*weight=*/{},
            torch::Reduction::Sum
        ).template item<float>();
        // The predicted class is the index of the largest value along dim 1.
        auto pred = output.argmax(1);
        correct += pred.eq(target).sum().template item<int64_t>();
    }

    test_loss /= dataset_size;
    double accuracy = 100.0 * correct / dataset_size;
    std::cout << "Test set: Average loss: " << std::fixed << std::setprecision(4)
              << test_loss << ", Accuracy: " << correct << "/"
              << dataset_size << " (" << std::fixed << std::setprecision(1)
              << accuracy << "%)\n";
}
// Training entry point: loads the MNIST training and test sets, trains the
// network for a fixed number of epochs on the CPU, evaluates after every
// epoch, and saves the trained model to "mnist_cnn.pt".
// Returns 0 on success and -1 if any std::exception is thrown.
int main() {
    try {
        // Fix the random seed
        torch::manual_seed(1);

        // Use the CPU
        torch::Device device(torch::kCPU);
        std::cout << "Training on CPU." << std::endl;

        // Create the model
        auto model = std::make_shared<Net>();
        model->to(device);

        // Create the datasets; Stack<> collates individual examples into batch tensors.
        const std::string data_path = "data/MNIST/raw";
        std::cout << "Loading training dataset..." << std::endl;
        auto train_dataset = MNISTDataset(data_path, MNISTDataset::Mode::kTrain)
            .map(torch::data::transforms::Stack<>());
        std::cout << "Loading test dataset..." << std::endl;
        auto test_dataset = MNISTDataset(data_path, MNISTDataset::Mode::kTest)
            .map(torch::data::transforms::Stack<>());

        // Record the dataset sizes now: the datasets are moved into the loaders below.
        const size_t train_size = train_dataset.size().value();
        const size_t test_size = test_dataset.size().value();
        std::cout << "Training dataset size: " << train_size << std::endl;
        std::cout << "Test dataset size: " << test_size << std::endl;

        // Create the data loaders
        auto train_loader = torch::data::make_data_loader<torch::data::samplers::RandomSampler>(
            std::move(train_dataset),
            torch::data::DataLoaderOptions().batch_size(32).workers(1));
        auto test_loader = torch::data::make_data_loader(
            std::move(test_dataset),
            torch::data::DataLoaderOptions().batch_size(100).workers(1));

        // Create the optimizer: SGD with learning rate 0.01 and momentum 0.5
        torch::optim::SGD optimizer(
            model->parameters(),
            torch::optim::SGDOptions(0.01).momentum(0.5));

        // Record the start time
        auto start_time = std::chrono::high_resolution_clock::now();

        // Training loop. The epoch count is size_t so the loop condition
        // compares two values of the same unsigned type.
        const size_t num_epochs = 10;
        for (size_t epoch = 1; epoch <= num_epochs; ++epoch) {
            std::cout << "\nEpoch " << epoch << "/" << num_epochs << std::endl;
            train(*model, *train_loader, optimizer, epoch, train_size);
            test(*model, *test_loader, test_size);
        }

        // Total training time; duration_cast truncates to whole minutes.
        auto end_time = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::minutes>(
            end_time - start_time);
        std::cout << "\nTraining completed in "
                  << duration.count() << " minutes" << std::endl;

        // Save the model
        torch::save(model, "mnist_cnn.pt");
        std::cout << "Model saved to mnist_cnn.pt" << std::endl;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return -1;
    }
    return 0;
}
2.2 dataset.h
#ifndef __DATA_SET_HEAD_H
#define __DATA_SET_HEAD_H
#include <cstdint>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <torch/torch.h>
// In-memory MNIST dataset read from uncompressed IDX files.
//
// Files are looked up as "<root>/<prefix>-images-idx3-ubyte" and
// "<root>/<prefix>-labels-idx1-ubyte", where <prefix> is "train" for
// Mode::kTrain and "t10k" for Mode::kTest.
// Images are stored as float32 scaled by 1/255 with shape
// (N, 1, rows, cols); labels are stored as int64 with shape (N).
class MNISTDataset : public torch::data::Dataset<MNISTDataset> {
public:
    enum Mode { kTrain, kTest };

    // Loads all images and labels for `mode` from the directory `root`.
    // Throws std::runtime_error if a file cannot be opened, has an unexpected
    // magic number, is truncated, or if the image and label counts differ.
    explicit MNISTDataset(const std::string& root, Mode mode = Mode::kTrain) {
        const std::string prefix = mode == Mode::kTrain ? "train" : "t10k";

        // Read the images
        const std::string image_file = root + "/" + prefix + "-images-idx3-ubyte";
        std::cout << "Loading images from: " << image_file << std::endl;
        images = read_images(image_file);

        // Read the labels
        const std::string label_file = root + "/" + prefix + "-labels-idx1-ubyte";
        std::cout << "Loading labels from: " << label_file << std::endl;
        labels = read_labels(label_file);

        // get() indexes both tensors with the same index, so they must be the same length.
        if (images.size(0) != labels.size(0)) {
            throw std::runtime_error(
                "Image/label count mismatch: " + std::to_string(images.size(0)) +
                " images vs " + std::to_string(labels.size(0)) + " labels");
        }
        std::cout << "Dataset size: " << images.size(0) << std::endl;
    }

    // Returns the (image, label) pair stored at `index`.
    torch::data::Example<> get(size_t index) override {
        return {images[index], labels[index]};
    }

    // Returns the number of images held by the dataset.
    torch::optional<size_t> size() const override {
        return images.size(0);
    }

private:
    torch::Tensor images, labels;

    // Reads one 32-bit big-endian unsigned integer from `file`.
    // The value is assembled byte by byte, so the result does not depend on
    // the host byte order or on compiler-specific byte-swap builtins.
    // Throws std::runtime_error if fewer than four bytes are available.
    static uint32_t read_be_u32(std::ifstream& file, const std::string& path) {
        unsigned char bytes[4] = {0, 0, 0, 0};
        if (!file.read(reinterpret_cast<char*>(bytes), sizeof(bytes))) {
            throw std::runtime_error("Truncated header in file: " + path);
        }
        return (static_cast<uint32_t>(bytes[0]) << 24) |
               (static_cast<uint32_t>(bytes[1]) << 16) |
               (static_cast<uint32_t>(bytes[2]) << 8) |
               static_cast<uint32_t>(bytes[3]);
    }

    // Reads an IDX3 image file and returns a float32 tensor of shape
    // (n_images, 1, n_rows, n_cols) with pixel values divided by 255.
    static torch::Tensor read_images(const std::string& path) {
        constexpr uint32_t expected_magic = 0x00000803;  // IDX3 unsigned-byte image file

        std::ifstream file(path, std::ios::binary);
        if (!file) {
            throw std::runtime_error("Cannot open file: " + path);
        }

        const uint32_t magic_number = read_be_u32(file, path);
        if (magic_number != expected_magic) {
            throw std::runtime_error(
                "Unexpected magic number " + std::to_string(magic_number) +
                " in image file: " + path);
        }
        const uint32_t n_images = read_be_u32(file, path);
        const uint32_t n_rows = read_be_u32(file, path);
        const uint32_t n_cols = read_be_u32(file, path);

        // Compute the payload size in 64 bits and reject headers whose total
        // would not fit into a tensor dimension product.
        const uint64_t bytes_per_image = static_cast<uint64_t>(n_rows) * n_cols;
        if (bytes_per_image != 0 &&
            n_images > static_cast<uint64_t>(INT64_MAX) / bytes_per_image) {
            throw std::runtime_error("Image dimensions too large in file: " + path);
        }

        // Read the pixel data
        std::vector<uint8_t> buffer(static_cast<size_t>(n_images * bytes_per_image));
        if (!file.read(reinterpret_cast<char*>(buffer.data()),
                       static_cast<std::streamsize>(buffer.size()))) {
            throw std::runtime_error("Truncated image data in file: " + path);
        }

        // from_blob only wraps the buffer; clone() copies the data so the
        // tensor stays valid after `buffer` is destroyed.
        auto tensor = torch::from_blob(
            buffer.data(),
            {static_cast<int64_t>(n_images),
             static_cast<int64_t>(n_rows),
             static_cast<int64_t>(n_cols)},
            torch::kUInt8).clone();

        // Add the channel dimension and normalize
        tensor = tensor.unsqueeze(1).to(torch::kFloat32).div(255.0);
        return tensor;
    }

    // Reads an IDX1 label file and returns an int64 tensor of shape (n_labels).
    static torch::Tensor read_labels(const std::string& path) {
        constexpr uint32_t expected_magic = 0x00000801;  // IDX1 unsigned-byte label file

        std::ifstream file(path, std::ios::binary);
        if (!file) {
            throw std::runtime_error("Cannot open file: " + path);
        }

        const uint32_t magic_number = read_be_u32(file, path);
        if (magic_number != expected_magic) {
            throw std::runtime_error(
                "Unexpected magic number " + std::to_string(magic_number) +
                " in label file: " + path);
        }
        const uint32_t n_labels = read_be_u32(file, path);

        std::vector<uint8_t> buffer(n_labels);
        if (!file.read(reinterpret_cast<char*>(buffer.data()),
                       static_cast<std::streamsize>(buffer.size()))) {
            throw std::runtime_error("Truncated label data in file: " + path);
        }

        // clone() copies the data out of the local buffer before it is destroyed.
        return torch::from_blob(
            buffer.data(),
            {static_cast<int64_t>(n_labels)},
            torch::kUInt8).clone().to(torch::kLong);
    }
};
#endif
2.3 model.cpp
#include "model.h"
// Builds and registers every sub-module of the network: two convolution +
// batch-norm blocks, two fully connected layers and one dropout layer.
Net::Net() {
    // First convolution block: 1 -> 32 channels, 3x3 kernel, stride 1, padding 1
    const auto conv1_options = torch::nn::Conv2dOptions(1, 32, 3).stride(1).padding(1);
    conv1 = register_module("conv1", torch::nn::Conv2d(conv1_options));
    batch_norm1 = register_module("batch_norm1", torch::nn::BatchNorm2d(32));

    // Second convolution block: 32 -> 64 channels, 3x3 kernel, stride 1, padding 1
    const auto conv2_options = torch::nn::Conv2dOptions(32, 64, 3).stride(1).padding(1);
    conv2 = register_module("conv2", torch::nn::Conv2d(conv2_options));
    batch_norm2 = register_module("batch_norm2", torch::nn::BatchNorm2d(64));

    // Fully connected layers: 7*7*64 flattened features -> 128 -> 10 outputs
    fc1 = register_module("fc1", torch::nn::Linear(7 * 7 * 64, 128));
    fc2 = register_module("fc2", torch::nn::Linear(128, 10));

    // Dropout layer with probability 0.25
    const auto dropout_options = torch::nn::DropoutOptions(0.25);
    dropout = register_module("dropout", torch::nn::Dropout(dropout_options));
}
// Forward pass: two (conv -> batch-norm -> ReLU -> 2x2 max-pool) blocks,
// flatten, fc1 -> ReLU -> dropout -> fc2, then log-softmax over dim 1.
// Returns the log-probabilities produced by log_softmax.
torch::Tensor Net::forward(torch::Tensor x) {
    // First convolution block
    auto stage1 = torch::max_pool2d(
        torch::relu(batch_norm1->forward(conv1->forward(x))), 2);

    // Second convolution block
    auto stage2 = torch::max_pool2d(
        torch::relu(batch_norm2->forward(conv2->forward(stage1))), 2);

    // Flatten to rows of 7*7*64 features, the input width of fc1
    auto flattened = stage2.view({-1, 7 * 7 * 64});

    // Fully connected head
    auto hidden = dropout->forward(torch::relu(fc1->forward(flattened)));
    auto logits = fc2->forward(hidden);
    return torch::log_softmax(logits, 1);
}
2.4 识别主程序 predict.cpp
#include <torch/torch.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include <iomanip>
#include <memory>
#include <string>
#include "model.h"
// Loads an image from disk and converts it into a 1x1x28x28 float tensor:
// grayscale -> Otsu binarization -> morphological close -> crop around the
// largest contour -> resize to 20x20 -> pad to 28x28 -> invert -> blur ->
// threshold -> scale to [0, 1].
//
// The intermediate images are written to step1_binary.jpg ... step7_final.jpg
// in the current working directory.
//
// image_path - path of the image file to read.
// Returns a float32 tensor of shape (1, 1, 28, 28).
// Throws std::runtime_error if the image cannot be read or if no contour is
// found in it.
torch::Tensor preprocess_image(const std::string& image_path) {
    // Read the image as grayscale
    cv::Mat image = cv::imread(image_path, cv::IMREAD_GRAYSCALE);
    if (image.empty()) {
        throw std::runtime_error("Error: Could not read image: " + image_path);
    }
    std::cout << "Original image size: " << image.size() << std::endl;

    // Binarize with an automatic threshold chosen by Otsu's method
    cv::Mat binary;
    cv::threshold(image, binary, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);

    // Apply a morphological close to clean up the digit shape
    cv::Mat morph;
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(3, 3));
    cv::morphologyEx(binary, morph, cv::MORPH_CLOSE, kernel);

    // Find contours on the inverted image
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(255 - morph, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);

    // Pick the contour with the largest area. maxArea starts below zero so a
    // contour whose area is exactly 0 (e.g. a one-pixel-wide stroke) can still
    // be selected when nothing larger exists.
    cv::Rect boundingBox;
    double maxArea = -1.0;
    for (const auto& contour : contours) {
        const double area = cv::contourArea(contour);
        if (area > maxArea) {
            maxArea = area;
            boundingBox = cv::boundingRect(contour);
        }
    }
    // Without any contour the box is empty and the crop/resize below would
    // fail with an opaque OpenCV assertion; report the problem explicitly.
    if (boundingBox.width <= 0 || boundingBox.height <= 0) {
        throw std::runtime_error("Error: No digit contour found in image: " + image_path);
    }

    // Grow the box into a square around its centre
    int maxSide = std::max(boundingBox.width, boundingBox.height);
    int xCenter = boundingBox.x + boundingBox.width / 2;
    int yCenter = boundingBox.y + boundingBox.height / 2;
    boundingBox.x = xCenter - maxSide / 2;
    boundingBox.y = yCenter - maxSide / 2;
    boundingBox.width = maxSide;
    boundingBox.height = maxSide;

    // Add padding of a quarter of the side on each edge, clamped to the image bounds
    int padding = maxSide / 4;
    boundingBox.x = std::max(0, boundingBox.x - padding);
    boundingBox.y = std::max(0, boundingBox.y - padding);
    boundingBox.width = std::min(image.cols - boundingBox.x, boundingBox.width + 2 * padding);
    boundingBox.height = std::min(image.rows - boundingBox.y, boundingBox.height + 2 * padding);

    // Crop the image
    cv::Mat cropped = morph(boundingBox);

    // Resize to 20x20
    cv::Mat resized;
    cv::resize(cropped, resized, cv::Size(20, 20), 0, 0, cv::INTER_AREA);

    // Add a 4-pixel white (255) border on every side, giving 28x28
    cv::Mat padded;
    cv::copyMakeBorder(resized, padded, 4, 4, 4, 4,
                       cv::BORDER_CONSTANT, cv::Scalar(255));

    // Invert the colours
    cv::Mat inverted;
    cv::bitwise_not(padded, inverted);

    // Apply a light Gaussian blur
    cv::Mat blurred;
    cv::GaussianBlur(inverted, blurred, cv::Size(3, 3), 0.5);

    // Threshold again to restore crisp edges
    cv::Mat thresholded;
    cv::threshold(blurred, thresholded, 127, 255, cv::THRESH_BINARY);

    // Convert to float and scale to [0, 1]
    cv::Mat float_img;
    thresholded.convertTo(float_img, CV_32F, 1.0/255.0);

    // Save every intermediate image
    cv::imwrite("step1_binary.jpg", binary);
    cv::imwrite("step2_morph.jpg", morph);
    cv::imwrite("step3_cropped.jpg", cropped);
    cv::imwrite("step4_resized.jpg", resized);
    cv::imwrite("step5_padded.jpg", padded);
    cv::imwrite("step6_inverted.jpg", inverted);
    cv::imwrite("step7_final.jpg", thresholded);
    std::cout << "Preprocessing steps saved as images" << std::endl;

    // Convert to a tensor; clone() copies the pixels so the tensor does not
    // reference float_img's memory after this function returns.
    auto tensor = torch::from_blob(float_img.data, {1, 28, 28}, torch::kFloat32).clone();
    // Add the leading batch dimension
    tensor = tensor.unsqueeze(0);
    return tensor;
}
// Prints the 28x28 image stored at tensor[0][0] as ASCII art on stdout.
// Every pixel is rendered as a two-character glyph chosen by intensity, from
// blank (below 0.2) up to "@@" (0.8 and above).
//
// tensor - indexed as tensor[0][0][row][col] for rows and columns 0..27.
void display_tensor(const torch::Tensor& tensor) {
    std::cout << "\nProcessed image (ASCII art):\n";
    const auto plane = tensor[0][0];
    for (int i = 0; i < 28; ++i) {
        for (int j = 0; j < 28; ++j) {
            const float pixel = plane[i][j].item<float>();
            // All glyphs are exactly two characters wide so that the columns
            // of consecutive rows stay aligned.
            if (pixel < 0.2) std::cout << "  ";
            else if (pixel < 0.4) std::cout << "..";
            else if (pixel < 0.6) std::cout << "**";
            else if (pixel < 0.8) std::cout << "##";
            else std::cout << "@@";
        }
        std::cout << std::endl;
    }
}
int main(int argc, char* argv[]) {
if (argc < 2) {
std::cerr << "Usage: " << argv[0] << " <image_path>" << std::endl;
return 1;
}
try {
// 使用CPU
torch::Device device(torch::kCPU);
// 加载模型
auto model = std::make_shared<Net>();
torch::load(model, "mnist_cnn.pt");
model->to(device);
model->eval();
// 预处理图片
auto input = preprocess_image(argv[1]);
// 显示处理后的图像
display_tensor(input);
// 进行预测
torch::NoGradGuard no_grad;
auto output = model->forward(input);
auto probabilities = torch::softmax(output, 1);
auto prediction = output.argmax(1);
// 打印预测结果
std::cout << "\nPredicted digit: " << prediction.item<int>() << std::endl;
// 打印每个数字的概率
std::cout << "\nProbabilities for each digit:" << std::endl;
for (int i = 0; i < 10; ++i) {
std::cout << "Digit " << i << ": "
<< std::fixed << std::setprecision(4)
<< probabilities[0][i].item<float>() * 100 << "%"
<< std::endl;
}
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return -1;
}
return 0;
}
3 运行结果
3.1 训练程序运行
xhome@ubuntu:~/project_ai/mnist_demo/build$ ./mnist_demo
Training on CPU.
Loading training dataset...
Loading images from: data/MNIST/raw/train-images-idx3-ubyte
Loading labels from: data/MNIST/raw/train-labels-idx1-ubyte
Dataset size: 60000
Loading test dataset...
Loading images from: data/MNIST/raw/t10k-images-idx3-ubyte
Loading labels from: data/MNIST/raw/t10k-labels-idx1-ubyte
Dataset size: 10000
Training dataset size: 60000
Test dataset size: 10000
Epoch 1/10
Train Epoch: 1 [60000/60000 (100.0%)] Loss: 0.0716
Test set: Average loss: 0.0504, Accuracy: 9840/10000 (98.4%)
Epoch 2/10
Train Epoch: 2 [60000/60000 (100.0%)] Loss: 0.0070
Test set: Average loss: 0.0420, Accuracy: 9863/10000 (98.6%)
Epoch 3/10
Train Epoch: 3 [60000/60000 (100.0%)] Loss: 0.0337
Test set: Average loss: 0.0352, Accuracy: 9889/10000 (98.9%)
Epoch 4/10
Train Epoch: 4 [60000/60000 (100.0%)] Loss: 0.0009
Test set: Average loss: 0.0325, Accuracy: 9893/10000 (98.9%)
Epoch 5/10
Train Epoch: 5 [60000/60000 (100.0%)] Loss: 0.0131
Test set: Average loss: 0.0277, Accuracy: 9907/10000 (99.1%)
Epoch 6/10
Train Epoch: 6 [60000/60000 (100.0%)] Loss: 0.0219
Test set: Average loss: 0.0270, Accuracy: 9910/10000 (99.1%)
Epoch 7/10
Train Epoch: 7 [60000/60000 (100.0%)] Loss: 0.0033
Test set: Average loss: 0.0254, Accuracy: 9917/10000 (99.2%)
Epoch 8/10
Train Epoch: 8 [60000/60000 (100.0%)] Loss: 0.0033
Test set: Average loss: 0.0247, Accuracy: 9915/10000 (99.2%)
Epoch 9/10
Train Epoch: 9 [60000/60000 (100.0%)] Loss: 0.0561
Test set: Average loss: 0.0221, Accuracy: 9926/10000 (99.3%)
Epoch 10/10
Train Epoch: 10 [60000/60000 (100.0%)] Loss: 0.0003
Test set: Average loss: 0.0248, Accuracy: 9914/10000 (99.1%)
Training completed in 1 minutes
Model saved to mnist_cnn.pt
3.2 识别程序
xhome@ubuntu:~/project_ai/mnist_demo/build$ ./mnist_predict ../../image/1.png
Original image size: [456 x 479]
Preprocessing steps saved as images
Processed image (ASCII art):
@@@@@@
@@@@@@@@@@
@@@@@@@@
@@@@@@@@
@@@@@@
@@@@@@
@@@@@@
@@@@@@
@@@@@@
@@@@@@
@@@@@@
@@@@@@
@@@@@@@@
@@@@@@@@
@@@@@@@@@@@@
Predicted digit: 1
Probabilities for each digit:
Digit 0: 0.0047%
Digit 1: 94.2735%
Digit 2: 0.0737%
Digit 3: 0.0274%
Digit 4: 0.0054%
Digit 5: 2.7657%
Digit 6: 0.0065%
Digit 7: 0.0035%
Digit 8: 2.7485%
Digit 9: 0.0911%
xhome@ubuntu:~/project_ai/mnist_demo/build$ ./mnist_predict ../../image/5.png
Original image size: [508 x 512]
Preprocessing steps saved as images
Processed image (ASCII art):
@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@
@@@@@@@@
@@@@@@@@
@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@
@@@@@@@@ @@@@@@@@
@@@@@@@@
@@@@@@@@
@@@@@@@@ @@@@@@@@
@@@@@@@@ @@@@@@@@
@@@@@@@@ @@@@@@@@
@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@
@@@@@@@@
Predicted digit: 5
Probabilities for each digit:
Digit 0: 0.0118%
Digit 1: 0.0003%
Digit 2: 0.0079%
Digit 3: 0.3717%
Digit 4: 0.0000%
Digit 5: 84.2349%
Digit 6: 0.0197%
Digit 7: 0.0005%
Digit 8: 13.8340%
Digit 9: 1.5192%