量化感知训练工具的基本使用流程如下:
下面以 torchvision 中的 MobileNetV2 模型为例,介绍流程中每个阶段的具体操作。
出于流程展示的执行速度考虑,我们使用了 cifar-10 数据集,而不是 ImageNet-1K 数据集。
import os
import copy
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch import Tensor
from torch.quantization import DeQuantStub
from torchvision.datasets import CIFAR10
from torchvision.models.mobilenetv2 import MobileNetV2
from torch.utils import data
from typing import Optional, Callable, List, Tuple
from horizon_plugin_pytorch.functional import rgb2centered_yuv
import torch.quantization
from horizon_plugin_pytorch.march import March, set_march
from horizon_plugin_pytorch.quantization import (
QuantStub,
prepare,
set_fake_quantize,
FakeQuantState,
)
from horizon_plugin_pytorch.quantization.qconfig import (
default_calib_8bit_fake_quant_qconfig,
default_qat_8bit_fake_quant_qconfig,
default_qat_8bit_weight_32bit_out_fake_quant_qconfig,
default_calib_8bit_weight_32bit_out_fake_quant_qconfig,
)
from hbdk4 import compiler as hb4
import logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
class AverageMeter(object):
    """Keep the most recent value of a metric and its running weighted mean.

    Attributes:
        name: Label printed by ``__str__``.
        fmt: Format spec suffix (including the leading ``:``) applied to
            both ``val`` and ``avg`` when the meter is rendered as text.
        val: The value passed to the latest ``update`` call.
        sum: Sum of ``val * n`` over all ``update`` calls since ``reset``.
        count: Sum of ``n`` over all ``update`` calls since ``reset``.
        avg: ``sum / count`` as of the latest ``update`` call.
    """

    def __init__(self, name: str, fmt=":f"):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Zero every running statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Splice the user-provided format spec into both placeholders, then
        # fill the placeholders from the instance attributes.
        template = "{name} {val%s} ({avg%s})" % (self.fmt, self.fmt)
        return template.format(**vars(self))
def accuracy(output: Tensor, target: Tensor, topk=(1,)) -> List[Tensor]:
    """Return the top-k accuracy (in percent) for every k in ``topk``.

    Args:
        output: Per-class scores; the top-k search runs along dim 1.
        target: Ground-truth class indices; ``target.size(0)`` is used as
            the batch size.
        topk: The k values to evaluate.

    Returns:
        One scalar tensor per entry of ``topk``, in the same order, holding
        ``100 * (number of hits within the top k) / batch_size``.
    """
    with torch.no_grad():
        batch_size = target.size(0)
        largest_k = max(topk)

        # Indices of the largest_k highest scores, best first, laid out as
        # (largest_k, batch) so that row r holds every sample's rank-r guess.
        ranked = output.topk(largest_k, 1, True, True)[1].t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        scale = 100.0 / batch_size
        return [hits[:k].float().sum().mul_(scale) for k in topk]
def evaluate(
    model: nn.Module, data_loader: data.DataLoader, device: torch.device
) -> Tuple[AverageMeter, AverageMeter]:
    """Run ``model`` over ``data_loader`` and collect top-1 / top-5 accuracy.

    Each batch is moved to ``device`` before the forward pass. The model
    itself is neither moved nor switched to eval mode here; the caller is
    expected to have done both. One ``.`` is printed per batch as a
    progress indicator, followed by a newline at the end.

    Returns:
        The ``(top1, top5)`` meters, each weighted by batch size.
    """
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    meters = (top1, top5)

    with torch.no_grad():
        for image, target in data_loader:
            image = image.to(device)
            target = target.to(device)
            batch_accs = accuracy(model(image), target, topk=(1, 5))
            batch_size = image.size(0)
            for meter, acc in zip(meters, batch_accs):
                meter.update(acc, batch_size)
            print(".", end="", flush=True)
    print()

    return top1, top5
def train_one_epoch(
    model: nn.Module,
    criterion: Callable,
    optimizer: torch.optim.Optimizer,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler],
    data_loader: data.DataLoader,
    device: torch.device,
) -> None:
    """Train ``model`` for one pass over ``data_loader`` and print a summary.

    The model is moved to ``device`` first; every batch is moved there as
    well. One ``.`` is printed per batch as a progress indicator. After the
    last batch, the batch-size-weighted mean loss and top-1 / top-5
    accuracy over the epoch are printed.

    Args:
        model: Model to optimize. Its train/eval mode is not changed here.
        criterion: Called as ``criterion(output, target)`` to get the loss.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler, stepped once per batch (not once
            per epoch) when it is not ``None``.
        data_loader: Yields ``(image, target)`` batches.
        device: Device on which the computation runs.
    """
    top1 = AverageMeter("Acc@1", ":6.3f")
    top5 = AverageMeter("Acc@5", ":6.3f")
    avgloss = AverageMeter("Loss", ":1.5f")

    model.to(device)

    for image, target in data_loader:
        image, target = image.to(device), target.to(device)
        output = model(image)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        batch_size = image.size(0)
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1.update(acc1, batch_size)
        top5.update(acc5, batch_size)
        # Record the loss as a plain Python float. Feeding the loss tensor
        # itself into the meter would chain every batch's autograd history
        # into the running sum and keep it alive for the whole epoch.
        avgloss.update(loss.item(), batch_size)
        print(".", end="", flush=True)
    print()

    print(
        "Full cifar-10 train set: Loss {:.3f} Acc@1"
        " {:.3f} Acc@5 {:.3f}".format(avgloss.avg, top1.avg, top5.avg)
    )
首先,对浮点模型做必要的改造,以支持量化相关操作。模型改造必要的操作有:
- 在模型输入前插入 QuantStub。
- 在模型输出后插入 DeQuantStub。

改造模型时需要注意:

- QuantStub 和 DeQuantStub 必须注册为模型的子模块,否则将无法正确处理它们的量化状态。
- 多个输入仅在 scale 相同时可以共享 QuantStub,否则请为每个输入定义单独的 QuantStub。
- 若需要将上板时输入的数据来源指定为 "pyramid",请手动设置对应 QuantStub 的 scale 参数为 1/128。
- 也可以使用 torch.quantization.QuantStub,但是仅有 horizon_plugin_pytorch.quantization.QuantStub 支持通过参数手动固定 scale。

改造后的模型可以无缝加载改造前模型的参数,因此若已有训练好的浮点模型,直接加载即可,否则需要正常进行浮点训练。
模型上板时的输入图像数据一般为 centered_yuv444 格式,因此模型训练时需要把图像转换成 centered_yuv444 格式(注意下面代码中对 rgb2centered_yuv 的使用)。
如果无法转换成 centered_yuv444 格式进行模型训练,请在模型部署时在输入上插入适当的颜色空间转换结点。(注意,该方法可能导致模型精度降低)
本示例中浮点和 QAT 训练的 epoch 较少,仅为说明训练工具使用流程,精度不代表模型最好水平。
######################################################################
# Users may adjust the following parameters as needed.

# 1. Directory where model checkpoints and compilation artifacts are saved
model_path = "model/mobilenetv2"

# 2. Directory where the dataset is downloaded and stored
data_path = "data"

# 3. batch_size used for training
train_batch_size = 256

# 4. batch_size used for prediction
eval_batch_size = 256

# 5. Number of training epochs
epoch_num = 10

# 6. Device used to hold the model and run the computation:
#    CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
######################################################################
# Prepare the datasets; note the use of rgb2centered_yuv in collate_fn.
def prepare_data_loaders(
    data_path: str, train_batch_size: int, eval_batch_size: int
) -> Tuple[data.DataLoader, data.DataLoader]:
    """Build the CIFAR-10 train and eval data loaders.

    Both datasets are created with ``download=True`` under ``data_path``.
    The train split is augmented with a random horizontal flip followed by
    RandAugment and is sampled randomly; the eval split has no transform
    and is read sequentially. Both loaders use 8 workers, pinned memory
    and the same ``collate_fn``.

    Args:
        data_path: Root directory of the dataset.
        train_batch_size: Batch size of the train loader.
        eval_batch_size: Batch size of the eval loader.

    Returns:
        ``(train_data_loader, eval_data_loader)``.
    """
    normalize = transforms.Normalize(mean=0.0, std=128.0)

    def collate_fn(batch):
        """Stack samples, convert to centered YUV and scale by 1/128."""
        images = [
            torch.from_numpy(np.array(sample[0], np.uint8, copy=True))
            for sample in batch
        ]
        labels = torch.tensor([sample[1] for sample in batch])
        # Move the last axis to position 1 before the color conversion
        # (assumes each sample image is laid out H x W x C - TODO confirm).
        stacked = torch.stack(images).permute(0, 3, 1, 2)
        yuv = rgb2centered_yuv(stacked)
        return normalize(yuv.float()), labels

    train_augmentation = transforms.Compose(
        [
            transforms.RandomHorizontalFlip(),
            transforms.RandAugment(),
        ]
    )
    train_dataset = CIFAR10(
        root=data_path,
        train=True,
        transform=train_augmentation,
        download=True,
    )
    eval_dataset = CIFAR10(root=data_path, train=False, download=True)

    # Settings shared by both loaders.
    shared_kwargs = dict(num_workers=8, collate_fn=collate_fn, pin_memory=True)

    train_data_loader = data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        sampler=data.RandomSampler(train_dataset),
        **shared_kwargs,
    )
    eval_data_loader = data.DataLoader(
        eval_dataset,
        batch_size=eval_batch_size,
        sampler=data.SequentialSampler(eval_dataset),
        **shared_kwargs,
    )
    return train_data_loader, eval_data_loader
# Apply the modifications required to make the float model QAT-ready.
class QATReadyMobileNetV2(MobileNetV2):
    """MobileNetV2 wrapped with a QuantStub at the input and a DeQuantStub
    at the output.

    Both stubs are registered as submodules. The input QuantStub is created
    with a fixed scale of 1/128. The constructor arguments are forwarded
    unchanged to ``MobileNetV2``; ``num_classes`` defaults to 10 here.
    """

    def __init__(
        self,
        num_classes: int = 10,
        width_mult: float = 1.0,
        inverted_residual_setting: Optional[List[List[int]]] = None,
        round_nearest: int = 8,
    ):
        super().__init__(
            num_classes=num_classes,
            width_mult=width_mult,
            inverted_residual_setting=inverted_residual_setting,
            round_nearest=round_nearest,
        )
        self.quant = QuantStub(scale=1 / 128)
        self.dequant = DeQuantStub()

    def forward(self, x: Tensor) -> Tensor:
        """Quantize the input, run the MobileNetV2 forward, dequantize."""
        quantized_in = self.quant(x)
        backbone_out = super().forward(quantized_in)
        return self.dequant(backbone_out)
# Make sure the output directory exists. exist_ok=True already tolerates an
# existing directory, so no separate existence check is needed.
os.makedirs(model_path, exist_ok=True)

# Initialize the float model.
float_model = QATReadyMobileNetV2()

# Prepare the datasets.
train_data_loader, eval_data_loader = prepare_data_loaders(
    data_path, train_batch_size, eval_batch_size
)

# Float training. No pretrained weights are loaded in this snippet, so the
# loop below trains float_model starting from its freshly initialized
# parameters.
optimizer = torch.optim.Adam(
    float_model.parameters(), lr=0.001, weight_decay=1e-3
)
# The loss module is built once and reused for every epoch.
criterion = nn.CrossEntropyLoss()

best_acc = 0
for nepoch in range(epoch_num):
    float_model.train()
    train_one_epoch(
        float_model,
        criterion,
        optimizer,
        None,
        train_data_loader,
        device,
    )

    # Evaluate the float accuracy.
    float_model.eval()
    top1, top5 = evaluate(float_model, eval_data_loader, device)

    print(
        "Float Epoch {}: evaluation Acc@1 {:.3f} Acc@5 {:.3f}".format(
            nepoch, top1.avg, top5.avg
        )
    )

    if top1.avg > best_acc:
        best_acc = top1.avg
        # Save the parameters of the best float model seen so far.
        torch.save(
            float_model.state_dict(),
            os.path.join(model_path, "float-checkpoint.ckpt"),
        )
Files already downloaded and verified
Files already downloaded and verified
....................................................................................................................................................................................................
Full cifar-10 train set: Loss 2.113 Acc@1 20.826 Acc@5 71.182
........................................
Float Epoch 0: evaluation Acc@1 33.140 Acc@5 85.710
...
....................................................................................................................................................................................................
Full cifar-10 train set: Loss 1.167 Acc@1 58.864 Acc@5 94.682
........................................
Float Epoch 9: evaluation Acc@1 64.490 Acc@5 96.400
模型改造完成并完成浮点训练后,便可进行 Calibration。此过程通过在模型中插入 Observer 的方式,在 forward 过程中统计各处的数据分布情况,从而计算出合理的量化参数:
######################################################################
# Users may adjust the following parameters as needed.

# 1. batch_size used during calibration
calib_batch_size = 256

# 2. batch_size used during validation
eval_batch_size = 256

# 3. Number of examples used for calibration; set to inf to use all data
num_examples = float("inf")

# 4. Code name of the target hardware platform
march = March.NASH_E

# 5. Input used to trace and export the model
example_input = torch.rand(1, 3, 32, 32, device=device)
######################################################################

# The target hardware platform must be set before converting the model.
set_march(march)

# The output model shares the attributes of the input model; deepcopy is
# used so that later use of float_model is not affected.
calib_model = copy.deepcopy(float_model)
calib_model.qconfig = default_calib_8bit_fake_quant_qconfig
calib_model.classifier.qconfig = (
    default_calib_8bit_weight_32bit_out_fake_quant_qconfig
)

# Convert the model to the calibration state so that the numerical
# distribution of the data at each point can be collected.
calib_model = prepare(calib_model, example_inputs=example_input)

# Prepare the datasets.
calib_data_loader, eval_data_loader = prepare_data_loaders(
    data_path, calib_batch_size, eval_batch_size
)

# Run calibration (no backward pass needed).
# Note the model state control: the model must be in eval mode so that
# BatchNorm behaves as required.
calib_model.eval()
set_fake_quantize(calib_model, FakeQuantState.CALIBRATION)
with torch.no_grad():
    cnt = 0
    for image, target in calib_data_loader:
        image, target = image.to(device), target.to(device)
        calib_model(image)
        print(".", end="", flush=True)
        # Stop once the requested number of examples has been consumed.
        cnt += image.size(0)
        if cnt >= num_examples:
            break
    print()

# Evaluate the fake-quantized accuracy.
# Note the model state control here as well.
calib_model.eval()
set_fake_quantize(calib_model, FakeQuantState.VALIDATION)

top1, top5 = evaluate(calib_model, eval_data_loader, device)
print(
    "Calibration: evaluation Acc@1 {:.3f} Acc@5 {:.3f}".format(
        top1.avg, top5.avg
    )
)

# Save the parameters of the calibrated model.
torch.save(
    calib_model.state_dict(),
    os.path.join(model_path, "calib-checkpoint.ckpt"),
)
2024-06-11 14:16:18,510 INFO: Begin check qat model...
2024-06-11 14:16:18,834 INFO: All fusable modules are fused in model!
2024-06-11 14:16:18,834 INFO: All modules in the model run exactly once.
2024-06-11 14:16:18,835 WARNING: Please check these modules qconfig if expected:
+---------------+---------------------------------------------------------+-----------------------------------------+
| module name | module type | msg |
|---------------+---------------------------------------------------------+-----------------------------------------|
| quant | <class 'horizon_plugin_pytorch.nn.qat.stubs.QuantStub'> | Fixed scale 0.0078125 |
| classifier.1 | <class 'horizon_plugin_pytorch.nn.qat.linear.Linear'> | activation is None. Maybe output layer? |
+---------------+---------------------------------------------------------+-----------------------------------------+
2024-06-11 14:16:18,848 INFO: Check full result in ./model_check_result.txt
2024-06-11 14:16:18,848 INFO: End check
Files already downloaded and verified
Files already downloaded and verified
....................................................................................................................................................................................................
........................................
Calibration: evaluation Acc@1 64.280 Acc@5 96.240
模型经过 Calibration 后的量化精度若已满足要求,便可直接进行模型部署的步骤,否则需要进行量化感知训练进一步提升精度。
量化感知训练通过在模型中插入伪量化节点的方式,在训练过程中使模型感知到量化带来的影响,在这种情况下对模型参数进行微调,以提升量化后的精度。
######################################################################
# Users may adjust the following parameters as needed.

# 1. batch_size used for training
train_batch_size = 256

# 2. batch_size used for validation
eval_batch_size = 256

# 3. Number of training epochs
epoch_num = 3
######################################################################

# Prepare the datasets.
train_data_loader, eval_data_loader = prepare_data_loaders(
    data_path, train_batch_size, eval_batch_size
)

# Work on a copy of the float model and attach the QAT qconfigs.
qat_model = copy.deepcopy(float_model)
qat_model.qconfig = default_qat_8bit_fake_quant_qconfig
qat_model.classifier.qconfig = (
    default_qat_8bit_weight_32bit_out_fake_quant_qconfig
)

# Convert the model to the QAT state.
qat_model = prepare(qat_model, example_inputs=example_input)

# Load the quantization parameters from the calibrated model.
qat_model.load_state_dict(calib_model.state_dict())

# Run quantization-aware training.
# As a finetune process, QAT generally calls for a small learning rate.
optimizer = torch.optim.Adam(
    qat_model.parameters(), lr=1e-3, weight_decay=1e-4
)

best_acc = 0
for nepoch in range(epoch_num):
    # Note how the training state of the QAT model is controlled.
    qat_model.train()
    set_fake_quantize(qat_model, FakeQuantState.QAT)
    train_one_epoch(
        qat_model,
        nn.CrossEntropyLoss(),
        optimizer,
        None,
        train_data_loader,
        device,
    )

    # Note how the eval state of the QAT model is controlled.
    qat_model.eval()
    set_fake_quantize(qat_model, FakeQuantState.VALIDATION)
    top1, top5 = evaluate(qat_model, eval_data_loader, device)

    print(
        "QAT Epoch {}: evaluation Acc@1 {:.3f} Acc@5 {:.3f}".format(
            nepoch, top1.avg, top5.avg
        )
    )

    # Keep the checkpoint with the best top-1 accuracy so far.
    if top1.avg > best_acc:
        best_acc = top1.avg
        torch.save(
            qat_model.state_dict(),
            os.path.join(model_path, "qat-checkpoint.ckpt"),
        )
Files already downloaded and verified
Files already downloaded and verified
2024-06-11 14:20:45,090 INFO: Begin check qat model...
2024-06-11 14:20:45,236 INFO: All fusable modules are fused in model!
2024-06-11 14:20:45,236 INFO: All modules in the model run exactly once.
2024-06-11 14:20:45,237 WARNING: Please check these modules qconfig if expected:
+---------------+---------------------------------------------------------+-----------------------------------------+
| module name | module type | msg |
|---------------+---------------------------------------------------------+-----------------------------------------|
| quant | <class 'horizon_plugin_pytorch.nn.qat.stubs.QuantStub'> | Fixed scale 0.0078125 |
| classifier.1 | <class 'horizon_plugin_pytorch.nn.qat.linear.Linear'> | activation is None. Maybe output layer? |
+---------------+---------------------------------------------------------+-----------------------------------------+
2024-06-11 14:20:45,249 INFO: Check full result in ./model_check_result.txt
2024-06-11 14:20:45,249 INFO: End check
2024-06-11 14:20:45,687 WARNING: fast training is experimental
....................................................................................................................................................................................................
Full cifar-10 train set: Loss 1.279 Acc@1 55.026 Acc@5 93.572
........................................
QAT Epoch 0: evaluation Acc@1 62.830 Acc@5 95.950
....................................................................................................................................................................................................
Full cifar-10 train set: Loss 1.151 Acc@1 59.474 Acc@5 94.822
........................................
QAT Epoch 1: evaluation Acc@1 65.940 Acc@5 96.520
....................................................................................................................................................................................................
Full cifar-10 train set: Loss 1.102 Acc@1 61.114 Acc@5 95.546
........................................
QAT Epoch 2: evaluation Acc@1 66.340 Acc@5 96.940
伪量化精度达标后,便可执行模型部署的相关流程。
模型部署首先需要将伪量化模型导出为 Hbir 模型。
######################################################################
# Users may adjust the following parameter as needed.
# 1. Which model to use as the input of the deployment flow;
#    either calib_model or qat_model may be chosen.
base_model = qat_model
######################################################################
# Imported here, directly before its only use in this section.
from horizon_plugin_pytorch.quantization.hbdk4 import export
# Export the selected fake-quantized model as an HBIR model, passing
# example_input (wrapped in a 1-tuple) as the example inputs.
hbir_qat_model = export(base_model, (example_input,))
2024-06-11 14:24:16,708 INFO: Model ret: Tensor(shape=(1, 10), dtype=torch.float32, device=cuda:0)
伪量化精度达标后,便可将模型转为定点模型。一般认为定点模型的结果和编译后模型的结果是完全一致的。
注意:Hbir 模型的输入仅支持 Tensor 或 Tuple[Tensor],输出仅支持 Tuple[Tensor]。
# 将模型转为定点状态,注意此处的 march 需要区分 nash-e/m/p
# Convert the exported HBIR model to its fixed-point form.
# The `march` configured in the calibration section is reused here instead
# of a hard-coded March.NASH_E, so the conversion target always matches the
# one given to set_march().
# NOTE(review): the original comment said a `perf_output_dir` argument must
# match a later `internal_compile` call, but no such argument is passed
# here - confirm whether that note is still relevant.
hbir_quantized_model = hb4.convert(
    hbir_qat_model,
    march,
)

# Data loader used to evaluate the HBIR model. Its batch size must equal
# the batch size of the example_input used when exporting the HBIR model,
# so it is read from example_input rather than hard-coded.
_, eval_hbir_data_loader = prepare_data_loaders(
    data_path, train_batch_size, example_input.size(0)
)
def evaluate_hbir(
    model: hb4.Module, data_loader: data.DataLoader
) -> Tuple[AverageMeter, AverageMeter]:
    """Collect top-1 / top-5 accuracy of an HBIR model over ``data_loader``.

    Every batch is moved to the CPU, passed to the first function of
    ``model``, and the first element of the result is scored against the
    targets.

    Returns:
        The ``(top1, top5)`` meters, each weighted by batch size.
    """
    top1 = AverageMeter("Acc@1", ":6.2f")
    top5 = AverageMeter("Acc@5", ":6.2f")
    meters = (top1, top5)

    for image, target in data_loader:
        image = image.cpu()
        target = target.cpu()
        # The call returns an indexable result; only element 0 is scored.
        output = model.functions[0](image)[0]
        batch_accs = accuracy(output, target, topk=(1, 5))
        batch_size = image.size(0)
        for meter, acc in zip(meters, batch_accs):
            meter.update(acc, batch_size)

    return top1, top5
# Evaluate the accuracy of the fixed-point model.
top1, top5 = evaluate_hbir(hbir_quantized_model, eval_hbir_data_loader)
print(
    "Quantized model: evaluation Acc@1 {:.3f} Acc@5 {:.3f}".format(
        top1.avg,
        top5.avg,
    )
)
Files already downloaded and verified
Files already downloaded and verified
Quantized model: evaluation Acc@1 65.250 Acc@5 93.990
测试定点模型精度并确认符合要求后,便可以进行模型编译、性能测试和可视化。
######################################################################
# Users may adjust the following parameter as needed.
# 1. Optimization level used for compilation. A higher level yields a
#    model that runs faster on the board, but compilation takes longer.
compile_opt = 1
######################################################################

# Both steps below operate on the same .hbm file, so build its path once.
hbm_path = os.path.join(model_path, "model.hbm")

# Compile the model. The `march` configured in the calibration section is
# reused instead of a hard-coded March.NASH_E so that compilation targets
# the same platform as set_march().
hb4.compile(
    hbir_quantized_model,
    hbm_path,
    march,
    opt=compile_opt,
)

# Run the performance test on the compiled model; results are written
# under model_path.
hb4.hbm_perf(
    hbm_path,
    march,
    output_dir=model_path,
)
[2024-06-11 14:26:34.809] [warning] Performance information does not include operators in the model running on the CPU
[2024-06-11 14:26:34.809] [warning] Invalid debug data size, we will not use debug data.
[2024-06-11 14:26:34.809] [warning] Invalid debug data size, we will not use debug data.
FPS=11624.299999999999, latency = 86 us, DDR = 2576640 bytes (see model/mobilenetv2/forward.html)
HBDK hbm perf SUCCESS
[06h:26m:34s:692209us INFO hbrt4_loader::parsing] pid:107773 tid:107773 hbrt4_loader/src/parsing.rs:31: Load hbm header from file; filename="model/mobilenetv2/model.hbm"
[06h:26m:34s:694516us INFO hbrt4_log::logger] pid:107773 tid:107773 hbrt4_log/src/logger.rs:388: Logger of HBRT4 initialized, version = 4.0.21.post0.dev202406040731+3f96886
[06h:26m:34s:694525us INFO hbrt4_loader::parsing] pid:107773 tid:107773 hbrt4_loader/src/parsing.rs:53: Load hbm from file; filename="model/mobilenetv2/model.hbm"
0
# Visualize the fixed-point HBIR model.
hb4.visualize(hbir_quantized_model)