We will work through the following common NLP tasks: token classification, masked language modeling (as in BERT), text summarization, translation, causal language modeling (as in the GPT family), and question answering.
Token classification assigns a label to each word or token in a text, for example:
Named entity recognition (NER): find the entities in a sentence, such as persons, locations, or organizations.
Part-of-speech tagging (POS): mark each word in a sentence with its part of speech (noun, verb, adjective, etc.).
Chunking: find the tokens that belong to the same phrase (e.g. noun phrases or verb phrases).
Inspect the dataset:
from datasets import load_dataset
raw_datasets = load_dataset("conll2003")
print(raw_datasets) # 有三个数据集:train, validation, test
# DatasetDict({
# train: Dataset({
# features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
# num_rows: 14041
# })
# validation: Dataset({
# features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
# num_rows: 3250
# })
# test: Dataset({
# features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
# num_rows: 3453
# })
# })
print(raw_datasets["train"][0]["tokens"]) # 训练集第零个样本的单词序列
# ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
print(raw_datasets["train"][0]["ner_tags"]) # 训练集第零个样本的 label 序列
# [3, 0, 7, 0, 0, 0, 7, 0, 0]
ner_feature = raw_datasets["train"].features["ner_tags"]
print(ner_feature) # NER label 的名称
# Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)
print(ner_feature.feature.names) # label 编号从 0 ~ 8 对应的名称
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
# O 表示这个词不对应任何实体。
# B-PER/I-PER意味着这个词对应于人名实体的开头/内部。
# B-ORG/I-ORG 的意思是这个词对应于组织名称实体的开头/内部。
# B-LOC/I-LOC 指的是这个词对应于地名实体的开头/内部。
# B-MISC/I-MISC 表示该词对应于一个杂项实体的开头/内部。
Our text needs to be converted to token IDs before the model can understand it.
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(tokenizer.is_fast)
# True
## 注意,样本,如 raw_datasets["train"][0]["tokens"] 已经是完成了分词,因此这里无需分词步骤 (is_split_into_words 设为 True )
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True) # 执行 tokenization
print(inputs)
# {
# 'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102],
# 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# }
print(inputs.tokens())
# ['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
注意:单词 "lamb"
被分为两个子词 "la"
和 "##mb"
。这导致了 token
序列和标签序列之间的不匹配:标签序列只有 9
个元素,而 token
序列有 12
个 token
。我们需要扩展标签序列以匹配 token
序列 。这里有两条规则:
special token
(如,[CLS], [SEP]
)的标签为 -100
。这是因为默认情况下 -100
是一个在我们将使用的损失函数(交叉熵)中被忽略的 index
。
每个 token
都会获得与其所在单词相同的标签,因为它们是同一实体的一部分。对于单词内部但不在开头的 token
,我们将 B-
替换为 I-
,因为单词内部的 token
不是实体的开头。
xxxxxxxxxx
def align_labels_with_tokens(labels, word_ids):
# labels : word_id -> label 的映射
# word_ids: 每个 token 对应的 word_id
new_labels = []
current_word = None
for word_id in word_ids:
if word_id != current_word: # 遇到一个新的单词
current_word = word_id
label = -100 if word_id is None else labels[word_id]
new_labels.append(label)
elif word_id is None: # special token
new_labels.append(-100)
else: # 同一个单词内部
label = labels[word_id]
if label % 2 == 1: # 如果 label 是 B-XXX,那么修改为 I-XXX
label += 1
new_labels.append(label)
return new_labels
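As a quick sanity check (this assumes the tokenizer and raw_datasets loaded above), we can run the function on the first training sample; the result matches the aligned label sequence shown for this sample further below:
word_ids = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True).word_ids()
labels = raw_datasets["train"][0]["ner_tags"]
print(align_labels_with_tokens(labels, word_ids))
# [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]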
To preprocess the whole dataset, we need to tokenize all the inputs and apply align_labels_with_tokens() to all the labels. To take advantage of the speed of our fast tokenizer, it is best to tokenize many texts at once, so we write a function that processes a list of samples and use Dataset.map() with batched=True.
def tokenize_and_align_labels(examples): # examples 是样本的列表
tokenized_inputs = tokenizer(
examples["tokens"], truncation=True, is_split_into_words=True
)
all_labels = examples["ner_tags"]
new_labels = []
for i, labels in enumerate(all_labels):
word_ids = tokenized_inputs.word_ids(i) # 获取第 i 个样本的 word_ids
new_labels.append(align_labels_with_tokens(labels, word_ids))
tokenized_inputs["labels"] = new_labels # 调整标签
return tokenized_inputs
tokenized_datasets = raw_datasets.map(
tokenize_and_align_labels,
batched=True, # 一次性处理 batch 的样本
remove_columns=raw_datasets["train"].column_names,
)
print(tokenized_datasets)
# DatasetDict({
# train: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
# num_rows: 14041
# })
# validation: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
# num_rows: 3250
# })
# test: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
# num_rows: 3453
# })
# })
print(tokenized_datasets["train"][0]["input_ids"]) # 训练集第零个样本的 token id 序列
# [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102]
print(tokenized_datasets["train"][0]["labels"]) # 训练集第零个样本的 label id 序列
# [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
print(tokenized_datasets["train"].features["labels"])
# Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)
The preprocessing above converts the input texts into input_ids (sequences of token IDs) in batches, but we still need to collate samples into mini-batches before feeding them to the model. DataCollatorForTokenClassification pads the inputs and the labels in the same way, so that input_ids and labels keep the same length. The labels are padded with -100, so that the corresponding predictions are ignored in the loss computation.
Note: input_ids are padded with 0, while labels are padded with -100.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
sample_list = [tokenized_datasets["train"][i] for i in range(3)] # 3 个样本,每个样本的序列长度不同
print([len(i['labels']) for i in sample_list])
# [12, 4, 11]
batch = data_collator(sample_list)
print(batch)
# {'input_ids':
# tensor([[ 101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913,
# 119, 102],
# [ 101, 1943, 14428, 102, 0, 0, 0, 0, 0, 0,
# 0, 0],
# [ 101, 26660, 13329, 12649, 15928, 1820, 118, 4775, 118, 1659,
# 102, 0]]),
# 'token_type_ids':
# tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
# 'attention_mask':
# tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]),
# 'labels':
# tensor([[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100],
# [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100],
# [-100, 5, 6, 6, 6, 0, 0, 0, 0, 0, -100, -100]])}
The traditional framework for evaluating token classification is seqeval. To use this metric, we first need to install the seqeval library:
pip install seqeval
Then we can load it with the load_metric() function:
from datasets import load_metric
metric = load_metric("seqeval")
This evaluation framework expects the labels as strings rather than integers, so we need to decode the predictions and labels before passing them to it. Let's see how it works.
First, we get the labels of the first training sample:
labels = raw_datasets["train"][0]["ner_tags"]
print(labels)
# [3, 0, 7, 0, 0, 0, 7, 0, 0]
label_names = raw_datasets["train"].features["ner_tags"].feature.names
labels = [label_names[i] for i in labels]
print(labels)
# ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
Then we can create "fake" predictions by modifying the labels:
predictions = labels.copy() # 注意,需要执行深层拷贝使得 predictions 和 labels 不共享底层数据
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])
metric.compute takes the predictions for a set of samples (not just a single sample) together with the corresponding labels. The output is:
{'MISC': {'precision': 1.0, 'recall': 0.5, 'f1': 0.6666666666666666, 'number': 2},
'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
'overall_precision': 1.0,
'overall_recall': 0.6666666666666666,
'overall_f1': 0.8,
'overall_accuracy': 0.8888888888888888}
We can see that it returns the precision, recall, and F1 score for each individual entity type (such as "MISC" and "ORG") as well as overall. For each entity type it also returns the number of occurrences ("number"), and for the overall scores it additionally returns the accuracy.
To have the Trainer compute a metric every epoch, we need to define a compute_metrics() function that takes the predictions and labels and returns a dictionary mapping metric names to values.
This compute_metrics() first applies argmax to the logits to obtain the predictions (there is no need for a softmax, since we only need the class with the highest probability, which is also the one with the highest logit). We then convert both predictions and labels from integers to strings, dropping the labels equal to -100 and the predictions at the corresponding positions, and finally pass the prediction strings and label strings to metric.compute():
import numpy as np
def compute_metrics(eval_preds):
logits, labels = eval_preds
predictions = np.argmax(logits, axis=-1)
# 移除 label = -100 位置的数据 (包括 token 和 label)
true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
true_predictions = [
[label_names[p] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(predictions, labels)
]
all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
return {
"precision": all_metrics["overall_precision"],
"recall": all_metrics["overall_recall"],
"f1": all_metrics["overall_f1"],
"accuracy": all_metrics["overall_accuracy"],
}
Since we are working on a token classification problem, we will use the AutoModelForTokenClassification class.
First we define two dictionaries, id2label and label2id, which hold the mappings between label IDs and label names in both directions:
id2label = {i: label for i, label in enumerate(label_names)}
print(id2label)
# {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
label2id = {v: k for k, v in id2label.items()}
print(label2id)
# {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
Then we load the pretrained model:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = AutoModelForTokenClassification.from_pretrained(
"bert-base-cased", id2label=id2label, label2id=label2id
)
print(model)
# BertForTokenClassification(
# (bert): BertModel(
# (embeddings): BertEmbeddings(
# ...
# )
# (encoder): BertEncoder(
# ...
# )
# )
# (dropout): Dropout(p=0.1, inplace=False)
# (classifier): Linear(in_features=768, out_features=9, bias=True)
#)
Note that the pretrained model must match the AutoTokenizer; here both use "bert-base-cased".
Creating the model emits a warning that some weights were not used (the weights of the pretraining head) and that some weights were randomly initialized (the weights of the new classification head). Since we are about to train this model, both warnings can be ignored.
We can inspect the model configuration:
print(model.config)
# BertConfig {
# "_name_or_path": "bert-base-cased",
# "architectures": [
# "BertForMaskedLM"
# ],
# "attention_probs_dropout_prob": 0.1,
# "classifier_dropout": null,
# "gradient_checkpointing": false,
# "hidden_act": "gelu",
# "hidden_dropout_prob": 0.1,
# "hidden_size": 768,
# "id2label": {
# ...
# },
# "initializer_range": 0.02,
# "intermediate_size": 3072,
# "label2id": {
# ...
# },
# "layer_norm_eps": 1e-12,
# "max_position_embeddings": 512,
# "model_type": "bert",
# "num_attention_heads": 12,
# "num_hidden_layers": 12,
# "pad_token_id": 0,
# "position_embedding_type": "absolute",
# "transformers_version": "4.25.1",
# "type_vocab_size": 2,
# "use_cache": true,
# "vocab_size": 28996
# }
Next we need to define the training arguments and log in to Hugging Face (if you do not want to push the results to the Hugging Face Hub, there is no need to log in; just set push_to_hub=False).
Log in:
huggingface-cli login
Define the training arguments:
from transformers import TrainingArguments
args = TrainingArguments(
"bert-finetuned-ner", # 输出模型的名称
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
push_to_hub=True, # 注意,如果你不想 push 到 HuggingFace,设置它为 False
)
print(args) # 参数含义可以参考前面对应章节的内容
# TrainingArguments(
# _n_gpu=0, # 如果机器上有 GPU,那么这里表示 GPU 数量
# adafactor=False,
# adam_beta1=0.9,
# adam_beta2=0.999,
# adam_epsilon=1e-08,
# auto_find_batch_size=False,
# bf16=False,
# bf16_full_eval=False,
# data_seed=None,
# dataloader_drop_last=False,
# dataloader_num_workers=0,
# dataloader_pin_memory=True,
# ddp_bucket_cap_mb=None,
# ddp_find_unused_parameters=None,
# ddp_timeout=1800,
# debug=[],
# deepspeed=None,
# disable_tqdm=False,
# do_eval=True,
# do_predict=False,
# do_train=False,
# eval_accumulation_steps=None,
# eval_delay=0,
# eval_steps=None,
# evaluation_strategy=epoch,
# fp16=False,
# fp16_backend=auto,
# fp16_full_eval=False,
# fp16_opt_level=O1,
# fsdp=[],
# fsdp_min_num_params=0,
# fsdp_transformer_layer_cls_to_wrap=None,
# full_determinism=False,
# gradient_accumulation_steps=1,
# gradient_checkpointing=False,
# greater_is_better=None,
# group_by_length=False,
# half_precision_backend=auto,
# hub_model_id=None,
# hub_private_repo=False,
# hub_strategy=every_save,
# hub_token=<HUB_TOKEN>,
# ignore_data_skip=False,
# include_inputs_for_metrics=False,
# jit_mode_eval=False,
# label_names=None,
# label_smoothing_factor=0.0,
# learning_rate=2e-05,
# length_column_name=length,
# load_best_model_at_end=False,
# local_rank=-1,
# log_level=passive,
# log_level_replica=passive,
# log_on_each_node=True,
# logging_dir=bert-finetuned-ner/runs/Apr15_06-45-26_SHAUNHUA-MB0,
# logging_first_step=False,
# logging_nan_inf_filter=True,
# logging_steps=500,
# logging_strategy=steps,
# lr_scheduler_type=linear,
# max_grad_norm=1.0,
# max_steps=-1,
# metric_for_best_model=None,
# mp_parameters=,
# no_cuda=False,
# num_train_epochs=3,
# optim=adamw_hf,
# optim_args=None,
# output_dir=bert-finetuned-ner,
# overwrite_output_dir=False,
# past_index=-1,
# per_device_eval_batch_size=8,
# per_device_train_batch_size=8,
# prediction_loss_only=False,
# push_to_hub=True,
# push_to_hub_model_id=None,
# push_to_hub_organization=None,
# push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
# ray_scope=last,
# remove_unused_columns=True,
# report_to=['tensorboard'],
# resume_from_checkpoint=None,
# run_name=bert-finetuned-ner,
# save_on_each_node=False,
# save_steps=500,
# save_strategy=epoch,
# save_total_limit=None,
# seed=42,
# sharded_ddp=[],
# skip_memory_metrics=True,
# tf32=None,
# torchdynamo=None,
# tpu_metrics_debug=False,
# tpu_num_cores=None,
# use_ipex=False,
# use_legacy_prediction_loop=False,
# use_mps_device=False,
# warmup_ratio=0.0,
# warmup_steps=0,
# weight_decay=0.01,
# xpu_backend=None,
# )
Next we build the Trainer and start training:
from transformers import Trainer
trainer = Trainer(
model=model,
args=args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=tokenizer,
)
trainer.train()
# TrainOutput(global_step=5268, training_loss=0.06639557918455896,
# metrics={'train_runtime': 198.3512, 'train_samples_per_second': 212.366, 'train_steps_per_second': 26.559, 'total_flos': 923954298531210.0, 'train_loss': 0.06639557918455896, 'epoch': 3.0})
Training on a GPU is recommended; a CPU is far too slow. On a 2018 MacBook Pro (model A1990) training runs at 0.38 iterations/s, while on an RTX 4090 it runs at 26.48 iterations/s.
If push_to_hub=True, then every time the model is saved during training (here once per epoch), the trainer uploads the checkpoint to the Hugging Face Model Hub in the background. This way you can resume training on another machine. After training finishes, you can upload the final version of the model:
trainer.push_to_hub(commit_message="Training finish!")
The Trainer also creates and uploads a model card containing all the evaluation results. At this point you can test the model with the inference widget on the Hugging Face Model Hub and share it with others.
To use the fine-tuned model, specify its name (or local path):
from transformers import pipeline
token_classifier = pipeline( # model 参数指定模型名称(对应 HuggingFace Model Hub)、或者模型权重的本地路径
    "token-classification", model="./bert-finetuned-ner/checkpoint-5268", aggregation_strategy="simple"
)
token_classifier("Wuhan is a beautiful city in China.")
# [{'entity_group': 'LOC', 'score': 0.99749047, 'word': 'Wuhan', 'start': 0, 'end': 5},
# {'entity_group': 'LOC', 'score': 0.9990609, 'word': 'China', 'start': 29, 'end': 34}]
We can also write a custom training loop instead of using the Trainer API, which gives finer-grained control over the training process.
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import load_metric
from tqdm.auto import tqdm
import torch
##****************** 创建数据集 **********************
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
def align_labels_with_tokens(labels, word_ids):
# labels : word_id -> label 的映射
# word_ids: 每个 token 对应的 word_id
new_labels = []
current_word = None
for word_id in word_ids:
if word_id != current_word: # 遇到一个新的单词
current_word = word_id
label = -100 if word_id is None else labels[word_id]
new_labels.append(label)
elif word_id is None: # special token
new_labels.append(-100)
else: # 同一个单词内部
label = labels[word_id]
if label % 2 == 1: # 如果 label 是 B-XXX,那么修改为 I-XXX
label += 1
new_labels.append(label)
return new_labels
def tokenize_and_align_labels(examples): # examples 是样本的列表
tokenized_inputs = tokenizer(
examples["tokens"], truncation=True, is_split_into_words=True
)
all_labels = examples["ner_tags"]
new_labels = []
for i, labels in enumerate(all_labels):
word_ids = tokenized_inputs.word_ids(i) # 获取第 i 个样本的 word_ids
new_labels.append(align_labels_with_tokens(labels, word_ids))
tokenized_inputs["labels"] = new_labels # 调整标签
return tokenized_inputs
raw_datasets = load_dataset("conll2003")
tokenized_datasets = raw_datasets.map(
tokenize_and_align_labels,
batched=True, # 一次性处理 batch 的样本
remove_columns=raw_datasets["train"].column_names,
)
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, collate_fn=data_collator, batch_size=8,
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)
##****************** 创建模型 **********************
label_names = raw_datasets["train"].features["ner_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
model = AutoModelForTokenClassification.from_pretrained(
model_checkpoint, id2label=id2label, label2id=label2id,
)
##****************** 创建优化器 **********************
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)
##****************** 调用 accelerator **********************
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
##****************** 创建调度器 **********************
from transformers import get_scheduler
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader) # 注意,必须在 accelerator.prepare() 之后在调用数据集的 len(...),因为 accelerator.prepare() 可能改变数据集的长度
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
##****************** 创建 Repo **********************
from huggingface_hub import Repository, get_full_repo_name
# model_name = "bert-finetuned-ner-accelerate"
# repo_name = get_full_repo_name(model_name)
# print(repo_name)
output_dir = "bert-finetuned-ner-accelerate"
# repo = Repository(output_dir, clone_from=repo_name) # 也可以忽略这一步
##****************** 定义 metric 、以及后处理函数 **********************
metric = load_metric("seqeval")
def postprocess(predictions, labels): # 将模型输出和标签转换为 metric 可接受的格式
predictions = predictions.detach().cpu().clone().numpy()
labels = labels.detach().cpu().clone().numpy()
# 移除 label = -100 位置的数据 (包括 token 和 label)
true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
true_predictions = [
[label_names[p] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(predictions, labels)
]
return true_labels, true_predictions
##****************** 训练流程 **********************
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
# Training
model.train()
for batch in train_dataloader:
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step() # 参数更新
lr_scheduler.step() # 学习率更新
optimizer.zero_grad() # 梯度清零
progress_bar.update(1)
# Evaluation
model.eval()
for batch in eval_dataloader:
with torch.no_grad():
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1)
labels = batch["labels"]
# 在多进程场景下,可能两个进程将 predictions/labels 在进程内部对齐,但是在进程间不一致,这里需要跨进程对齐
predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
predictions_gathered = accelerator.gather(predictions) # 跨进程收集
labels_gathered = accelerator.gather(labels)
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
metric.add_batch(predictions=true_predictions, references=true_labels)
results = metric.compute() # 计算验证集的指标
print(f"epoch {epoch}:",
{ key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]},
)
# epoch 0: {'precision': 0.930, 'recall': 0.904, 'f1': 0.917, 'accuracy': 0.982}
# epoch 1: {'precision': 0.944, 'recall': 0.912, 'f1': 0.928, 'accuracy': 0.985}
# epoch 2: {'precision': 0.948, 'recall': 0.928, 'f1': 0.938, 'accuracy': 0.987}
##****************** 保存和上传模型 **********************
accelerator.wait_for_everyone() # 所有进程都到达这个阶段然后继续执行,如果任何一个进程未到达,则所有其他进程都要等待
unwrapped_model = accelerator.unwrap_model(model) # 获取底层的模型,因为 accelerator 可能在分布式环境中工作
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save) # 用 accelerator.save 来保存
if accelerator.is_main_process: # 只有主进程才执行下面的步骤
tokenizer.save_pretrained(output_dir) # 保存 tokenizer
# repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False) # blocking = False 表示异步推送
First we pick a suitable pretrained model for masked language modeling, such as the "bert-base-cased" checkpoint used above.
from transformers import AutoModelForMaskedLM
model_checkpoint = "bert-base-cased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
num_parameters = model.num_parameters() / 1_000_000
print(f"BERT_Base number of parameters: {round(num_parameters)}M'")
# BERT_Base number of parameters: 108M'
Now let's see how BERT_Base fills in a masked word:
from transformers import AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
text = "WuHan City a great [MASK]."
inputs = tokenizer(text, return_tensors="pt")
print(inputs)
# {
# 'input_ids': tensor([[ 101, 8769, 3048, 1389, 1392, 170, 1632, 103, 119, 102]]),
# 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
# 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
# }
token_logits = model(**inputs).logits # shape: [1, 10, 28996]
##************ 找到 [MASK] 的位置并抽取它的 logits ***********
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
print(mask_token_index) # tokenizer.mask_token_id = 103
# tensor([7])
mask_token_logits = token_logits[0, mask_token_index, :] # shape: [1, 28996]
##************ 返回 [MASK] 的 top-k 候选 ***********
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
# '>>> WuHan City a great city.'
# '>>> WuHan City a great place.'
# '>>> WuHan City a great town.'
# '>>> WuHan City a great village.'
# '>>> WuHan City a great name.'
Loading the dataset: we fine-tune BERT_Base on the Large Movie Review Dataset (IMDb). The dataset has three splits: train, test, and unsupervised.
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
print(imdb_dataset)
# DatasetDict({
# train: Dataset({
# features: ['text', 'label'],
# num_rows: 25000
# })
# test: Dataset({
# features: ['text', 'label'],
# num_rows: 25000
# })
# unsupervised: Dataset({
# features: ['text', 'label'],
# num_rows: 50000
# })
# })
Data processing: for both auto-regressive and masked language modeling, a common preprocessing step is to concatenate all the samples and then split the whole corpus into blocks of equal size. We also keep the word-id sequences so that they can later be used for whole word masking.
result = tokenizer("Welcome to WuHan City", is_split_into_words=False) # 执行 tokenization
print(result)
# {
# 'input_ids': [101, 12050, 1106, 8769, 3048, 1389, 1392, 102],
# 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0],
# 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]
# }
print(result.word_ids()) # 每个 token 对应的 word id
# [None, 0, 1, 2, 2, 2, 3, None]
We also drop the text and label fields, since they are no longer needed. We write a function that does all of this:
def tokenize_function(examples): # examples 是一个 batch 的样本
result = tokenizer(examples["text"]) # result 包含 batch 结果
if tokenizer.is_fast: # result.word_ids(i) 返回第 i 个样本的 word id 序列
result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
return result
# 使用 batched=True
tokenized_datasets = imdb_dataset.map(
tokenize_function, batched=True, remove_columns=["text", "label"]
)
print(tokenized_datasets) # word_ids 列是我们人工添加的
# DatasetDict({
# train: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
# num_rows: 25000
# })
# test: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
# num_rows: 25000
# })
# unsupervised: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
# num_rows: 50000
# })
# })
Tokenization is now done. The next step is to concatenate the tokenized texts and split them into chunks. How should the chunk size be chosen? It depends mainly on the amount of GPU memory available; it is also bounded by the model's maximum context length, which can be read from the tokenizer.model_max_length attribute:
print(tokenizer.model_max_length)
# 512
Then we concatenate the texts and split them into chunks of chunk_size tokens.
def group_texts(examples, chunk_size = 128):
# keys() 为 ('input_ids', 'token_type_ids', 'attention_mask', 'word_ids')
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} # 拼接样本
total_length = len(concatenated_examples[list(examples.keys())[0]]) # 计算总的 token 长度
total_length = (total_length // chunk_size) * chunk_size # 移除最后一个小于 chunk_size 的块(也可以填充最后一个块到 chunk_size 长度)
result = {
k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] # 执行分块
for k, t in concatenated_examples.items()
}
result["labels"] = result["input_ids"].copy() # label 就是 input token 序列,因为 MLM 的 label 就是被掩码的 token
return result
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
print(lm_datasets)
# DatasetDict({
# train: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
# num_rows: 63037
# })
# test: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
# num_rows: 61623
# })
# unsupervised: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
# num_rows: 126497
# })
# })
Taking the training split as an example, there are now more samples than the original 25k, because each sample is now a block of contiguous tokens rather than one of the original sentiment-classification examples.
One key step is still missing: inserting [MASK] tokens at random positions in the input. This needs to happen dynamically during training rather than being prepared statically ahead of time.
As mentioned above, we need to insert [MASK] tokens at random positions dynamically during training. This requires a special data collator that randomly masks some tokens in the input on the fly: DataCollatorForLanguageModeling. We pass it the mlm_probability argument to specify the fraction of tokens to mask. We choose 15%, the value commonly used in the literature:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
_ = sample.pop("word_ids") # 移除 word_ids ,否则 data_collator(samples) 抛出异常
for chunk in data_collator(samples)["input_ids"]:
print(f"\n'>>> {tokenizer.decode(chunk)}'")
# >>> [CLS] I rented I AM [MASK] deliberate [MASK]US - YEL [MASK]OW from my video store because of all thedating that surrounded it when it [MASK] first released in 1967. I also heard [MASK] [MASK] first it was seized by U. S. [MASK] if it ever tried to enter this [MASK], [MASK] being a fan of films [MASK] " controversial " I really had to see this for myself. < br / [MASK] < br / > The plot [MASK] centered around a young Swedish drama student named Lena who neighbouring to learn everything she can about [MASK]. In particular [MASK] wants [MASK] focus her attention [MASK] to making some sort of documentary on [MASK] the average Swed
# >>> ##e thought about [MASK] political [MASK] such as [MASK] Vietnam War and [MASK] issues in [MASK] [MASK] States. In between asking politicians [MASK] ordinary [MASK] [MASK]mony of Stockholm about their opinions on politics, she [MASK] [MASK] with [MASK] drama [MASK] [MASK] classmates, [MASK] married men. [MASK] br / Quaker < br / > What kills me about I AM CURIOUS - YELLOW is that 40 years ago, this was considered pornographic. Really [MASK] [MASK] sex and nudi [MASK] scenes are few and far between [MASK] even then it's not shot like some cheaply made [MASK]orno [MASK] While my countrymen mind find it shocking, in [MASK]
One side effect of random masking is that, when using the Trainer, the evaluation metric becomes stochastic (each evaluation run may give a slightly different result), because the test set also goes through the same DataCollatorForLanguageModeling. However, by writing a custom training loop with Accelerate (instead of using the Trainer's built-in loop), we can freeze this source of randomness.
Whole word masking (WWM): whole word masking masks entire words rather than individual tokens inside a word. To use whole word masking we need to build our own data collator. Here we make use of the word_ids computed earlier, which give the word id of each token. Note that all labels are set to -100 except the ones corresponding to masked tokens.
import collections
import numpy as np
from transformers import default_data_collator
wwm_probability = 0.2
def whole_word_masking_data_collator(features):
for feature in features:
word_ids = feature.pop("word_ids")
mapping = collections.defaultdict(list) # 存放 word_id 到它包含的 token id list 的映射
current_word_index = -1
current_word = None
for idx, word_id in enumerate(word_ids):
if word_id is not None:
if word_id != current_word:
current_word = word_id
current_word_index += 1
mapping[current_word_index].append(idx)
# 随机掩码 word
mask = np.random.binomial(1, wwm_probability, (len(mapping),)) # 注意,单个元素的元组 (xxx, )
input_ids = feature["input_ids"]
labels = feature["labels"]
new_labels = [-100] * len(labels) # 默认全为 -100
for word_id in np.where(mask)[0]: # np.where(mask) 返回一个元组,
word_id = word_id.item()
for idx in mapping[word_id]: # 被掩码的单词所对应的 token_id
new_labels[idx] = labels[idx]
input_ids[idx] = tokenizer.mask_token_id
feature["labels"] = new_labels
return default_data_collator(features)
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)
for chunk in batch["input_ids"]:
print(f"\n'>>> {tokenizer.decode(chunk)}'")
# >>> [CLS] I rented I AM [MASK] [MASK] [MASK] [MASK] - YELLOW from my [MASK] store because of all the controversy that [MASK] it when it was first released [MASK] [MASK]. [MASK] also heard that at first it was [MASK] [MASK] U [MASK] S [MASK] [MASK] if it [MASK] tried to enter this country, therefore [MASK] a fan [MASK] [MASK] [MASK] " [MASK] " I really had [MASK] see this for myself [MASK] [MASK] br / [MASK] < br / > The plot [MASK] [MASK] around a young [MASK] drama student [MASK] Lena who wants to learn everything she [MASK] [MASK] life. In particular she wants to [MASK] her attentions to making some sort of documentary [MASK] what [MASK] average Swed
# >>> ##e thought about certain political issues such as the Vietnam War and race issues in [MASK] United States [MASK] In between asking politicians and ordinary denizens of [MASK] about their opinions on politics, [MASK] has [MASK] with [MASK] [MASK] teacher [MASK] [MASK], and married men. [MASK] [MASK] / [MASK] < br [MASK] > What kills me about I [MASK] CURIOUS - [MASK] [MASK] [MASK] [MASK] is that 40 years ago, this was considered pornographic. Really, the sex [MASK] nudity scenes are few and [MASK] [MASK] [MASK] even then it's [MASK] [MASK] like some cheaply made [MASK] [MASK] [MASK]. [MASK] my countrymen mind find [MASK] [MASK], in [MASK]
Downsampling the dataset: for the purposes of this demo, we shrink the training set to a smaller subset (10,000 training samples).
train_size = 10_000
test_size = int(0.1 * train_size)
downsampled_dataset = lm_datasets["train"].train_test_split(
train_size=train_size, test_size=test_size, seed=42
)
print(downsampled_dataset)
# DatasetDict({
# train: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
# num_rows: 10000
# })
# test: Dataset({
# features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
# num_rows: 1000
# })
# })
Configuring the Trainer: next we can log in to the Hugging Face Hub (optional; run huggingface-cli login on the command line).
from transformers import TrainingArguments
batch_size = 64
logging_steps = len(downsampled_dataset["train"]) // batch_size # 每个 epoch 打印 training loss
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
output_dir=f"{model_name}-finetuned-imdb",
overwrite_output_dir=True,
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
weight_decay=0.01,
num_train_epochs=3,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
push_to_hub=False, # 这里暂时先不 push 到 hub
fp16=True, # 混合精度训练从而加速训练过程
logging_steps=logging_steps, # 设置 logging_steps
remove_unused_columns = False, # 用于 WWM data_collator
)
from transformers import Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=downsampled_dataset["train"],
eval_dataset=downsampled_dataset["test"],
# data_collator=data_collator,
data_collator=whole_word_masking_data_collator # WWM data_collator
)
By default, the Trainer removes any columns that are not arguments of the model's forward() method. This means that if you use the WWM data collator, you also need to set remove_unused_columns=False so that the word_ids column is not dropped during training.
Perplexity of a language model: a good language model assigns high probability to grammatical sentences and low probability to nonsensical ones. Perplexity is one way to quantify this. There are several mathematical definitions of perplexity; here we use the exponential of the cross-entropy loss. We can therefore compute the cross-entropy loss on the test set with Trainer.evaluate() and take the exponential of the result to get the perplexity of the pretrained model:
import math
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
# >>> Perplexity: 39.75
trainer.train()
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
# >>> Perplexity: 22.11
trainer.push_to_hub()
tokenizer.save_pretrained(training_args.output_dir) # 保存 tokenizer
A lower perplexity score means a better language model. We can see that the perplexity dropped substantially, which indicates the model has picked up some knowledge about the movie-review domain.
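Spelled out as a formula (restating the definition used above; for masked language modeling the sum runs only over the masked positions):
perplexity = exp( -(1/N) * Σ_i log p(x_i | context_i) ) = exp(mean cross-entropy loss)
so math.exp(eval_loss) above computes exactly this quantity.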
Using the model: the fine-tuned model can now be called through the Transformers pipeline:
from transformers import pipeline
mask_filler = pipeline( # 注意,tokenizer 也需要保存在这个目录下
"fill-mask", model="./bert-base-cased-finetuned-imdb/checkpoint-471"
)
preds = mask_filler("WuHan City a great [MASK].")
for pred in preds:
print(f">>> {pred['sequence']}")
# >>> WuHan City a great city.
# >>> WuHan City a great place.
# >>> WuHan City a great town.
# >>> WuHan City a great one.
# >>> WuHan City a great name.
DataCollatorForLanguageModeling applies a new random mask at every evaluation, so each training run shows some fluctuation in the perplexity score. One way to remove this source of randomness is to apply the masking once over the whole test set and then use the default data collator from Transformers during evaluation.
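A minimal sketch of that idea, assuming the whole_word_masking_data_collator and downsampled_dataset defined above (the complete script below does the same thing with an insert_random_mask helper):
from transformers import default_data_collator
def mask_eval_set_once(batch):
    # run the random-masking collator a single time, so the eval set becomes static
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked = whole_word_masking_data_collator(features)
    return {"masked_" + k: v.numpy() for k, v in masked.items()}
eval_dataset = downsampled_dataset["test"].map(
    mask_eval_set_once, batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# afterwards, evaluate with default_data_collator so no further masking is applied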
Note that the custom training loop creates the dataloaders by hand, whereas the Trainer API only needs the datasets and builds the dataloaders itself.
The complete code is shown below:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch
import math
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import collections
import numpy as np
from transformers import default_data_collator
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
##********** 加载 pre-trained model, tokenizer 和数据集 ********
model_checkpoint = "bert-base-cased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
imdb_dataset = load_dataset("imdb")
##********* tokenization ****************
def tokenize_function(examples): # examples 是一个 batch 的样本
result = tokenizer(examples["text"]) # result 包含 batch 结果
if tokenizer.is_fast: # result.word_ids(i) 返回第 i 个样本的 word id 序列
result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
return result
tokenized_datasets = imdb_dataset.map(
tokenize_function, batched=True, remove_columns=["text", "label"]
)
##********* 分块 ****************
def group_texts(examples, chunk_size = 128):
# keys() 为 ('input_ids', 'token_type_ids', 'attention_mask', 'word_ids')
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} # 拼接样本
total_length = len(concatenated_examples[list(examples.keys())[0]]) # 计算总的 token 长度
total_length = (total_length // chunk_size) * chunk_size # 移除最后一个小于 chunk_size 的块(也可以填充最后一个块到 chunk_size 长度)
result = {
k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] # 执行分块
for k, t in concatenated_examples.items()
}
result["labels"] = result["input_ids"].copy() # label 就是 input token 序列,因为 MLM 的 label 就是被掩码的 token
return result
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
train_size = 10_000
test_size = int(0.1 * train_size)
downsampled_dataset = lm_datasets["train"].train_test_split( # demo: 减小数据规模
train_size=train_size, test_size=test_size, seed=42
)
##********** 创建 WWM data collator ***********
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
wwm_probability = 0.2
def whole_word_masking_data_collator(features):
for feature in features:
word_ids = feature.pop("word_ids")
mapping = collections.defaultdict(list) # 存放 word_id 到它包含的 token id list 的映射
current_word_index = -1
current_word = None
for idx, word_id in enumerate(word_ids):
if word_id is not None:
if word_id != current_word:
current_word = word_id
current_word_index += 1
mapping[current_word_index].append(idx)
# 随机掩码 word
mask = np.random.binomial(1, wwm_probability, (len(mapping),)) # 注意,单个元素的元组 (xxx, )
input_ids = feature["input_ids"]
labels = feature["labels"]
new_labels = [-100] * len(labels) # 默认全为 -100
for word_id in np.where(mask)[0]: # np.where(mask) 返回一个元组,
word_id = word_id.item()
for idx in mapping[word_id]: # 被掩码的单词所对应的 token_id
new_labels[idx] = labels[idx]
input_ids[idx] = tokenizer.mask_token_id
feature["labels"] = new_labels
return default_data_collator(features)
##************ 对测试集进行静态掩码 ***************
def insert_random_mask(batch):
features = [dict(zip(batch, t)) for t in zip(*batch.values())]
masked_inputs = whole_word_masking_data_collator(features)
# 对于数据集中的每一列,创建一个对应的 masked 列
return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}
# downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"]) # 如果是 whole_word_masking_data_collator 则注释掉这一行
eval_dataset = downsampled_dataset["test"].map(
insert_random_mask,
batched=True,
remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
{
"masked_input_ids": "input_ids",
"masked_attention_mask": "attention_mask",
"masked_labels": "labels",
}
).remove_columns(["masked_token_type_ids"]) # 移除一些不需要的列
##************ 创建 data loader ***************
batch_size = 64
train_dataloader = DataLoader(
downsampled_dataset["train"],
shuffle=True,
batch_size=batch_size,
collate_fn=whole_word_masking_data_collator,
)
eval_dataloader = DataLoader(
eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)
##************ 创建训练组件***************
optimizer = AdamW(model.parameters(), lr=5e-5)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
##********** 在 Hugging Face Hub 上创建一个模型库 (可以忽略) **********
# from huggingface_hub import get_full_repo_name
model_name = "%s-finetuned-imdb-accelerate"%model_checkpoint
# repo_name = get_full_repo_name(model_name)
# repo_name
# rom huggingface_hub import Repository
output_dir = model_name
# repo = Repository(output_dir, clone_from=repo_name)
##************ 训练和评估 ****************
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
# Training
model.train()
for batch in train_dataloader:
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
# Evaluation
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
losses.append(accelerator.gather(loss)) # 跨进程收集每个样本的 loss
losses = torch.cat(losses)
losses = losses[: len(eval_dataset)] # 获取验证集每个样本的 loss
try:
perplexity = math.exp(torch.mean(losses))
except OverflowError:
perplexity = float("inf")
print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
# >>> Epoch 0: Perplexity: 22.54525292335159
# >>> Epoch 1: Perplexity: 21.186613045279536
# >>> Epoch 2: Perplexity: 20.757056615284373
##*********** 保存、上传微调好的模型 ****************
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save) # 用 accelerator.save
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir)
# repo.push_to_hub(
# commit_message=f"Training in progress epoch {epoch}", blocking=False
# )
Here we take a different approach and train a brand-new causal language model from scratch. To keep the data small enough for a demo, we use a subset of Python code and focus on single-line code completion (rather than completing whole functions or classes).
Loading the dataset: the CodeParrot dataset was built from Google's BigQuery GitHub dump of roughly 180 GB, containing about 20M Python files. It was created with the following query:
SELECT
f.repo_name, f.path, c.copies, c.size, c.content, l.license
FROM
`bigquery-public-data.github_repos.files` AS f
JOIN
`bigquery-public-data.github_repos.contents` AS c
ON
f.id = c.id
JOIN
`bigquery-public-data.github_repos.licenses` AS l
ON
f.repo_name = l.repo_name
WHERE
NOT c.binary AND ((f.path LIKE '%.py') AND (c.size BETWEEN 1024 AND 1048575))
For this demo we further restrict ourselves to the subset related to the Python data-science stack, using a filter function:
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
def any_keyword_in_string(string, keywords):
for keyword in keywords:
if keyword in string:
return True
return False
Then we use this filter function to filter the dataset in streaming mode:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset

def filter_streaming_dataset(dataset, filters):
filtered_dict = defaultdict(list)
total = 0
for sample in tqdm(iter(dataset)):
total += 1
if any_keyword_in_string(sample["content"], filters):
for k, v in sample.items():
filtered_dict[k].append(v)
print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.")
return Dataset.from_dict(filtered_dict)
from datasets import load_dataset
split = "train" # "valid"
data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
filtered_data = filter_streaming_dataset(data, filters)
# 3.26% of data after filtering.
Loading and filtering the dataset this way takes a very long time, possibly several hours. After filtering, about 3% of the original data remains, which is still about 6 GB and roughly 600k Python files. Hugging Face provides the already-filtered dataset:
from datasets import load_dataset, DatasetDict
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
raw_datasets = DatasetDict( { "train": ds_train, "valid": ds_valid })
print(raw_datasets)
# DatasetDict({
# train: Dataset({
# features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
# num_rows: 606720
# })
# valid: Dataset({
# features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
# num_rows: 3322
# })
# })
Tokenization: the first step is to tokenize the dataset. Since our goal is single-line code completion, we can use a fairly short context, which lets us train faster and with less memory. If you need a longer context (e.g. to complete functions or classes, or to generate unit tests), choose a larger context size.
Here we choose a context of 128 tokens (for comparison, GPT-2 uses 1024 tokens and GPT-3 uses 2048). Most documents contain more than 128 tokens, so simply truncating would throw away most of the data. Instead, we tokenize with return_overflowing_tokens so that each document is split into several chunks, and with return_length so that each chunk's length is returned. The last chunk is usually shorter than the context size; we drop those chunks to avoid padding, since we have plenty of data anyway.
from transformers import AutoTokenizer
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
##********** 如果 return_overflowing_tokens 未设置 **********
outputs = tokenizer(
raw_datasets["train"][:2]["content"],
truncation=True,
max_length=context_length,
)
print(f"Input IDs length: {len(outputs['input_ids'])}")
# Input IDs length: 2
##********** 如果 return_overflowing_tokens 被设置 **********
outputs = tokenizer(
raw_datasets["train"][:2]["content"],
truncation=True,
max_length=context_length,
return_overflowing_tokens=True,
return_length=True,
)
print(f"Input IDs length: {len(outputs['input_ids'])}")
# Input IDs length: 34
print(f"Input chunk lengths: {(outputs['length'])}")
# Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117,
# 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
# Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
We can see that the two samples yield 34 chunks, where:
outputs['input_ids'] holds the token IDs of each chunk.
outputs['length'] holds the length of each chunk. The final chunk of each document is shorter than 128 tokens (117 and 41 here). Since such chunks make up only a small fraction of the data, we can safely drop them.
outputs['overflow_to_sample_mapping'] records which sample each chunk came from.
We then wrap the code above in a function and call it via Dataset.map():
def tokenize(element):
outputs = tokenizer(
element["content"],
truncation=True,
max_length=context_length,
return_overflowing_tokens=True,
return_length=True,
)
input_batch = []
for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
if length == context_length: # 过滤掉长度小于 context_length 的块
input_batch.append(input_ids)
return {"input_ids": input_batch}
tokenized_datasets = raw_datasets.map(
tokenize, batched=True,
remove_columns=raw_datasets["train"].column_names # 我们只需要 input_ids 列,因此移除所有其它的列
)
print(tokenized_datasets)
# DatasetDict({
# train: Dataset({
# features: ['input_ids'],
# num_rows: 16702061
# })
# valid: Dataset({
# features: ['input_ids'],
# num_rows: 93164
# })
# })
We now have 16.70M samples of 128 tokens each, about 2.1B tokens in total. For reference, OpenAI's GPT-3 and Codex models were trained on 300B and 100B tokens respectively, with Codex initialized from a GPT-3 checkpoint.
We initialize a GPT-2 model. We use the same configuration as GPT-2, making sure the vocabulary size matches the tokenizer and setting bos_token_id and eos_token_id. With this configuration we instantiate a fresh model. Note that this is the first time we do not use from_pretrained(), because we are initializing the model ourselves:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
config = AutoConfig.from_pretrained(
"gpt2",
vocab_size=len(tokenizer),
n_ctx=context_length,
bos_token_id=tokenizer.bos_token_id, # 句子开始的 token
eos_token_id=tokenizer.eos_token_id, # 句子结束的 token
)
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")
# GPT-2 size: 124.2M parameters
The model has 124.2M parameters.
Before we start training, we need a data collator that builds the batches. We can use DataCollatorForLanguageModeling, which is designed specifically for language modeling. Besides batching and padding, it also creates the language-modeling labels: in causal language modeling the inputs also serve as the labels (just shifted by one position; the shift itself happens inside the model when it computes the loss), and this data collator creates the labels on the fly during training, so we do not need to duplicate input_ids.
Note that DataCollatorForLanguageModeling supports both masked language modeling (MLM) and causal language modeling (CLM). By default it prepares data for MLM, but we can switch to CLM by passing mlm=False:
from transformers import DataCollatorForLanguageModeling
print(tokenizer.pad_token, tokenizer.eos_token)
# None <|endoftext|>
tokenizer.pad_token = tokenizer.eos_token # 利用 eos_token 来填充
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
Usage example:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
print(f"{key} shape: {out[key].shape}")
# input_ids shape: torch.Size([5, 128])
# attention_mask shape: torch.Size([5, 128])
# labels shape: torch.Size([5, 128])
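To make "shifted by one position" concrete: the collator copies input_ids into labels unchanged, and the shift happens inside the model when the loss is computed. A minimal sketch of that computation (standard causal-LM loss; the keytoken_weighted_loss function later in this section performs the same shift explicitly):
import torch.nn.functional as F
def causal_lm_loss(logits, labels):
    # position i predicts token i+1: drop the last logit and the first label
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    # cross_entropy ignores positions labeled -100 (padding) by default
    return F.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1)
    )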
Next we configure the TrainingArguments and launch the Trainer. We use a cosine learning-rate schedule with some warmup and an effective batch size of 256 (per_device_train_batch_size * gradient_accumulation_steps). gradient_accumulation_steps implements gradient accumulation: several forward and backward passes are performed for a single optimizer update, which emulates a larger batch size (a short sketch of this appears right after the training code below). We will meet it again when we write the training loop by hand with Accelerate.
from transformers import Trainer, TrainingArguments
args = TrainingArguments(
output_dir="codeparrot-ds",
per_device_train_batch_size=32,
per_device_eval_batch_size=32,
evaluation_strategy="steps", # 每隔若干个 step 执行一次评估
save_strategy="epoch", # 每个 epoch 保存一次
eval_steps=5_000,
logging_steps=5_000,
gradient_accumulation_steps=8, # 梯度累积
num_train_epochs=1,
weight_decay=0.1,
warmup_steps=1_000,
lr_scheduler_type="cosine",
learning_rate=5e-4,
save_steps=5_000,
fp16=True,
push_to_hub=False, # 暂时不推送到 HuggingFace Hub
)
trainer = Trainer(
model=model,
tokenizer=tokenizer,
args=args,
data_collator=data_collator,
train_dataset=tokenized_datasets["train"].select(range(100000)), # 为演示目的,仅用 10 万个 batch
eval_dataset=tokenized_datasets["valid"].select(range(10000)),
)
trainer.train()
# TrainOutput(
# global_step=390,
# training_loss=4.230728540665064,
# metrics={
# 'train_runtime': 235.341,
# 'train_samples_per_second': 424.915,
# 'train_steps_per_second': 1.657,
# 'total_flos': 6521849118720000.0,
# 'train_loss': 4.230728540665064,
# 'epoch': 1.0}
# )
tokenizer.save_pretrained("codeparrot-ds") # 保存 tokenizer
# trainer.push_to_hub() # 训练完成后,将模型和 tokenizer 推送到 HuggingFace Hub
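For reference, here is a minimal hand-written sketch of what gradient_accumulation_steps does under the hood (a hypothetical loop assuming a model, optimizer and train_dataloader; not part of the script above, since the Trainer handles this internally):
accumulation_steps = 8                       # plays the role of gradient_accumulation_steps
optimizer.zero_grad()
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(**batch).loss
    (loss / accumulation_steps).backward()   # scale so the accumulated gradient is an average
    if step % accumulation_steps == 0:       # one optimizer update per accumulation_steps batches
        optimizer.step()
        optimizer.zero_grad()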
Using the model: the trained model can now be called through the Transformers pipeline:
import torch
from transformers import pipeline
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
"text-generation", model="./codeparrot-ds/", device=device # 使用 GPU 加速生成过程
)
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)
# create scatter plot with x, y
"""
result = pipe(txt, num_return_sequences=1)
print(result)
# [{'generated_text': '# create some data\nx = np.random.randn(100)\ny = np.random.randn(100)\n\n# create scatter plot with x, y\ny = rng.randn(n_samples, (1,'}]
print(result[0]["generated_text"]) # 因为训练不充分,这里的结果仅供参考
Sometimes we want full control over the training loop, or we want to make some special changes; in that case we can build a custom training loop with Accelerate.
Data-science packages are full of characteristic keywords such as plt, pd, sk, fit, and predict. We only keep the keywords that are represented as a single token, and we also include the versions of those keywords prefixed with a space.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
keytoken_ids = []
for keyword in [ "plt", "pd", "sk", "fit", "predict",
" plt", " pd", " sk", " fit", " predict",
]:
ids = tokenizer([keyword]).input_ids[0]
if len(ids) == 1: # 仅考虑单个 token 的关键字
keytoken_ids.append(ids[0])
else:
print(f"Keyword has not single token: {keyword}")
print(keytoken_ids)
# [8436, 4289, 1201, 2770, 5431, 2564, 2604, 2110, 2872, 4969]
We can compute the loss of each sample and count how many of the keywords occur in it, then weight each sample's loss by that count so that the model pays more attention to samples containing many keywords. This custom loss function takes the input sequences, the logits, and the keyword tokens we just selected, and returns a keyword-frequency-weighted loss:
from torch.nn import CrossEntropyLoss
import torch
def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
shift_labels = inputs[..., 1:].contiguous() # input 序列第 i+1 位就是第 i 个标签
shift_logits = logits[..., :-1, :].contiguous() # 最后一个位置不需要预测,因为没有 label
loss_fct = CrossEntropyLoss(reduce=False) # 用于计算 per-token 的损失
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1) # 该样本的平均损失
weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum( # 每个样本出现的所有关键字的数量
axis=[0, 2]
)
weights = alpha * (1.0 + weights)
weighted_loss = (loss_per_sample * weights).mean() # 计算 batch 的加权平均损失
return weighted_loss
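A quick smoke test with dummy tensors (hypothetical shapes, just to illustrate the expected inputs: inputs is a [batch, seq_len] tensor of token IDs, logits is [batch, seq_len, vocab_size]):
batch_size, seq_len, vocab_size = 2, 16, len(tokenizer)
dummy_inputs = torch.randint(0, vocab_size, (batch_size, seq_len))
dummy_logits = torch.randn(batch_size, seq_len, vocab_size)
print(keytoken_weighted_loss(dummy_inputs, dummy_logits, keytoken_ids))
# a scalar tensor; samples containing more keyword tokens receive a larger weight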
Load the dataset:
from datasets import load_dataset, DatasetDict
context_length = 128
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
raw_datasets = DatasetDict( { "train": ds_train, "valid": ds_valid })
def tokenize(element):
outputs = tokenizer(
element["content"],
truncation=True,
max_length=context_length,
return_overflowing_tokens=True,
return_length=True,
)
input_batch = []
for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
if length == context_length: # 过滤掉长度小于 context_length 的块
input_batch.append(input_ids)
return {"input_ids": input_batch}
tokenized_datasets = raw_datasets.map(
tokenize, batched=True,
remove_columns=raw_datasets["train"].column_names # 我们只需要 input_ids 列,因此移除所有其它的列
)
from torch.utils.data import DataLoader
tokenized_datasets.set_format("torch") # 设置为 torch 格式
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)
Setting up weight decay: we group the parameters so that the optimizer knows which of them get extra weight decay. Conventionally, bias terms and LayerNorm weights are excluded from weight decay:
weight_decay = 0.1
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
params_with_wd, params_without_wd = [], []
for name, param in model.named_parameters():
if any(nd in name for nd in no_decay): # 判断 bias 字符串是否出现在 parameter.name 中,因为 parameter.name 可能为 attention1.bias1
params_without_wd.append(param)
else:
params_with_wd.append(param)
return [
{"params": params_with_wd, "weight_decay": weight_decay},
{"params": params_without_wd, "weight_decay": 0.0},
]
Evaluation function: since we want to evaluate the model on the validation set periodically during training, we write a function for that as well. It simply runs eval_dataloader and gathers all the loss values across processes:
def evaluate():
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(batch["input_ids"], labels=batch["input_ids"])
losses.append(accelerator.gather(outputs.loss)) # 跨进程收集每个样本的 loss
loss = torch.mean(torch.cat(losses))
try:
perplexity = torch.exp(loss) # 计算困惑度
except OverflowError:
perplexity = float("inf")
return loss.item(), perplexity.item()
This evaluation function returns the loss and the perplexity.
The complete training script:
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
import torch
##************** 加载数据集 ********************
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
context_length = 128
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
raw_datasets = DatasetDict( { "train": ds_train, "valid": ds_valid })
def tokenize(element):
outputs = tokenizer(
element["content"],
truncation=True,
max_length=context_length,
return_overflowing_tokens=True,
return_length=True,
)
input_batch = []
for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
if length == context_length: # 过滤掉长度小于 context_length 的块
input_batch.append(input_ids)
return {"input_ids": input_batch}
tokenized_datasets = raw_datasets.map(
tokenize, batched=True,
remove_columns=raw_datasets["train"].column_names # 我们只需要 input_ids 列,因此移除所有其它的列
)
batch_size = 32
tokenized_datasets.set_format("torch") # 设置为 torch 格式
train_dataloader = DataLoader(tokenized_datasets["train"].select(range(100000)),
                              batch_size=batch_size, shuffle=True) # 为演示方便,用了更少的数据
eval_dataloader = DataLoader(tokenized_datasets["valid"].select(range(10000)),
                             batch_size=batch_size) # 为演示方便,用了更少的数据
##************** 定义加权损失函数 ***************
keytoken_ids = []
for keyword in [ "plt", "pd", "sk", "fit", "predict",
" plt", " pd", " sk", " fit", " predict",
]:
ids = tokenizer([keyword]).input_ids[0]
if len(ids) == 1: # 仅考虑单个 token 的关键字
keytoken_ids.append(ids[0])
else:
print(f"Keyword has not single token: {keyword}")
def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
shift_labels = inputs[..., 1:].contiguous() # input 序列第 i+1 位就是第 i 个标签
shift_logits = logits[..., :-1, :].contiguous() # 最后一个位置不需要预测,因为没有 label
loss_fct = CrossEntropyLoss(reduce=False) # 用于计算 per-token 的损失
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1) # 该样本的平均损失
weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum( # 每个样本出现的所有关键字的数量
axis=[0, 2]
)
weights = alpha * (1.0 + weights)
weighted_loss = (loss_per_sample * weights).mean() # 计算 batch 的加权平均损失
return weighted_loss
##************** 定义评估函数 ***************
def evaluate():
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(batch["input_ids"], labels=batch["input_ids"])
losses.append(accelerator.gather(outputs.loss)) # 跨进程收集每个样本的 loss
loss = torch.mean(torch.cat(losses))
try:
perplexity = torch.exp(loss) # 计算困惑度
except OverflowError:
perplexity = float("inf")
return loss.item(), perplexity.item()
##****************** 定义 weight-decay ****************
weight_decay = 0.1
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
params_with_wd, params_without_wd = [], []
for name, param in model.named_parameters():
if any(nd in name for nd in no_decay): # 判断 bias 字符串是否出现在 parameter.name 中,因为 parameter.name 可能为 attention1.bias1
params_without_wd.append(param)
else:
params_with_wd.append(param)
return [
{"params": params_with_wd, "weight_decay": weight_decay},
{"params": params_without_wd, "weight_decay": 0.0},
]
##***************** 配置模型及其训练组件 *********************
config = AutoConfig.from_pretrained(
"gpt2",
vocab_size=len(tokenizer),
n_ctx=context_length,
bos_token_id=tokenizer.bos_token_id, # 句子开始的 token
eos_token_id=tokenizer.eos_token_id, # 句子结束的 token
)
model = GPT2LMHeadModel(config)
optimizer = AdamW(get_grouped_params(model), lr=5e-4) # 使用 weight-decay
accelerator = Accelerator(mixed_precision='fp16')
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader) # 必须在 accelerator.prepare() 之后执行,因为 accelerator.prepare 可能会改变 dataloader
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
name="linear",
optimizer=optimizer,
num_warmup_steps=1_000,
num_training_steps=num_training_steps,
)
##***************** 创建 Repository *********************
model_name = "codeparrot-ds-accelerate"
# repo_name = get_full_repo_name(model_name) #用于推送到 HuggingFace Hub
output_dir = "codeparrot-ds-accelerate"
# repo = Repository(output_dir, clone_from=repo_name) #用于推送到 HuggingFace Hub
##***************** Training Loop *********************
evaluate() # 先评估下结果,看看未训练的模型的效果
gradient_accumulation_steps = 8 # 每隔 8 个 step 来累积一次梯度 (可以通过 accumulator 的 gradient_accumulation_steps 选项来优化这里的梯度累积代码)
eval_steps = 5_000 # 每隔 eval_steps * gradient_accumulation_steps 步时评估一次
model.train()
completed_steps = 0 # 存放梯度更新的次数
for epoch in range(num_train_epochs):
for step, batch in tqdm(
enumerate(train_dataloader, start=1), total=len(train_dataloader)
):
logits = model(batch["input_ids"]).logits
loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids) # 使用自定义的损失函数
if step % 100 == 0:
accelerator.print(
{
"lr": lr_scheduler.get_last_lr(),
"samples": step * batch_size,
"steps": completed_steps, # 梯度更新的次数
"loss/train": loss.item(),
}
)
loss = loss / gradient_accumulation_steps # 缩放损失从而对梯度取平均
accelerator.backward(loss)
if step % gradient_accumulation_steps == 0: # 执行梯度更新
accelerator.clip_grad_norm_(model.parameters(), 1.0) # 梯度范数裁剪
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
completed_steps += 1
if (step % (eval_steps * gradient_accumulation_steps)) == 0:
eval_loss, perplexity = evaluate()
accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
model.train() # 评估完成之后,需要设置为 training 模式
accelerator.wait_for_everyone()
evaluate() # 再次评估下结果,看看训练好的模型的效果
# (4.45915412902832, 86.41439056396484)
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir)
# repo.push_to_hub(
# commit_message=f"Training in progress step {step}", blocking=False
# )
文本摘要将长的文章压缩为摘要,这需要理解文章内容并生成捕获了文档主题的连贯的文本。
加载数据集:我们使用 “多语言亚马逊评论语料库” 来创建一个双语的 summarizer
。该语料库由六种语言的亚马逊商品评论组成,通常用于对多语言分类器进行基准测试。然而,由于每条评论都附有一个简短的标题,我们可以使用标题作为我们模型学习的 target
摘要 。
首先下载数据集,这里下载英语和西班牙语的子集:
xxxxxxxxxx
from datasets import load_dataset
spanish_dataset = load_dataset("amazon_reviews_multi", "es")
english_dataset = load_dataset("amazon_reviews_multi", "en")
print(english_dataset)
# DatasetDict({
# train: Dataset({
# features: ['review_id', 'product_id', 'reviewer_id',
# 'stars', 'review_body', 'review_title', 'language', 'product_category'],
# num_rows: 200000
# })
# validation: Dataset({
# features: ['review_id', 'product_id', 'reviewer_id',
# 'stars', 'review_body', 'review_title', 'language', 'product_category'],
# num_rows: 5000
# })
# test: Dataset({
# features: ['review_id', 'product_id', 'reviewer_id',
# 'stars', 'review_body', 'review_title', 'language', 'product_category'],
# num_rows: 5000
# })
# })
可以看到,对每种语言,train split
有 200k
条评论、validation split
有 5k
条评论、test split
有 5k
条评论。我们感兴趣的评论信息在 review_body
和 review_title
字段。
我们可以创建一个简单的函数来查看一些样本:
xxxxxxxxxx
def show_samples(dataset, num_samples=3, seed=42):
sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
for example in sample:
print(f"\n'>> Title: {example['review_title']}'")
print(f"'>> Review: {example['review_body']}'")
show_samples(english_dataset)
# '>> Title: Worked in front position, not rear'
# '>> Review: 3 stars because these are not rear brakes as stated in the item description. At least the mount adapter only worked on the front fork of the bike that I got it for.'
# ....
然后,我们进行样本过滤。在单个 GPU
上训练所有 400k
条评论(两种语言,每种语言的训练集包含 200k
条评论)的摘要模型将花费太长时间,这里我们选择书籍(包括电子书)类目的评论。
xxxxxxxxxx
def filter_books(example):
return (
example["product_category"] == "book"
or example["product_category"] == "digital_ebook_purchase"
)
spanish_books = spanish_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
show_samples(english_books)
# '>> Title: I'm dissapointed.'
# '>> Review: I guess I had higher expectations for this book from the reviews. I really thought I'd at least like it. The plot idea was great. I loved Ash but, it just didnt go anywhere. Most of the book was about their radio show and talking to callers. I wanted the author to dig deeper so we could really get to know the characters. All we know about Grace is that she is attractive looking, Latino and is kind of a brat. I'm dissapointed.'
# ....
然后,我们需要将英语评论和西班牙语评论合并为一个 DatasetDict
对象:
xxxxxxxxxx
from datasets import concatenate_datasets, DatasetDict
books_dataset = DatasetDict()
for split in english_books.keys():
books_dataset[split] = concatenate_datasets( [english_books[split], spanish_books[split]])
books_dataset[split] = books_dataset[split].shuffle(seed=42)
show_samples(books_dataset)
# '>> Title: Easy to follow!!!!'
# '>> Review: I loved The dash diet weight loss Solution. Never hungry. I would recommend this diet. Also the menus are well rounded. Try it. Has lots of the information need thanks.'
# ....
现在,train/validation/test split
都是英语和西班牙语的混合评论。
现在,我们过滤掉太短的标题。如果reference
摘要(在这里就是标题)太短,则使得模型偏向于仅生成包含一两个单词的摘要。
xxxxxxxxxx
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)
预处理数据:现在需要对评论及其标题进行 tokenization
和编码。
首先加载 tokenizer
。这里我们使用 mt5-base
模型。
xxxxxxxxxx
from transformers import AutoTokenizer
model_checkpoint = "/mnt/disk_b/ModelZoo/mt5-base" # 提前下载到本地
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
inputs = tokenizer("I really enjoy reading!")
print(inputs)
# {'input_ids': [336, 259, 4940, 9070, 11807, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
print(tokenizer.convert_ids_to_tokens(inputs.input_ids))
# ['▁I', '▁', 'really', '▁enjoy', '▁reading', '!', '</s>']
特殊的 Unicode
字符 ▁
和序列结束符 </s>
表明我们正在处理 SentencePiece tokenizer
。 SentencePiece tokenizer
基于 Unigram
tokenization
算法,该算法对多语言语料库特别有用,因为它允许 SentencePiece
在不知道重音、标点符号以及没有空格分隔字符(例如中文)的情况下对文本进行 tokenization
。
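作为一个简单的演示(sketch,具体的切分结果取决于 mT5 的词表,这里不做假设),可以直接对没有空格分隔的中文文本调用同一个 tokenizer:
xxxxxxxxxx
zh_inputs = tokenizer("我喜欢读书")
print(tokenizer.convert_ids_to_tokens(zh_inputs.input_ids)) # SentencePiece 无需预先分词即可切分,末尾同样带有 </s>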
为了对文本进行 tokenization
,我们必须处理与摘要相关的细节:因为 label
也是文本,它也可能超过模型的最大上下文大小。这意味着我们需要同时对评论和标题进行截断,确保不会将太长的输入传递给模型。Transformers
中的 tokenizer
提供了一个 as_target_tokenizer()
函数,从而允许你相对于 input
并行地对 label
进行 tokenize
。
xxxxxxxxxx
max_input_length = 512 # 评论的长度的上限
max_target_length = 30 # 标题的长度的上限
def preprocess_function(examples):
model_inputs = tokenizer(
examples["review_body"], max_length=max_input_length, truncation=True
)
# Set up the tokenizer for targets
with tokenizer.as_target_tokenizer():
labels = tokenizer(
examples["review_title"], max_length=max_target_length, truncation=True
)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)
评估指标:衡量文本生成任务(如文本摘要、翻译)的性能并不那么简单。最常用的指标之一是 Recall-Oriented Understudy for Gisting Evaluation: ROUGE
得分。该指标背后的基本思想是:将生成的摘要与一组参考摘要(通常由人类创建)进行比较。具体而言,假设我们比较如下的两个摘要:
xxxxxxxxxx
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"
比较它们的一种方法是计算重叠单词的数量,在这种情况下为 6
。但是,这有点粗糙,因此 ROUGE
是基于重叠的单词来计算 precision
和 recall
。
recall
:衡量生成的摘要召回了参考摘要(reference summary
)中的多少内容。如果只是比较单词,那么 recall = 重叠单词数 / 参考摘要的单词总数 = 6/6 = 1.0。
precision
:衡量生成的摘要中有多少内容是和参考摘要有关。如果只是比较单词,那么 precision = 重叠单词数 / 生成摘要的单词总数 = 6/7 ≈ 0.857。
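下面是一个最小的 sketch,仅基于单词重叠来计算上述 recall 和 precision(只用于说明思想,并不是 ROUGE 的完整实现):
xxxxxxxxxx
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"
gen_words = generated_summary.split()
ref_words = reference_summary.split()
overlap = [w for w in gen_words if w in ref_words] # 重叠的单词
recall = len(overlap) / len(ref_words)      # 6 / 6 = 1.0
precision = len(overlap) / len(gen_words)   # 6 / 7 ≈ 0.857
print(recall, precision)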
在实践中,我们通常计算 precision
和 recall
,然后报告 F1-score
。我们可以安装 rouge_score package
,并在 datasets
中调用该指标。
xxxxxxxxxx
# pip install rouge_score (首先安装)
from datasets import load_metric
rouge_score = load_metric("rouge")
scores = rouge_score.compute(
predictions=[generated_summary], references=[reference_summary]
)
print(scores)
# {
# 'rouge1': AggregateScore(
# low=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923),
# mid=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923),
# high=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923)),
# 'rouge2': AggregateScore(
# low=Score(precision=0.6666666666666666, recall=0.8, fmeasure=0.7272727272727272),
# mid=Score(precision=0.6666666666666666, recall=0.8, fmeasure=0.7272727272727272),
# high=Score(precision=0.6666666666666666, recall=0.8, fmeasure=0.7272727272727272)),
# 'rougeL': AggregateScore(
# low=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923),
# mid=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923),
# high=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923)),
# 'rougeLsum': AggregateScore(
# low=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923),
# mid=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923),
# high=Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923))
# }
rouge_score.compute()
会一次性计算所有指标。输出的含义如下:
首先,rouge_score
计算了 precision, recall, F1-score
的置信区间,即 low/mid/high
属性。
其次,rouge_score
在比较生成的摘要和参考摘要时,会考虑不同的粒度:rouge1
是 unigram
粒度、rouge2
是 bigram
粒度。rougeL
和 rougeLsum
通过在生成的摘要和参考摘要之间查找最长公共子串,从而得到重叠的单词序列。其中,rougeLsum
表示指标是在整个摘要上计算的,而 rougeL
为单个句子的指标的均值。因为上述例子只有一个句子,因此 rougeLsum
和 rougeL
的输出结果相同。
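如果只关心其中某个具体数值,可以直接从 AggregateScore 中取出,例如(一个小示例,沿用上面的 scores 变量):
xxxxxxxxxx
print(scores["rouge1"].mid.fmeasure)  # 0.923...
print(scores["rougeL"].mid.precision) # 0.857...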
强大的 baseline
:文本摘要的一个常见baseline
是简单地取一篇文章的前三个句子,通常称之为 lead-3 baseline
。我们可以使用句号(英文使用 "."
)来断句,但这在 "U.S."
或者 "U.N."
之类的首字母缩略词上会失败。所以我们将使用 nltk
库,它包含一个更好的算法来处理这些情况。
xxxxxxxxxx
# pip install nltk 首先安装 nltk
import nltk
nltk.download("punkt") # 下载标点符号规则
from nltk.tokenize import sent_tokenize # 导入 sentence tokenizer
def three_sentence_summary(text):
return "\n".join(sent_tokenize(text)[:3]) # 提取前三个句子
print(three_sentence_summary(books_dataset["train"][1]["review_body"]))
# I grew up reading Koontz, and years ago, I stopped,convinced i had "outgrown" him.
# Still,when a friend was looking for something suspenseful too read, I suggested Koontz.
# She found Strangers.
由于文本摘要任务的约定是用换行符来分隔每个摘要,因此我们这里用 "\n"
来拼接前三个句子。
然后我们实现一个函数,该函数从数据集中提取 lead-3
摘要并计算 baseline
的 ROUGE
得分:
xxxxxxxxxx
def evaluate_baseline(dataset, metric):
summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
return metric.compute(predictions=summaries, references=dataset["review_title"])
然后我们可以使用这个函数来计算验证集的 ROUGE
分数:
xxxxxxxxxx
score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(score[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)
print(rouge_dict)
# {'rouge1': 16.77, 'rouge2': 8.87, 'rougeL': 15.55, 'rougeLsum': 15.92}
我们可以看到 rouge2
分数明显低于其他的 rouge
分数。 这可能反映了这样一个事实,即评论标题通常很简洁,因此 lead-3 baseline
过于冗长。
我们首先加载预训练模型。由于文本摘要是一个 seq-to-seq
的任务,我们可以使用 AutoModelForSeq2SeqLM
类加载模型:
xxxxxxxxxx
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto') # mt5-base 占用 2.5G 显存
对于 seq-to-seq
任务,AutoModelForSeq2SeqLM
模型保留了所有的网络权重。相反,文本分类任务重,预训练模型的 head
被随机初始化的网络所替代。
然后我们定义超参数和其它参数。我们使用专用的 Seq2SeqTrainingArguments
和 Seq2SeqTrainer
类。
xxxxxxxxxx
from transformers import Seq2SeqTrainingArguments
batch_size = 8
num_train_epochs = 2 # 为演示方便,暂时仅用 2 个 epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size # 每个 epoch 记录一次日志
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
output_dir=f"{model_name}-finetuned-amazon-en-es",
evaluation_strategy="epoch",
learning_rate=5.6e-5,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.01,
save_total_limit=1, # 最多保存 1 个 checkpoint,因为每个 checkpoint 太大了 (2.5GB)
num_train_epochs=num_train_epochs,
predict_with_generate=True, # 在评估期间生成摘要从而计算 ROUGE 得分
logging_steps=logging_steps,
push_to_hub=False, # 是否允许我们将训练好的模型推送到 Hub
)
predict_with_generate=True
会告诉 Seq2SeqTrainer
在评估时调用模型的 generate()
方法来生成摘要。
然后我们为 Trainer
提供一个 compute_metrics()
函数,以便在训练期间评估模型。这里稍微有点复杂,因为我们需要在计算 ROUGE
分数之前将 output
和 label
解码为文本,从而提供给 rouge_score.compute()
来使用。此外,还需要利用 nltk
中的 sent_tokenize()
函数来用换行符分隔摘要的句子:
xxxxxxxxxx
import numpy as np
def compute_metrics(eval_pred):
predictions, labels = eval_pred
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) # 对 prediction 解码
labels = np.where(labels != -100, labels, tokenizer.pad_token_id) # 用 pad_token_id 替换 label = -100
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # 对 label 进行解码
decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds] # 对每个样本进行断句
decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels] # 对每个样本进行断句
result = rouge_score.compute( # 计算 ROUGE 分
predictions=decoded_preds, references=decoded_labels, use_stemmer=True
)
result = {key: value.mid.fmeasure * 100 for key, value in result.items()} # 仅获取 mid score
return {k: round(v, 4) for k, v in result.items()}
接下来我们需要为 seq-to-seq
任务定义一个 data collator
。在解码过程中,对于 mT5
,我们需要将 label
右移一位从而作为 decoder
的输入。Transformers
提供了一个 DataCollatorForSeq2Seq
,它为我们动态地填充 input
和 label
。要实例化这个collator
,我们只需要提供 tokenizer
和 model
:
xxxxxxxxxx
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
我们看看这个 collator
在输入一个 mini batch
样本时会产生什么。
首先移除所有的类型为字符串的列,因为 collator
不知道如何处理这些列:
xxxxxxxxxx
tokenized_datasets = tokenized_datasets.remove_columns(
books_dataset["train"].column_names
)
# tokenized_datasets 包含的列: review_id, product_id, reviewer_id, stars, review_body, review_title, language, product_category, input_ids, attention_mask, labels
# books_dataset["train"] 包含的列: review_id, product_id, reviewer_id, stars, review_body, review_title, language, product_category
由于 collator
需要一个 dict
的列表,其中每个 dict
代表数据集中的一个样本,我们还需要在将数据传递给 data collator
之前将数据整理成预期的格式:
xxxxxxxxxx
features = [tokenized_datasets["train"][i] for i in range(2)]
print(data_collator(features))
# {'input_ids': tensor([[...],
# [...]]),
# 'attention_mask': tensor([[1...],
# [...]]),
# 'labels': tensor([[ 298, 259, 5994, 269, 774, 5547, 1],
# [ 298, 10380, 304, 13992, 291, 1, -100]]),
# 'decoder_input_ids': tensor([[ 0, 298, 259, 5994, 269, 774, 5547],
# [ 0, 298, 10380, 304, 13992, 291, 1]])}
如果某一个样本比另一个样本要短,那么它的 input_ids/attention_mask
右侧将被填充 [PAD] token
(token ID
为 0
)。 类似地,我们可以看到 labels
已用 -100
填充,以确保 pad token
被损失函数忽略。最后,我们可以看到一个新的 decoder_input_ids
,它通过在开头插入 [PAD] token
将标签向右移动来形成。
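下面是一个小的验证 sketch(假设沿用上面的 features、data_collator、model 与 tokenizer):手动把 labels 右移一位、在开头插入 decoder_start_token_id、把 -100 替换为 pad token,然后与 collator 给出的 decoder_input_ids 比较。
xxxxxxxxxx
import torch
batch = data_collator(features)
labels = batch["labels"]
shifted = labels.new_zeros(labels.shape)             # 与 labels 同形状、同 dtype
shifted[:, 1:] = labels[:, :-1].clone()              # 整体右移一位
shifted[:, 0] = model.config.decoder_start_token_id  # 开头插入 decoder 的起始 token(对 mT5 就是 pad)
shifted[shifted == -100] = tokenizer.pad_token_id    # -100 只用于损失计算,不能作为 decoder 输入
print(torch.equal(shifted, batch["decoder_input_ids"])) # 预期为 True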
现在开始实例化 Trainer
并进行训练了:
xxxxxxxxxx
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
trainer.train() # 训练
trainer.evaluate() # 评估
# {'eval_loss': nan,
# 'eval_rouge1': 4.0461,
# 'eval_rouge2': 0.7318,
# 'eval_rougeL': 3.9266,
# 'eval_rougeLsum': 3.9468,
# 'eval_runtime': 6.6003,
# 'eval_samples_per_second': 36.059,
# 'eval_steps_per_second': 4.545,
# 'epoch': 2.0}
# trainer.push_to_hub(commit_message="Training complete", tags="summarization") # 推送到 huggingface
使用微调的模型:
xxxxxxxxxx
from transformers import pipeline
summarizer = pipeline("summarization", model = "./mt5-base-finetuned-amazon-en-es/checkpoint-2000")
我们可以将测试集中的一些样本(模型还没有看到)馈入 pipeline
,从而了解生成的摘要的质量。
我们实现一个简单的函数来一起显示评论、标题、以及生成的摘要:
xxxxxxxxxx
def print_summary(idx):
review = books_dataset["test"][idx]["review_body"]
title = books_dataset["test"][idx]["review_title"]
summary = summarizer(books_dataset["test"][idx]["review_body"])[0]["summary_text"]
print(f"'>>> Review: {review}'")
print(f"\n'>>> Title: {title}'")
print(f"\n'>>> Summary: {summary}'")
print_summary(100)
使用 Accelerate
来微调 mT5
的过程,与微调文本分类模型非常相似。区别在于这里需要在训练期间显式生成摘要,并定义如何计算ROUGE
分数。
创建 dataloader
:我们需要做的第一件事是为每个数据集的每个 split
创建一个DataLoader
。 由于 PyTorch dataloader
需要batch
的张量,我们需要在数据集中将格式设置为torch
:
xxxxxxxxxx
from datasets import load_dataset
from datasets import concatenate_datasets, DatasetDict
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
##************* 创建 dataloader *****************
##***** 加载数据
spanish_dataset = load_dataset("amazon_reviews_multi", "es")
english_dataset = load_dataset("amazon_reviews_multi", "en")
def filter_books(example):
return (
example["product_category"] == "book"
or example["product_category"] == "digital_ebook_purchase"
)
spanish_books = spanish_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
##****** 合并这两种语言的数据集
books_dataset = DatasetDict()
for split in english_books.keys():
books_dataset[split] = concatenate_datasets( [english_books[split], spanish_books[split]])
books_dataset[split] = books_dataset[split].shuffle(seed=42)
##****** 短标题过滤
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)
##****** tokenization
model_checkpoint = "/mnt/disk_b/ModelZoo/mt5-base" # 提前下载到本地
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
max_input_length = 512 # 评论的长度的上限
max_target_length = 30 # 标题的长度的上限
def preprocess_function(examples):
model_inputs = tokenizer(
examples["review_body"], max_length=max_input_length, truncation=True
)
# Set up the tokenizer for targets
with tokenizer.as_target_tokenizer():
labels = tokenizer(
examples["review_title"], max_length=max_target_length, truncation=True
)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
books_dataset["train"].column_names
)
##****** 创建 model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
##****** data_collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
##********* 创建 dataloader
tokenized_datasets.set_format("torch")
batch_size = 8
train_dataloader = DataLoader(
tokenized_datasets["train"],
shuffle=True,
collate_fn=data_collator,
batch_size=batch_size,
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
)
创建训练组件:
xxxxxxxxxx
##******************** 创建训练组件 *************
##****** 优化器
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)
##***** 创建 accelerator
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
##***** 创建学习率调度器
from transformers import get_scheduler
num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader) # 必须在 accelerator.prepare() 之后调用
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
##***** 创建 rouge_score
from datasets import load_metric
rouge_score = load_metric("rouge")
后处理:
将生成的摘要进行断句(拆分为 "\n"
换行的句子),这是 ROUGE
需要的格式。
xxxxxxxxxx
import nltk
def postprocess_text(preds, labels):
preds = [pred.strip() for pred in preds]
labels = [label.strip() for label in labels]
# ROUGE expects a newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
return preds, labels
在 Hugging Face Hub
创建一个 repository
来存储模型。如果不需要上传,则可以跳过这一步。
xxxxxxxxxx
# from huggingface_hub import get_full_repo_name
# model_name = "test-bert-finetuned-squad-accelerate"
# repo_name = get_full_repo_name(model_name)
# from huggingface_hub import Repository
output_dir = "results-mt5-finetuned-squad-accelerate"
# repo = Repository(output_dir, clone_from=repo_name)
开始训练:(4090
显卡,模型大小 2.5G
,训练期间占用内存 22.7G
)
xxxxxxxxxx
from tqdm.auto import tqdm
import torch
import numpy as np
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
## Training 阶段
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
# Evaluation 阶段
model.eval()
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
generated_tokens = accelerator.unwrap_model(model).generate(
batch["input_ids"],
attention_mask=batch["attention_mask"],
)
## 填充所生成的文本
## 在多进程场景下,可能两个进程将 predictions/labels 在进程内部对齐,但是在进程间不一致,这里需要跨进程对齐
generated_tokens = accelerator.pad_across_processes(
generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
)
labels = batch["labels"]
        # 如果预处理阶段没有填充到最大长度,那么这里需要对 label 也进行填充
labels = accelerator.pad_across_processes(
batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
)
generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
labels = accelerator.gather(labels).cpu().numpy()
# Replace -100 in the labels as we can't decode them
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]
decoded_preds = tokenizer.batch_decode(
generated_tokens, skip_special_tokens=True
)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds, decoded_labels = postprocess_text(
decoded_preds, decoded_labels
)
rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)
# 计算指标
result = rouge_score.compute()
# 计算 median ROUGE score
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
result = {k: round(v, 4) for k, v in result.items()}
print(f"Epoch {epoch}:", result)
# 在每个 epoch 结束时保存模型
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir)
# repo.push_to_hub(
# commit_message=f"Training in progress epoch {epoch}", blocking=False
# )
# Epoch 0: {'rouge1': 5.5492, 'rouge2': 0.6587, 'rougeL': 5.5844, 'rougeLsum': 5.5422}
# Epoch 1: {'rouge1': 8.154, 'rouge2': 2.5786, 'rougeL': 8.0205, 'rougeLsum': 7.9891}
# Epoch 2: {'rouge1': 13.8772, 'rouge2': 5.9258, 'rougeL': 13.86, 'rougeLsum': 13.858}
# Epoch 3: {'rouge1': 14.3815, 'rouge2': 6.0753, 'rougeL': 14.1405, 'rougeLsum': 14.2002}
# Epoch 4: {'rouge1': 12.9502, 'rouge2': 5.3787, 'rougeL': 12.8429, 'rougeLsum': 12.8553}
# Epoch 5: {'rouge1': 13.613, 'rouge2': 6.2498, 'rougeL': 13.3715, 'rougeLsum': 13.3895}
# Epoch 6: {'rouge1': 13.3266, 'rouge2': 6.0245, 'rougeL': 13.0357, 'rougeLsum': 13.0793}
# Epoch 7: {'rouge1': 13.8225, 'rouge2': 6.4, 'rougeL': 13.5457, 'rougeLsum': 13.6644}
# Epoch 8: {'rouge1': 13.9203, 'rouge2': 6.5123, 'rougeL': 13.6504, 'rougeLsum': 13.6976}
# Epoch 9: {'rouge1': 14.374, 'rouge2': 6.9012, 'rougeL': 14.1307, 'rougeLsum': 14.2309}
使用:
xxxxxxxxxx
from transformers import pipeline
summarizer = pipeline("summarization", model = "./mt5-base-finetuned-amazon-en-es/checkpoint-2000")
def print_summary(idx):
review = books_dataset["test"][idx]["review_body"]
title = books_dataset["test"][idx]["review_title"]
summary = summarizer(books_dataset["test"][idx]["review_body"])[0]["summary_text"]
print(f"'>>> Review: {review}'")
print(f"\n'>>> Title: {title}'")
print(f"\n'>>> Summary: {summary}'")
print_summary(-100)
# '>>> Review: The story was all over the place. I felt no connection to Evelyn/Eva/Evie, and the ending was anticlimactic. Thank goodness it was a free book.'
# '>>> Title: Neither gripping or emotional.'
# '>>> Summary: Good book.'
翻译是另一个 seq-to-seq
任务,它非常类似于文本摘要任务。你可以将我们将在此处学习到的一些内容迁移到其他的 seq-to-seq
问题。
加载数据集:我们将使用 KDE4
数据集,该数据集是 KDE
应用程序本地化文件的数据集。该数据集有 92
种语言可用,这里我们选择英语和法语。
xxxxxxxxxx
from datasets import load_dataset, load_metric
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")
print(raw_datasets)
# DatasetDict({
# train: Dataset({
# features: ['id', 'translation'],
# num_rows: 210173
# })
# })
我们有 210,173
对句子,但是我们需要创建自己的验证集:
xxxxxxxxxx
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test") # 将 test 重命名为 validation
print(split_datasets)
# DatasetDict({
# train: Dataset({
# features: ['id', 'translation'],
# num_rows: 189155
# })
# validation: Dataset({
# features: ['id', 'translation'],
# num_rows: 21018
# })
# })
我们可以查看数据集的一个元素:
xxxxxxxxxx
print(split_datasets["train"][1]["translation"])
# {'en': 'Default to expanded threads', 'fr': 'Par défaut, développer les fils de discussion'}
我们使用的预训练模型已经在一个更大的法语和英语句子语料库上进行了预训练。我们看看这个预训练模型的效果:
xxxxxxxxxx
from transformers import pipeline
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr" # 大约 300MB
translator = pipeline("translation", model=model_checkpoint)
print(translator("Default to expanded threads"))
# [{'translation_text': 'Par défaut pour les threads élargis'}]
数据预处理:所有文本都需要转换为 token ID
。
xxxxxxxxxx
from transformers import AutoTokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
但是,对于 target
,需要将 tokenizer
包装在上下文管理器 "as_target_tokenizer()"
中,因为不同的语言需要不同的 tokenization
:
xxxxxxxxxx
max_input_length = 128
max_target_length = 128 # 这里设置了 label 和 input 的最大长度相同(也可以不同)
def preprocess_function(examples):
inputs = [ex["en"] for ex in examples["translation"]]
targets = [ex["fr"] for ex in examples["translation"]]
model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
with tokenizer.as_target_tokenizer():
labels = tokenizer(targets, max_length=max_target_length, truncation=True)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
remove_columns=split_datasets["train"].column_names,
)
这里我们使用 Seq2SeqTrainer
,它是 Trainer
的子类,它可以正确处理这种 seq-to-seq
的评估,并使用 generate()
方法来预测输出。
首先我们加载一个 AutoModelForSeq2SeqLM
模型:
xxxxxxxxxx
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
然后我们使用 DataCollatorForSeq2Seq
来创建一个 data_collator
,它不仅预处理输入,也同时预处理 label
。
xxxxxxxxxx
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
现在我们来测试这个 data_collator
:
xxxxxxxxxx
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
print(batch)
# {
# 'input_ids':
# tensor([[47591,12,9842,19634,9, 0,59513,59513,59513,59513,59513, 59513, 59513, 59513, 59513],
# [ 1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 28149, 139, 33712, 25218, 0]]),
# 'attention_mask':
# tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
# 'labels':
# tensor([[577, 5891, 2, 3184, 6,2542, 5,1710, 0, -100, -100, -100, -100, -100, -100, -100],
# [1211,3,49,9409,1211, 3, 29140,817, 3124,817, 550,7032,5821,7907, 12649, 0]]),
# 'decoder_input_ids':
# tensor([[59513,577,5891,2,3184,16,2542,5,1710, 0,59513, 59513, 59513, 59513, 59513, 59513],
# [59513,1211,3,49,9409,1211,3,29140,817,3124,817,550,7032,5821,7907,12649]])}
可以看到,label
已经用 -100
填充到 batch
的最大长度。
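一个小的 sketch(沿用上面的 batch 和 tokenizer):把 labels 中的 -100 先替换为 pad_token_id 再解码,就能看到 label 对应的法语句子(-100 本身无法被解码):
xxxxxxxxxx
labels = batch["labels"].clone()
labels[labels == -100] = tokenizer.pad_token_id
print(tokenizer.batch_decode(labels, skip_special_tokens=True))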
评估指标:相比较于父类 Trainer
,Seq2SeqTrainer
的一个额外功能是,在评估或预测期间调用 generate()
方法从而进行评估。
用于翻译任务的传统指标是 BLEU
分数,它评估翻译与 label
的接近程度。BLEU
分数不衡量翻译结果的语法正确性或可读性,而是使用统计规则来确保生成输出中的所有单词也出现在 label
中。BLEU
的一个缺点是,它要求文本已经被 tokenization
,这使得比较使用不同 tokenizer
的模型之间的 BLEU
分数变得困难。因此,目前用于翻译任务的常用指标是 SacreBLEU
,它通过标准化 tokenization
步骤解决了这个缺点(以及其他的一些缺点)。
xxxxxxxxxx
# pip install sacrebleu # 首先安装
from datasets import load_metric
metric = load_metric("sacrebleu")
该指标接受多个 acceptable labels
,因为同一个句子通常有多个可接受的翻译。在我们的这个例子中,每个句子只有一个 label
。因此,预测结果是关于句子的一个列表,而 reference
也是关于句子的一个列表。
xxxxxxxxxx
predictions = [ "This plugin lets you translate web pages between several languages automatically."]
references = [
[
"This plugin allows you to automatically translate web pages between several languages."
]
]
print(metric.compute(predictions=predictions, references=references))
# {
# 'score': 46.750469682990165,
# 'counts': [11, 6, 4, 3],
# 'totals': [12, 11, 10, 9],
# 'precisions': [91.66666666666667, 54.54545454545455, 40.0, 33.333333333333336],
# 'bp': 0.9200444146293233,
# 'sys_len': 12,
# 'ref_len': 13
# }
这得到了 46.75
的 BLEU
得分,看起来相当不错。如果我们尝试使用翻译模型中经常出现的两种糟糕的预测类型(大量重复、或者太短),我们将得到相当糟糕的 BLEU
分数:
xxxxxxxxxx
predictions = ["This This This This"] # 大量重复
predictions = ["This plugin"] # 翻译太短
为了从模型输出转换为文本从而计算 BLEU
得分,我们需要使用 tokenizer.batch_decode()
方法来解码。注意,我们需要清理 label
中的所有 -100
。
xxxxxxxxxx
import numpy as np
def compute_metrics(eval_preds):
preds, labels = eval_preds
# In case the model returns more than the prediction logits
if isinstance(preds, tuple):
preds = preds[0]
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) # 解码 output
labels = np.where(labels != -100, labels, tokenizer.pad_token_id) # 替换 -100 为 pad_token_id
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # 解码 label
decoded_preds = [pred.strip() for pred in decoded_preds] # 格式清理
decoded_labels = [[label.strip()] for label in decoded_labels] # 格式清理
result = metric.compute(predictions=decoded_preds, references=decoded_labels)
return {"bleu": result["score"]}
执行微调:
首先我们创建 Seq2SeqTrainingArguments
的实例,其中 Seq2SeqTrainingArguments
是 TrainingArguments
的子类。
xxxxxxxxxx
from transformers import Seq2SeqTrainingArguments
args = Seq2SeqTrainingArguments(
f"marian-finetuned-kde4-en-to-fr",
evaluation_strategy="no", # 训练过程中不评估验证集(因为我们会在训练之前和训练结束后分别手动评估验证集)
save_strategy="epoch", # 每个 epoch 保存一次
learning_rate=2e-5,
per_device_train_batch_size=32,
per_device_eval_batch_size=64,
weight_decay=0.01,
save_total_limit=3,
num_train_epochs=3,
predict_with_generate=True, # 通过 model.generate() 来执行预测
fp16=True, # FP16 混合精度训练
push_to_hub=False, # 不上传到 HuggingFace Hub
)
然后我们创建 Seq2SeqTrainer
:
xxxxxxxxxx
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
训练之前,我们首先评估模型在验证集上的 BLEU
分数:
xxxxxxxxxx
print(trainer.evaluate(max_length=max_target_length)) # 大约10分钟(AMD epyc 7543 CPU, 128G内存, 4090显卡)
# {
# 'eval_loss': 1.6964517831802368,
# 'eval_bleu': 25.572039589826357,
# 'eval_runtime': 698.3895,
# 'eval_samples_per_second': 30.095,
# 'eval_steps_per_second': 0.471
# }
然后开始训练(模型大小 300M
,训练显存消耗 15.7G
):
xxxxxxxxxx
trainer.train() # 每个 epoch 都会保存
# trainer.push_to_hub(tags="translation", commit_message="Training complete") # 可选:推送模型到 HuggingFace Hub
训练完成之后,我们再次评估模型:
xxxxxxxxxx
print(trainer.evaluate(max_length=max_target_length))
# {
# 'eval_loss': 0.8559923768043518,
# 'eval_bleu': 44.69236449595659,
# 'eval_runtime': 669.7527,
# 'eval_samples_per_second': 31.382,
# 'eval_steps_per_second': 0.491,
# 'epoch': 3.0
# }
可以看到 BLEU
分有 19.12
分的提高。
自定义训练过程如下:
xxxxxxxxxx
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
import numpy as np
from datasets import load_metric
##******************** 创建 DataLoader ********************
##***** 加载数据集
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test") # 将 test 重命名为 validation
##***** tokenization
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
max_input_length = 128
max_target_length = 128 # 这里设置了 label 和 input 的最大长度相同(也可以不同)
def preprocess_function(examples):
inputs = [ex["en"] for ex in examples["translation"]]
targets = [ex["fr"] for ex in examples["translation"]]
model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
with tokenizer.as_target_tokenizer():
labels = tokenizer(targets, max_length=max_target_length, truncation=True)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
remove_columns=split_datasets["train"].column_names,
)
tokenized_datasets.set_format("torch")
##******* 创建 data_collator
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
##****** 创建 dataloader
train_dataloader = DataLoader(
tokenized_datasets["train"],
shuffle=True,
collate_fn=data_collator,
batch_size=8,
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)
##*************** 创建训练组件 *********************
##******* 优化器
optimizer = AdamW(model.parameters(), lr=2e-5)
##****** accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
##***** 学习率调度器
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
##***** 创建 Repository (可选)
# from huggingface_hub import Repository, get_full_repo_name
# model_name = "marian-finetuned-kde4-en-to-fr-accelerate"
# repo_name = get_full_repo_name(model_name)
output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"
# repo = Repository(output_dir, clone_from=repo_name)
##******* 评估方法
metric = load_metric("sacrebleu")
def postprocess(predictions, labels):
predictions = predictions.cpu().numpy()
labels = labels.cpu().numpy()
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id) # 替换 label 中的 -100
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds = [pred.strip() for pred in decoded_preds]
decoded_labels = [[label.strip()] for label in decoded_labels]
return decoded_preds, decoded_labels
##*************** 训练 *********************
from tqdm.auto import tqdm
import torch
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
# Training
model.train()
for batch in train_dataloader:
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
# Evaluation
model.eval()
for batch in tqdm(eval_dataloader):
with torch.no_grad():
# 注意:我们需要调用底层模型的 generate() 方法,因此这里需要调用 accelerator.unwrap_model()
generated_tokens = accelerator.unwrap_model(model).generate(
batch["input_ids"],
attention_mask=batch["attention_mask"],
max_length=128, # 最大生成序列的长度为 128
)
labels = batch["labels"]
# 需要在 accelerator.gather() 调用之前,首先跨所有进程把 generated_tokens/labels 填充到相同的长度
generated_tokens = accelerator.pad_across_processes(
generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
)
labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
# 跨进程收集 generated_tokens/labels,收集的结果拼接到 batch 维
predictions_gathered = accelerator.gather(generated_tokens)
labels_gathered = accelerator.gather(labels)
decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
results = metric.compute()
print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")
# 每个 epoch 结束时保存模型
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir)
# repo.push_to_hub(
# commit_message=f"Training in progress epoch {epoch}", blocking=False
# )
# 训练输出:
# epoch 0, BLEU score: 50.54
# epoch 1, BLEU score: 53.21
# epoch 2, BLEU score: 53.83
调用微调好的模型:
xxxxxxxxxx
from transformers import pipeline
translator = pipeline("translation", model="./marian-finetuned-kde4-en-to-fr-accelerate")
print(translator("Default to expanded threads"))
# [{'translation_text': 'Par défaut, développer les fils de discussion'}]
这里将使用SQuAD
问答数据集微调一个 BERT
模型。
下载数据:
xxxxxxxxxx
from datasets import load_dataset
raw_datasets = load_dataset("squad")
print(raw_datasets)
# DatasetDict({
# train: Dataset({
# features: ['id', 'title', 'context', 'question', 'answers'],
# num_rows: 87599
# })
# validation: Dataset({
# features: ['id', 'title', 'context', 'question', 'answers'],
# num_rows: 10570
# })
# })
print("Context: ", raw_datasets["train"][0]["context"])
# Context: Architecturally, ...
print("Question: ", raw_datasets["train"][0]["question"])
# Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
print("Answer: ", raw_datasets["train"][0]["answers"])
# Answer: {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}
context
和 question
字段使用起来非常简单。但是 answers
字段有点棘手,因为它是一个字典并且两个字段都是列表。这是在评估过程中 squad metric
所期望的格式。answers
中的 text
给出了答案的文本,answers
中的 answer_start
字段给出了答案文本在 context
中的起始位置。
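一个简单的检查(sketch,沿用上面已加载的 raw_datasets):answer_start 给出的是字符级别的偏移,用它在 context 中切片应该恰好得到答案文本。
xxxxxxxxxx
sample = raw_datasets["train"][0]
start_char = sample["answers"]["answer_start"][0]
answer_text = sample["answers"]["text"][0]
print(sample["context"][start_char : start_char + len(answer_text)] == answer_text) # 预期为 True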
在训练期间,只有一种可能的答案。我们可以使用 Dataset.filter()
方法来确认这一点:
xxxxxxxxxx
print(raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1))
# Dataset({
# features: ['id', 'title', 'context', 'question', 'answers'],
# num_rows: 0
# })
在评估期间,每个问题都有几个可能的答案:
xxxxxxxxxx
print(raw_datasets["validation"][0]["answers"])
# {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}
print(raw_datasets["validation"][2]["answers"])
# {'text': ['Santa Clara, California', "Levi's Stadium", "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."], 'answer_start': [403, 355, 355]}
数据处理:困难的部分将是为答案生成 label
,即答案在上下文中对应的开始位置和结束位置。
首先,我们需要使用 tokenizer
将输入中的文本转换为模型可以理解的ID
:
xxxxxxxxxx
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(tokenizer.is_fast)
# True
你也可以使用其它模型,只要它实现了 fast tokenizer
即可。
现在我们检查 tokenization
:
xxxxxxxxxx
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]
inputs = tokenizer(question, context)
print(inputs)
# {
# 'input_ids': [101, 1706, 2292, ..., 102], # 每个 token 的 token id
# 'token_type_ids': [0, 0, 0, ..., 1], # 每个 token 属于 question 还是 context
# 'attention_mask': [1, 1, 1, ..., 1] # 每个 token 的 attention mask
# }
print(tokenizer.decode(inputs["input_ids"]))
# [CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally,.... [SEP]
这个 tokenizer
将会插入 special token
,从而形成如下格式的句子:
xxxxxxxxxx
[CLS] question [SEP] context [SEP]
然后,上下文可能太长,因此需要截断:
xxxxxxxxxx
inputs = tokenizer(
question,
context,
max_length=100, # 最大长度设置为 100
truncation="only_second", # 仅截断第二个输入,即上下文
    stride=60, # 相邻两个截断块之间重叠的 token 数为 60
return_overflowing_tokens=True, # 让 tokenizer 知道溢出的 token
)
for ids in inputs["input_ids"]:
print(tokenizer.decode(ids))
# [CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, ...[SEP]
# [CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] statue of the Virgin Mary. ... [SEP]
# [CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Christ with arms ... [SEP]
# [CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] the Main Building is ... [SEP]
# [CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP], a Marian place of ... [SEP]
可以看到,我们的样本被拆分为五个输入,每个输入都包含了整个问题、以及一部分的上下文。
然后我们需要在上下文中找到答案的开始位置和结束位置。我们需要得到字符位置到 token
位置之间的映射:
xxxxxxxxxx
inputs = tokenizer(
question,
context,
max_length=100,
truncation="only_second",
stride=50,
return_overflowing_tokens=True,
return_offsets_mapping=True, # 返回字符位置到 token 位置之间的映射
)
print(inputs.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])
print(inputs["overflow_to_sample_mapping"]) # 每个拆分后的新样本属于拆分前的第几个样本
# [0, 0, 0, 0]
print(inputs['offset_mapping']) # 每个拆分后的新样本的每个token,在原始样本中的位置,格式为(start_index, end_index)
# [
# [(0, 0), (0, 2), (3, 7), (8, 11), (12, 15), (16, 22), (23, 27), (28, 37), (38, 44), (45, 47), (48, 52), (53, 55), (56, 59), (59, 63), (64, 70), (70, 71), (0, 0), (0, 13), ..., (0, 0)],
# [(0, 0), (0, 2), (3, 7), (8, 11), (12, 15), (16, 22), (23, 27), (28, 37), (38, 44), (45, 47), (48, 52), (53, 55), (56, 59), (59, 63), (64, 70), (70, 71), (0, 0), (152, 155), ..., (0, 0)],
# [(0, 0), (0, 2), (3, 7), (8, 11), (12, 15), (16, 22), (23, 27), (28, 37), (38, 44), (45, 47), (48, 52), (53, 55), (56, 59), (59, 63), (64, 70), (70, 71), (0, 0), (271, 275), ..., (0, 0)],
# [(0, 0), (0, 2), (3, 7), (8, 11), (12, 15), (16, 22), (23, 27), (28, 37), (38, 44), (45, 47), (48, 52), (53, 55), (56, 59), (59, 63), (64, 70), (70, 71), (0, 0), (420, 421), ..., (0, 0)]
# ]
print(inputs.sequence_ids(0)) # 第 0 个新样本中每个 token 属于 question(0)还是 context(1),None 对应 special token
# [None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None]
print(inputs.token_type_ids) # 新样本中,每个 token 对应于 question 序列还是 context 序列
# [
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# ]
对于多个样本:
xxxxxxxxxx
inputs = tokenizer(
raw_datasets["train"][2:6]["question"],
raw_datasets["train"][2:6]["context"],
max_length=100,
truncation="only_second",
stride=50,
return_overflowing_tokens=True,
return_offsets_mapping=True,
)
print(f"The 4 examples create {len(inputs['input_ids'])} new records.")
# The 4 examples create 19 new records.
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")
# Here is where each comes from: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3].
这些信息将有助于我们将答案映射到对应的 label
,其中:
如果 label
是 (0,0)
,则表明答案不在上下文的相应范围内。
否则,如果 label
是 (start_position, end_position)
,则表明答案在上下文的相应范围内,其中 start_position
是答案开始的 token index
、end_position
是答案结束的 token index
。
xxxxxxxxxx
answers = raw_datasets["train"][2:6]["answers"]
start_positions = []
end_positions = []
for i, offset in enumerate(inputs["offset_mapping"]):
sample_idx = inputs["overflow_to_sample_mapping"][i] # 第 i 个新样本对应于原始的第 sample_idx 个样本
answer = answers[sample_idx] # 第 sample_idx 个原始样本的答案
start_char = answer["answer_start"][0] # 训练样本只有一个答案,获取它的开始
end_char = answer["answer_start"][0] + len(answer["text"][0]) # 训练样本只有一个答案,获取它的结束
sequence_ids = inputs.sequence_ids(i) # 第i个新样本的原始样本中每个 token 对应于 question 还是 context
idx = 0
    while sequence_ids[idx] != 1: # 找到 context 的起始位置
idx += 1
context_start = idx
while sequence_ids[idx] == 1: # 找到 context 的结束位置
idx += 1
context_end = idx - 1
# 如果答案不在当前新样本中, 则 label 为 (0, 0)
# offset 给出当前新样本包含的 token 所对应于原始样本中的位置 (token_start, token_end)
if start_char < offset[context_start][0] or end_char > offset[context_end][1]:
start_positions.append(0)
end_positions.append(0)
else: # 答案在当前新样本中
idx = context_start
while idx <= context_end and offset[idx][0] <= start_char: # 答案在新样本中的开始位置
idx += 1
start_positions.append(idx - 1)
idx = context_end
while idx >= context_start and offset[idx][1] >= end_char: # 答案在新样本中的结束位置
idx -= 1
end_positions.append(idx + 1)
print(start_positions)
# [83, 51, 19, 0, 0, 64, 27, 0, 34, 0, 0, 0, 67, 34, 0, 0, 0, 0, 0]
print(end_positions)
# [85, 53, 21, 0, 0, 70, 33, 0, 40, 0, 0, 0, 68, 35, 0, 0, 0, 0, 0]
注意,这种做法要求答案不能太长:答案必须完整地落在拆分后的某个新样本的上下文中。
现在我们验证该方法是否正确:
xxxxxxxxxx
###************* 答案在新样本的上下文中 ************
idx = 0 # 第一个新样本
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0] # 直接从 answers 中提取
start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1]) # 从新样本中解码
print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
# Theoretical answer: the Main Building, labels give: the Main Building
###************* 答案不在新样本的上下文中 ************
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]
decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")
# Theoretical answer: a Marian place of prayer and reflection, decoded example: [CLS] What is the Grotto at Notre Dame? [SEP] Architecturally, the school has a Catholic character.... [SEP]
现在开始准备预处理函数:
xxxxxxxxxx
max_length = 384
stride = 128
def preprocess_training_examples(examples):
questions = [q.strip() for q in examples["question"]] # 清除空格
inputs = tokenizer(
questions,
examples["context"],
max_length=max_length,
truncation="only_second", # 仅截断 context
        stride=stride, # 相邻两个截断块之间重叠的 token 数
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length", # 填充到全局最大
)
offset_mapping = inputs.pop("offset_mapping")
sample_map = inputs.pop("overflow_to_sample_mapping")
answers = examples["answers"]
start_positions = []
end_positions = []
for i, offset in enumerate(offset_mapping):
sample_idx = sample_map[i]
answer = answers[sample_idx]
start_char = answer["answer_start"][0]
end_char = answer["answer_start"][0] + len(answer["text"][0])
sequence_ids = inputs.sequence_ids(i) # 第i个新样本的原始样本中每个 token 对应于 question 还是 context
# Find the start and end of the context
idx = 0
while sequence_ids[idx] != 1:
idx += 1
context_start = idx
while sequence_ids[idx] == 1:
idx += 1
context_end = idx - 1
# If the answer is not fully inside the context, label is (0, 0)
if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
start_positions.append(0)
end_positions.append(0)
else:
# Otherwise it's the start and end token positions
idx = context_start
while idx <= context_end and offset[idx][0] <= start_char:
idx += 1
start_positions.append(idx - 1)
idx = context_end
while idx >= context_start and offset[idx][1] >= end_char:
idx -= 1
end_positions.append(idx + 1)
inputs["start_positions"] = start_positions
inputs["end_positions"] = end_positions
return inputs
然后将这个函数应用到 Dataset.map()
方法:
xxxxxxxxxx
train_dataset = raw_datasets["train"].map(
preprocess_training_examples,
batched=True,
remove_columns=raw_datasets["train"].column_names,
)
print(len(raw_datasets["train"]))
# 87599
print(len(train_dataset))
# 88729
处理验证数据集:预处理验证数据会稍微容易一些,因为我们不需要生成 label
(除非我们想计算验证集损失,但这个指标并不能真正帮助我们理解模型有多好)。需要注意的是:我们要将模型的预测结果解释为原始上下文的跨度。为此, 我们只需要存储 offset_mapping
、以及某种方式来将每个创建的新样本与原始样本相匹配。
xxxxxxxxxx
def preprocess_validation_examples(examples):
questions = [q.strip() for q in examples["question"]]
inputs = tokenizer(
questions,
examples["context"],
max_length=max_length,
truncation="only_second",
stride=stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
sample_map = inputs.pop("overflow_to_sample_mapping")
example_ids = [] # 样本ID,由数据集的 id 字段提供,格式为 "56be4db0acb8001400a502ec" 这种
for i in range(len(inputs["input_ids"])):
sample_idx = sample_map[i] # 第 i 个新样本对应于第 sample_idx 个原始样本
example_ids.append(examples["id"][sample_idx]) # 原始样本的样本id
sequence_ids = inputs.sequence_ids(i) # 第i个新样本的原始样本中每个 token 对应于 question 还是 context
offset = inputs["offset_mapping"][i]
inputs["offset_mapping"][i] = [
o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
]
inputs["example_id"] = example_ids
return inputs
validation_dataset = raw_datasets["validation"].map(
preprocess_validation_examples,
batched=True,
remove_columns=raw_datasets["validation"].column_names,
)
print(len(raw_datasets["validation"]))
# 10570
print(len(validation_dataset))
# 10822
注意,我们仅保留 context
的 offset_mapping
,而将 question
的 offset_mapping
设置为 None
。这么做是为了后处理步骤做准备。
在这个例子中,compute_metrics()
函数是个难点。由于我们已经人为地将所有样本填充到设置的最大长度,因此无需定义 data_collator
。困难的部分是将模型预测进行后处理,从而得到原始样本中的文本范围。
后处理:模型输出的是答案开始位置的 logit
、答案结束位置的 logit
。通常而言,我们需要做如下的处理:
首先,屏蔽掉上下文之外的 token
所对应的 logit
,因为答案必须位于上下文中。
然后,我们使用 softmax
将这些 logit
转换为概率。
然后,我们通过获取对应的两个概率的乘积,从而为每个 (start_token, end_token)
组合赋值。
最后,我们寻找有效的、答案分数最高的 (start_token, end_token)
组合。这里有效指的是,例如,start_token
必须位于 end_token
之前。
在这里我们稍微改变下这个流程,因为我们在 compute_metrics()
函数中不需要计算实际得分,因此可以跳过 softmax
的计算,然后通过 start logit
和 end logit
之和来获得 (start_token, end_token)
组合的得分。这里我们用 logit 之和来代替概率的乘积,因为 log(ab) = log(a) + log(b):跳过 softmax、在对数空间中求和,不会改变候选组合之间的相对排序。
此外,我们也没有对所有可能的 (start_token, end_token)
组合进行评分,而是对 top n_best
的 start_token
和 top n_best
的 end_token
的组合进行评分,其中 n_best = 20
。
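下面用一个最小的 sketch(随机生成的假想 logit)验证:对 logit 求和与对 softmax 概率求乘积,选出的最优 (start_token, end_token) 组合是相同的。
xxxxxxxxxx
import numpy as np
rng = np.random.default_rng(0)
start_logit = rng.normal(size=20) # 假想的 start logit
end_logit = rng.normal(size=20)   # 假想的 end logit
def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()
score_sum = start_logit[:, None] + end_logit[None, :]                     # logit 之和
score_prod = softmax(start_logit)[:, None] * softmax(end_logit)[None, :]  # 概率之积
best_sum = np.unravel_index(score_sum.argmax(), score_sum.shape)
best_prod = np.unravel_index(score_prod.argmax(), score_prod.shape)
print(best_sum == best_prod) # 预期为 True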
为了检验该做法的合理性,我们使用一个预训练好的模型来生成一些 prediction
:
xxxxxxxxxx
##************* 选择预训练好的一个模型来生成 prediction *********
small_eval_set = raw_datasets["validation"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
preprocess_validation_examples,
batched=True,
remove_columns=raw_datasets["validation"].column_names,
)
##************** 切换回我们自己的 bert-base-cased tokenizer,为后续的微调流程做准备 *****************
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
##************* 执行预测 ********************
import torch
from transformers import AutoModelForQuestionAnswering
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"]) # 移除不需要的列
eval_set_for_model.set_format("torch")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names} # data 移动到GPU
print(batch)
# {'input_ids': tensor(..., device='cuda:0'), 'attention_mask': tensor(..., device='cuda:0')}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)
with torch.no_grad():
outputs = trained_model(**batch)
print(outputs)
# QuestionAnsweringModelOutput(loss=None,
# start_logits=tensor(..., device='cuda:0'),
# end_logits=tensor(..., device='cuda:0'), hidden_states=None, attentions=None)
由于 Trainer
将为我们提供的 prediction
是 NumPy
数组格式,我们也进行这种格式转换:
xxxxxxxxxx
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()
现在,我们需要在原始的 small_eval_set
中找到模型对每个样本所预测的答案。一个原始样本可能已经在 eval_set
中拆分为多个新样本,因此第一步是将 small_eval_set
中的原始样本映射到 eval_set
中相应的新样本:
xxxxxxxxxx
import collections
example_to_features = collections.defaultdict(list) # string -> list 的字典
for idx, feature in enumerate(eval_set):
# feature["example_id"] 存放原始样本的样本编号(一个字符串), idx 为新样本的编号(一个整数)
example_to_features[feature["example_id"]].append(idx)
现在我们开始执行如前所示的处理流程:
xxxxxxxxxx
import numpy as np
n_best = 20
max_answer_length = 30 # 答案最多 30 个 token
predicted_answers = []
for example in small_eval_set:
example_id = example["id"]
context = example["context"]
answers = []
for feature_index in example_to_features[example_id]:
start_logit = start_logits[feature_index] # 第 feature_index 个新样本的 start_logit
end_logit = end_logits[feature_index] # 第 feature_index 个新样本的 end_logit
offsets = eval_set["offset_mapping"][feature_index] # 新样本的每个token,在原始样本中的位置,格式为(start_index, end_index)
start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist() # top n 的 start_logit
end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() # top n 的 end_logit
for start_index in start_indexes:
for end_index in end_indexes:
# 答案并不是完全位于 context 中,因此不考虑 (这意味着最多只有一个新样本包含完整的答案)
if offsets[start_index] is None or offsets[end_index] is None:
continue
# 如果答案长度为负、或者答案长度超过 max_answer_length,则不考虑
if (
end_index < start_index
or end_index - start_index + 1 > max_answer_length
):
continue
answers.append(
{ # 取 start_token 的开始位置、end_token 的结束位置
"text": context[offsets[start_index][0] : offsets[end_index][1]],
"logit_score": start_logit[start_index] + end_logit[end_index],
}
)
best_answer = max(answers, key=lambda x: x["logit_score"]) # 取 logit_score 的最大值
predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
print(predicted_answers[:3])
# [
# {'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'},
# {'id': '56be4db0acb8001400a502ed', 'prediction_text': 'Carolina Panthers'},
# {'id': '56be4db0acb8001400a502ee', 'prediction_text': "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California"}
# ]
评估指标:我们使用 load_metric
来加载 squad
的评估指标:
xxxxxxxxxx
from datasets import load_metric
metric = load_metric("squad")
这个指标预期我们提供:
预测答案:一个关于字典的列表,其中字典格式为 {"id": 样本id, "prediction_text": 预测文本}
。
注意:如果答案太长导致超过 max_answer_length
,或者因为原始样本截断导致答案被截断到两个新样本中,那么这个样本就没有预测答案。
真实答案:一个关于字典的列表,其中字典格式为 {"id": 样本id, "answers": 一组真实的参考答案}
。
现在我们计算预测答案的得分:
xxxxxxxxxx
theoretical_answers = [
{"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set # 从原始样本中
]
print(predicted_answers[0])
# {'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'}
print(theoretical_answers[0])
# {'id': '56be4db0acb8001400a502ec', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}}
print(metric.compute(predictions=predicted_answers, references=theoretical_answers))
# {'exact_match': 83.0, 'f1': 88.25000000000004}
现在把刚才所做的一切放在 compute_metrics()
函数中,我们将在 Trainer
中使用它。通常,compute_metrics()
函数只接收一个 eval_preds
,它包含 logits
和 labels
的元组。这里我们需要更多输入,因为我们必须在截断后的新数据集中查找 offset
。
compute_metrics()
函数几乎与前面的步骤相同,这里我们只是添加一个小检查,从而防止我们没有提出任何有效的答案(在这种情况下,我们预测一个空字符串)。
xxxxxxxxxx
from tqdm.auto import tqdm
def compute_metrics(start_logits, end_logits, features, examples):
##************* 记录每个原始样本被拆分到哪些新样本中 ***********
example_to_features = collections.defaultdict(list) # string -> list
for idx, feature in enumerate(features):
example_to_features[feature["example_id"]].append(idx)
##*************** 获取每个原始样本的预测结果 ***************
predicted_answers = []
for example in tqdm(examples):
example_id = example["id"]
context = example["context"]
answers = []
        # 遍历该原始样本拆分出的每个新样本
for feature_index in example_to_features[example_id]:
start_logit = start_logits[feature_index]
end_logit = end_logits[feature_index]
offsets = features[feature_index]["offset_mapping"]
start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist() # top-n start_logit
end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() # top-n end_logit
for start_index in start_indexes:
for end_index in end_indexes:
# 答案并不是完全位于 context 中,因此不考虑 (这意味着最多只有一个新样本包含完整的答案)
if offsets[start_index] is None or offsets[end_index] is None:
continue
# 如果答案长度为负、或者答案长度超过 max_answer_length,则不考虑
if (
end_index < start_index
or end_index - start_index + 1 > max_answer_length
):
continue
answer = {
"text": context[offsets[start_index][0] : offsets[end_index][1]],
"logit_score": start_logit[start_index] + end_logit[end_index],
}
answers.append(answer)
# 如果找到一组候选答案,则寻找得分最大的那个
if len(answers) > 0:
best_answer = max(answers, key=lambda x: x["logit_score"])
predicted_answers.append(
{"id": example_id, "prediction_text": best_answer["text"]}
)
else: # 否则没有候选答案,则默认为空字符串
predicted_answers.append({"id": example_id, "prediction_text": ""})
theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
return metric.compute(predictions=predicted_answers, references=theoretical_answers)
然后我们检验下该函数:
xxxxxxxxxx
print(compute_metrics(start_logits, end_logits, eval_set, small_eval_set))
# {'exact_match': 83.0, 'f1': 88.25000000000004}
模型微调:
首先创建模型:
xxxxxxxxxx
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
我们收到一个警告,有些权重没有使用(来自 pretrained head
)、另一些权重是随机初始化的(用于问答任务的 head)。这是正常的,因为这些新的 head 权重将在接下来的微调中学习。
然后,如果我们需要将结果推送到 HuggingFace Hub
,则需要登录:
xxxxxxxxxx
# Jupyter notebook 中
from huggingface_hub import notebook_login
notebook_login()
# 或者在 terminal 中执行命令: huggingface-cli login
也可以不登录。
之后,我们创建 TrainingArguments
:
xxxxxxxxxx
from transformers import TrainingArguments
args = TrainingArguments(
"bert-finetuned-squad",
evaluation_strategy="no", # 训练过程中不需要评估
save_strategy="epoch", # 每个 epoch 保存一次
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
fp16=True, # fp16 混合精度训练
push_to_hub=False, # 不需要推送到 HuggingFace Hub
)
注意,由于 compute_metrics()
的函数签名与 Trainer 期望的不同(它还需要 features 和 examples 两个额外参数),因此它不能用在常规的 evaluation loop
。一种解决办法是创建 Trainer
的子类;另一种方法是不在训练过程中评估,而是在训练结束后在自定义 loop
中进行评估。这里我们选择第二种办法。这就是 Trainer API
的局限性,也体现了 Accelerate
库的亮点:完全自定义的 training loop
。
然后我们创建 Trainer
:
xxxxxxxxxx
from transformers import Trainer
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=validation_dataset,
tokenizer=tokenizer,
)
trainer.train()
# TrainOutput(
# global_step=33276,
# training_loss=0.8343884834576605,
# metrics={'train_runtime': 1781.3809, 'train_samples_per_second': 149.427, 'train_steps_per_second': 18.68, 'total_flos': 5.216534983896422e+16, 'train_loss': 0.8343884834576605, 'epoch': 3.0}
# )
模型评估:Trainer
的 predict()
方法将返回一个元组,其中第一个元素将是模型的预测(这里 (start_logits, end_logits)
组合)。我们将其发送给 compute_metrics()
函数:
xxxxxxxxxx
predictions = trainer.predict(validation_dataset).predictions
start_logits, end_logits = predictions
print(compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"]))
# {'exact_match': 81.19205298013244, 'f1': 88.62747671089845}
Not bad! For comparison, the baseline scores reported for this model in the BERT paper are 80.8 (exact match) and 88.5 (F1), so our results are right in the expected range.
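Keep in mind that these logits are indexed by feature (the sliding-window chunks), not by original example, which is exactly why compute_metrics() needs both validation_dataset and raw_datasets["validation"]. A quick check (an illustrative sketch, not part of the original run):
xxxxxxxxxx
# one row per feature and one score per token position, so roughly (num_features, 384)
print(start_logits.shape, end_logits.shape)
# there are more features than original examples because of the sliding window
print(len(validation_dataset), len(raw_datasets["validation"]))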
Finally, if we want, we can upload the model to the HuggingFace Hub:
xxxxxxxxxx
trainer.push_to_hub(commit_message="Training complete")
Using the fine-tuned model: using the fine-tuned model in a pipeline is simple:
xxxxxxxxxx
from transformers import pipeline
model_checkpoint = "./bert-finetuned-squad/checkpoint-33276" # a local path, or a model id on the HuggingFace Hub
question_answerer = pipeline("question-answering", model=model_checkpoint)
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
print(question_answerer(question=question, context=context))
# {'score': 0.9980810880661011,
# 'start': 78,
# 'end': 105,
# 'answer': 'Jax, PyTorch and TensorFlow'}
A custom training loop with Accelerate: first, create the dataloaders:
xxxxxxxxxx
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator
raw_datasets = load_dataset("squad")
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
max_length = 384
stride = 128
def preprocess_training_examples(examples):
questions = [q.strip() for q in examples["question"]] # strip surrounding whitespace
inputs = tokenizer(
questions,
examples["context"],
max_length=max_length,
truncation="only_second", # truncate only the context
stride=stride, # stride of the sliding window
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length", # pad everything to max_length
)
offset_mapping = inputs.pop("offset_mapping")
sample_map = inputs.pop("overflow_to_sample_mapping")
answers = examples["answers"]
start_positions = []
end_positions = []
for i, offset in enumerate(offset_mapping):
sample_idx = sample_map[i]
answer = answers[sample_idx]
start_char = answer["answer_start"][0]
end_char = answer["answer_start"][0] + len(answer["text"][0])
sequence_ids = inputs.sequence_ids(i) # for each token of the i-th feature, whether it belongs to the question or the context
# Find the start and end of the context
idx = 0
while sequence_ids[idx] != 1:
idx += 1
context_start = idx
while sequence_ids[idx] == 1:
idx += 1
context_end = idx - 1
# If the answer is not fully inside the context, label is (0, 0)
if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
start_positions.append(0)
end_positions.append(0)
else:
# Otherwise it's the start and end token positions
idx = context_start
while idx <= context_end and offset[idx][0] <= start_char:
idx += 1
start_positions.append(idx - 1)
idx = context_end
while idx >= context_start and offset[idx][1] >= end_char:
idx -= 1
end_positions.append(idx + 1)
inputs["start_positions"] = start_positions
inputs["end_positions"] = end_positions
return inputs
def preprocess_validation_examples(examples):
questions = [q.strip() for q in examples["question"]]
inputs = tokenizer(
questions,
examples["context"],
max_length=max_length,
truncation="only_second",
stride=stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
sample_map = inputs.pop("overflow_to_sample_mapping")
example_ids = [] # example IDs, taken from the dataset's id field, e.g. "56be4db0acb8001400a502ec"
for i in range(len(inputs["input_ids"])):
sample_idx = sample_map[i] # the i-th feature comes from the sample_idx-th original example
example_ids.append(examples["id"][sample_idx]) # id of that original example
sequence_ids = inputs.sequence_ids(i) # for each token of the i-th feature, whether it belongs to the question or the context
offset = inputs["offset_mapping"][i]
inputs["offset_mapping"][i] = [
o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
]
inputs["example_id"] = example_ids
return inputs
train_dataset = raw_datasets["train"].map(
preprocess_training_examples,
batched=True,
remove_columns=raw_datasets["train"].column_names,
)
validation_dataset = raw_datasets["validation"].map(
preprocess_validation_examples,
batched=True,
remove_columns=raw_datasets["validation"].column_names,
)
train_dataset.set_format("torch") # set the format to PyTorch tensors
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"]) # drop the columns the model does not use
validation_set.set_format("torch")
train_dataloader = DataLoader(
train_dataset,
shuffle=True, # only the training set needs shuffling
collate_fn=default_data_collator, # use the default data collator
batch_size=8,
)
eval_dataloader = DataLoader(
validation_set, collate_fn=default_data_collator, batch_size=8
)
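Before moving on, an optional sanity check on a single batch can confirm the collated shapes (this snippet is only an illustration, not part of the original recipe):
xxxxxxxxxx
# peek at one collated batch: input_ids, token_type_ids and attention_mask are (8, 384),
# i.e. (batch_size, max_length), while start_positions and end_positions are (8,)
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})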
Then create the training components and run the training:
xxxxxxxxxx
from torch.optim import AdamW
from transformers import AutoModelForQuestionAnswering
from accelerate import Accelerator
from transformers import get_scheduler
import collections
import numpy as np
from datasets import load_metric
metric = load_metric("squad")
def compute_metrics(start_logits, end_logits, features, examples):
n_best = 20
max_answer_length = 30 # answers longer than 30 tokens are not considered
##************* Record which features each original example was split into ***********
example_to_features = collections.defaultdict(list) # string -> list
for idx, feature in enumerate(features):
example_to_features[feature["example_id"]].append(idx)
##*************** Get the predicted answer for each original example ***************
predicted_answers = []
for example in tqdm(examples):
example_id = example["id"]
context = example["context"]
answers = []
# loop over all the features associated with this example
for feature_index in example_to_features[example_id]:
start_logit = start_logits[feature_index]
end_logit = end_logits[feature_index]
offsets = features[feature_index]["offset_mapping"]
start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist() # top-n start_logit
end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() # top-n end_logit
for start_index in start_indexes:
for end_index in end_indexes:
# skip answers that are not fully inside the context (offsets outside the context were masked with None)
if offsets[start_index] is None or offsets[end_index] is None:
continue
# skip answers whose length is negative or longer than max_answer_length
if (
end_index < start_index
or end_index - start_index + 1 > max_answer_length
):
continue
answer = {
"text": context[offsets[start_index][0] : offsets[end_index][1]],
"logit_score": start_logit[start_index] + end_logit[end_index],
}
answers.append(answer)
# if there is at least one candidate answer, keep the one with the highest score
if len(answers) > 0:
best_answer = max(answers, key=lambda x: x["logit_score"])
predicted_answers.append(
{"id": example_id, "prediction_text": best_answer["text"]}
)
else: # no candidate answer: default to the empty string
predicted_answers.append({"id": example_id, "prediction_text": ""})
theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
return metric.compute(predictions=predicted_answers, references=theoretical_answers)
##************* Create the training components **********************
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator(mixed_precision='fp16')
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
len_validation = len(validation_dataset)
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
##************** Push to the HuggingFace Hub (optional) ***********
# from huggingface_hub import Repository, get_full_repo_name
# model_name = "bert-finetuned-squad-accelerate"
# repo_name = get_full_repo_name(model_name)
output_dir = "bert-finetuned-squad-accelerate"
# repo = Repository(output_dir, clone_from=repo_name)
##**************** Training Loop *****************
from tqdm.auto import tqdm
import torch
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
# Training
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
# Evaluation
model.eval()
start_logits = []
end_logits = []
accelerator.print("Evaluation!")
for batch in tqdm(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())
start_logits = np.concatenate(start_logits) # concatenate all the start_logits
end_logits = np.concatenate(end_logits) # concatenate all the end_logits
print("start_logits len: %d; end_logits len:%d; validation_dataset len:%d."%(
len(start_logits),len(end_logits), len_validation))
# start_logits len: 10822; end_logits len:10822; validation_dataset len:10822
start_logits = start_logits[: len_validation] # the accelerator may have padded a few extra samples at the end; truncate them
end_logits = end_logits[: len_validation] # the accelerator may have padded a few extra samples at the end; truncate them
metrics = compute_metrics(
start_logits, end_logits, validation_dataset, raw_datasets["validation"]
)
print(f"epoch {epoch}:", metrics)
# epoch 0: {'exact_match': 79.09176915799432, 'f1': 86.89534995209642}
# epoch 1: {'exact_match': 81.25827814569537, 'f1': 88.58745720707509}
# epoch 2: {'exact_match': 81.28666035950805, 'f1': 88.57238479975265}
# save (and optionally upload) at the end of each epoch
accelerator.wait_for_everyone() # block every process until all of them reach this point
unwrapped_model = accelerator.unwrap_model(model) # retrieve the underlying model
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save) # save with accelerator.save
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir) # save the tokenizer
# repo.push_to_hub(
# commit_message=f"Training in progress epoch {epoch}", blocking=False
# )
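Once training is done, the weights and tokenizer saved in output_dir can be loaded back just like any other checkpoint; for example (a minimal sketch, reusing the question and context from the pipeline example above):
xxxxxxxxxx
from transformers import pipeline
# load the model and tokenizer saved by the custom loop
question_answerer = pipeline("question-answering", model=output_dir)
print(question_answerer(question=question, context=context))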