RoBERTa Model

RoBERTa: A Robustly Optimized BERT Pretraining Approach (arxiv.org)

Roberta-large and Roberta-base Model

Try RoBERTa model

from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="roberta-base",
    tokenizer="roberta-base"
)
fill_mask("Send these <mask> back!")

[{'sequence': 'Send these pictures back!',
'score': 0.166615292429924,
'token': 3493,
'token_str': ' pictures'},
{'sequence': 'Send these photos back!',
'score': 0.10792841762304306,
'token': 2356,
'token_str': ' photos'},
{'sequence': 'Send these emails back!',
'score': 0.07670938968658447,
'token': 5575,
'token_str': ' emails'},
{'sequence': 'Send these images back!',
'score': 0.048607729375362396,
'token': 3156,
'token_str': ' images'},
{'sequence': 'Send these letters back!',
'score': 0.0484173484146595,
'token': 5430,
'token_str': ' letters'}]

Here is how to use this model to get the features of a given text in PyTorch:

from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

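To see what this forward pass returns, it helps to print the output shapes. The lines below continue the snippet above and assume a recent transformers version, where the model returns an output object with last_hidden_state and pooler_output:

print(output.last_hidden_state.shape)  # (batch_size, sequence_length, 768): one vector per input token
print(output.pooler_output.shape)      # (batch_size, 768): a single pooled vector for the whole text
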
Retrain Our Own RoBERTa MLM Using Our Own Dataset

Huggingface🤗Transformers: Retraining roberta-base using the RoBERTa MLM Procedure

from transformers import RobertaTokenizer, RobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')
import pandas as pd

# Extract the columns we need from the raw data.
input_list = ['url', 'snippet', 'zhanwei', 'title', 'body', 'label']
data = pd.read_csv('./data/unlabeled_large.tsv', sep='\t', header=None, names=input_list)
data_out = data['title'] + ". " + data['body']
# print(data_out[0])
data_out.to_csv('./data/unlabeled_large_new.tsv', sep='\t', header=False, index=False)
from transformers import LineByLineTextDataset

# If the data file already has one example per line, this helper can be used directly.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./data/unlabeled_large_new.tsv",
    block_size=512,  # maximum sequence length RoBERTa can handle
)
from transformers import DataCollatorForLanguageModeling

# The data collator forms input batches in the shape the LM expects:
# it pads all examples in a batch to the same length,
# and for the MLM task it randomly masks 15% of the tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
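
As a quick sanity check (a small sketch added here, not part of the original procedure), the collator can be called directly on a couple of tokenized lines to inspect the masked batch it produces:

# Collate two toy examples: roughly 15% of the input_ids are replaced by the mask token,
# and `labels` keeps the original ids at the masked positions (-100 everywhere else).
examples = [tokenizer("A first toy sentence."), tokenizer("A second, slightly longer toy sentence.")]
batch = data_collator(examples)
print(batch['input_ids'])
print(batch['labels'])
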
from transformers import Trainer, TrainingArguments

train_args = TrainingArguments(
    output_dir='./output/roberta_retrained',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=5,
    save_total_limit=2,
    seed=1
)

trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=dataset
)
trainer.train()
trainer.save_model("./output/roberta_retrained")
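
Once training finishes, the retrained checkpoint can be loaded back like any other pretrained model. For example (assuming the output directory used above), the fill-mask pipeline from the start of these notes works with it directly:

from transformers import pipeline

# Load the retrained MLM from the local output directory; the stock roberta-base tokenizer is reused.
fill_mask = pipeline(
    "fill-mask",
    model="./output/roberta_retrained",
    tokenizer="roberta-base"
)
fill_mask("Send these <mask> back!")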

Finetune RoBERTa for Text Classification

Code: ML-and-Data-Analysis/RoBERTa for text classification.ipynb at master

Explanation: High accuracy text classification with Python | Towards Data Science

The code above runs into a few problems with newer library versions; the code below is the fixed and annotated version.

import torch

# Set random seed and set device to GPU.
torch.manual_seed(17)

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device('cpu')

print(device)

Extract the columns we need from the raw data.

import pandas as pd

input_list = ['query', 'host', 'url', 'title', 'body', 'label']
data = pd.read_csv('./data/train.tsv', sep='\t', header=None, names=input_list)

data_out = pd.DataFrame()
data_out['titletext'] = data['title'] + ". " + data['body']
data_out['label'] = data['label'].astype('Int64')
data_out['length'] = data_out['titletext'].apply(lambda x: len(str(x).split()))

data_out = data_out[data_out['label']>=0] # drop rows whose label is missing

data_out['titletext'] = data_out['titletext'].apply(lambda x: " ".join(str(x).split()[:512])) # truncate each text to its first 512 whitespace-separated tokens
data_out.to_csv('./data/train_classification.csv')
#print(data_out['label'][0])
#print(data_out['length'][0])

Install torchtext: conda install -c pytorch torchtext

from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

# Set tokenizer hyperparameters.
MAX_SEQ_LEN = 512
BATCH_SIZE = 16
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)  # PAD_INDEX = 1
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)  # id used for out-of-vocabulary tokens (e.g. "[UNK]"). UNK_INDEX = 3

# Define columns to read.
# https://torchtext.readthedocs.io/en/latest/data.html#field
label_field = Field(sequential=False, use_vocab=False, batch_first=True)  # sequential: whether the data is sequential (default True); if False, no tokenization is applied
text_field = Field(use_vocab=False,            # whether to build a vocabulary (default True)
                   tokenize=tokenizer.encode,  # tokenization function (default str.split)
                   include_lengths=False,
                   batch_first=True,           # put the batch dimension first
                   fix_length=MAX_SEQ_LEN,     # pad/truncate every example to this length; default None means variable length
                   pad_token=PAD_INDEX,        # token used for padding (default <pad>)
                   unk_token=UNK_INDEX)        # token used for out-of-vocabulary words (default <unk>)

fields = {'titletext': ('titletext', text_field), 'label': ('label', label_field)}

# Split one file into training, validation and test data.
# https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.TabularDataset
train_data, valid_data, test_data = TabularDataset(path="./data/train_classification.csv",  # the CSV written above
                                                   format='CSV',
                                                   fields=fields,
                                                   skip_header=False).split(split_ratio=[0.70, 0.2, 0.1],
                                                                            stratified=True,      # whether the sampling should be stratified (default False)
                                                                            strata_field='label') # name of the Field to stratify over (default 'label')

# Create train and validation iterators.
# https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator
train_iter, valid_iter = BucketIterator.splits((train_data, valid_data),
                                               batch_size=BATCH_SIZE,
                                               device=device,
                                               shuffle=True,
                                               sort_key=lambda x: len(x.titletext),
                                               sort=True,
                                               sort_within_batch=False)

# Test iterator, no shuffling or sorting required.
test_iter = Iterator(test_data, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)
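
To confirm the iterators are wired up correctly, it is worth peeking at one batch (a quick check added here, not from the original notebook). With the Field definitions above, each batch exposes a titletext tensor of shape (BATCH_SIZE, MAX_SEQ_LEN) and a label tensor of shape (BATCH_SIZE,):

# Grab a single batch and check the tensor shapes.
batch = next(iter(train_iter))
print(batch.titletext.shape)  # torch.Size([16, 512])
print(batch.label.shape)      # torch.Size([16])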

Define a custom model

from transformers import RobertaModel

# Model with extra layers on top of RoBERTa
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()

        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 2)

    def forward(self, input_ids, attention_mask):
        # return_dict=False makes the encoder return a tuple so the pooled output can be unpacked;
        # without it, newer transformers versions return a ModelOutput and this line raises an error.
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.nn.Tanh()(x)
        x = self.d2(x)
        x = self.l2(x)

        return x
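
A quick shape check of the classifier head (an illustrative sketch; the sentence is arbitrary and the weights are still untrained):

# Encode one sentence and run it through the classifier.
sample = tokenizer("A quick shape check.", return_tensors='pt')
clf = ROBERTAClassifier(dropout_rate=0.3)
logits = clf(input_ids=sample['input_ids'], attention_mask=sample['attention_mask'])
print(logits.shape)  # torch.Size([1, 2]): one row of logits over the two classes
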
def pretrain(model,
             optimizer,
             train_iter,
             valid_iter,
             scheduler = None,
             valid_period = len(train_iter),
             num_epochs = 5):

    # Pretrain linear layers, do not train bert
    for param in model.roberta.parameters():
        param.requires_grad = False

    model.train()

    # Initialize losses and loss histories
    train_loss = 0.0
    valid_loss = 0.0
    global_step = 0

    # Train loop
    for epoch in range(num_epochs):
        for (source, target), _ in train_iter:
            mask = (source != PAD_INDEX).type(torch.uint8)

            y_pred = model(input_ids=source, attention_mask=mask)
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
            loss.backward()

            # Optimizer and scheduler step
            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

            # Update train loss and global step
            train_loss += loss.item()
            global_step += 1

            # Validation loop. Save progress and evaluate model performance.
            if global_step % valid_period == 0:
                model.eval()

                with torch.no_grad():
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)

                        y_pred = model(input_ids=source, attention_mask=mask)
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        valid_loss += loss.item()

                # Store train and validation loss history
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)

                model.train()

                # print summary
                print('Epoch [{}/{}], global step [{}/{}], PT Loss: {:.4f}, Val Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter), train_loss, valid_loss))

                train_loss = 0.0
                valid_loss = 0.0

    # Set bert parameters back to trainable
    for param in model.roberta.parameters():
        param.requires_grad = True

    print('Pre-training done!')
# Training Function
output_path = "./output/roberta_classification"

def train(model,
          optimizer,
          train_iter,
          valid_iter,
          scheduler = None,
          num_epochs = 5,
          valid_period = len(train_iter),
          output_path = output_path):

    # Initialize losses and loss histories
    train_loss = 0.0
    valid_loss = 0.0
    train_loss_list = []
    valid_loss_list = []
    best_valid_loss = float('Inf')

    global_step = 0
    global_steps_list = []

    model.train()

    # Train loop
    for epoch in range(num_epochs):
        for (source, target), _ in train_iter:
            mask = (source != PAD_INDEX).type(torch.uint8)

            y_pred = model(input_ids=source, attention_mask=mask)
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
            loss.backward()

            #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

            # Optimizer and scheduler step
            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

            # Update train loss and global step
            train_loss += loss.item()
            global_step += 1

            # Validation loop. Save progress and evaluate model performance.
            if global_step % valid_period == 0:
                model.eval()

                with torch.no_grad():
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)

                        y_pred = model(input_ids=source, attention_mask=mask)
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        valid_loss += loss.item()

                # Store train and validation loss history
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)
                train_loss_list.append(train_loss)
                valid_loss_list.append(valid_loss)
                global_steps_list.append(global_step)

                # print summary
                print('Epoch [{}/{}], global step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss))

                # checkpoint
                if best_valid_loss > valid_loss:
                    best_valid_loss = valid_loss
                    save_checkpoint(output_path + '/model.pkl', model, best_valid_loss)
                    save_metrics(output_path + '/metric.pkl', train_loss_list, valid_loss_list, global_steps_list)

                train_loss = 0.0
                valid_loss = 0.0
                model.train()

    save_metrics(output_path + '/metric.pkl', train_loss_list, valid_loss_list, global_steps_list)
    print('Training done!')
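
The helpers save_checkpoint, save_metrics, load_metrics and load_checkpoint are called in these notes but never defined. A minimal sketch that is compatible with those calls (not necessarily identical to the original article's versions) could look like this:

# Checkpoint helpers: store/restore the model state dict and the loss histories with torch.save/torch.load.
def save_checkpoint(path, model, valid_loss):
    torch.save({'model_state_dict': model.state_dict(), 'valid_loss': valid_loss}, path)

def load_checkpoint(path, model):
    state = torch.load(path, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    return state['valid_loss']

def save_metrics(path, train_loss_list, valid_loss_list, global_steps_list):
    torch.save({'train_loss_list': train_loss_list,
                'valid_loss_list': valid_loss_list,
                'global_steps_list': global_steps_list}, path)

def load_metrics(path):
    state = torch.load(path, map_location=device)
    return state['train_loss_list'], state['valid_loss_list'], state['global_steps_list']
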
from transformers import AdamW, get_linear_schedule_with_warmup

# Main training loop
NUM_EPOCHS = 6
steps_per_epoch = len(train_iter)

model = ROBERTAClassifier(0.4)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=steps_per_epoch*1,
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)

print("======================= Start pretraining ==============================")

pretrain(model=model,
         train_iter=train_iter,
         valid_iter=valid_iter,
         optimizer=optimizer,
         scheduler=scheduler,
         num_epochs=NUM_EPOCHS)

NUM_EPOCHS = 12
print("======================= Start training =================================")
optimizer = AdamW(model.parameters(), lr=2e-6)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=steps_per_epoch*2,
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)

train(model=model,
      train_iter=train_iter,
      valid_iter=valid_iter,
      optimizer=optimizer,
      scheduler=scheduler,
      num_epochs=NUM_EPOCHS)

======================= Start pretraining ==============================
Epoch [1/6], global step [4442/26652], PT Loss: 0.6695, Val Loss: 0.6873
Epoch [2/6], global step [8884/26652], PT Loss: 0.6504, Val Loss: 0.6829
Epoch [3/6], global step [13326/26652], PT Loss: 0.6436, Val Loss: 0.6726
Epoch [4/6], global step [17768/26652], PT Loss: 0.6333, Val Loss: 0.6459
Epoch [5/6], global step [22210/26652], PT Loss: 0.6185, Val Loss: 0.6076
Epoch [6/6], global step [26652/26652], PT Loss: 0.6044, Val Loss: 0.6007
Pre-training done!
======================= Start training =================================
Epoch [1/12], global step [4442/53304], Train Loss: 0.4489, Valid Loss: 0.3322
Epoch [2/12], global step [8884/53304], Train Loss: 0.3454, Valid Loss: 0.2886
Epoch [3/12], global step [13326/53304], Train Loss: 0.2872, Valid Loss: 0.2474
Epoch [4/12], global step [17768/53304], Train Loss: 0.2378, Valid Loss: 0.2406
Epoch [5/12], global step [22210/53304], Train Loss: 0.2052, Valid Loss: 0.2541
Epoch [6/12], global step [26652/53304], Train Loss: 0.1811, Valid Loss: 0.2360
Epoch [7/12], global step [31094/53304], Train Loss: 0.1597, Valid Loss: 0.2461
Epoch [8/12], global step [35536/53304], Train Loss: 0.1419, Valid Loss: 0.2602
Epoch [9/12], global step [39978/53304], Train Loss: 0.1299, Valid Loss: 0.2449
Epoch [10/12], global step [44420/53304], Train Loss: 0.1193, Valid Loss: 0.2503
Epoch [11/12], global step [48862/53304], Train Loss: 0.1106, Valid Loss: 0.2511
Epoch [12/12], global step [53304/53304], Train Loss: 0.1064, Valid Loss: 0.2479
Training done!

Plot the training and validation loss curves

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
train_loss_list, valid_loss_list, global_steps_list = load_metrics(output_path + '/metric.pkl')
plt.plot(global_steps_list, train_loss_list, label='Train')
plt.plot(global_steps_list, valid_loss_list, label='Valid')
plt.xlabel('Global Steps', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.legend(fontsize=14)
plt.show()

Evaluate the model

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
import seaborn as sns

# Evaluation Function
def evaluate(model, test_loader):
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for (source, target), _ in test_loader:
            mask = (source != PAD_INDEX).type(torch.uint8)

            output = model(source, attention_mask=mask)
            y_pred.extend(torch.argmax(output, axis=-1).tolist())
            y_true.extend(target.tolist())

    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=[0, 1], digits=4))

    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    ax = plt.subplot()

    sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.xaxis.set_ticklabels(['Facts', 'Opinion'])
    ax.yaxis.set_ticklabels(['Facts', 'Opinion'])

model = ROBERTAClassifier()
model = model.to(device)

load_checkpoint(output_path + '/model.pkl', model)

evaluate(model, test_iter)

Classification Report:
              precision    recall  f1-score   support

           0     0.8763    0.9301    0.9024      1645
           1     0.8981    0.8244    0.8597      1230

    accuracy                         0.8849      2875
   macro avg     0.8872    0.8772    0.8810      2875
weighted avg     0.8856    0.8849    0.8841      2875

Use RobertaForSequenceClassification from transformers directly

from transformers import RobertaForSequenceClassification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)  # num_labels=2 adds a two-class classification head

# During training:
outputs = model(input_ids=source, attention_mask=mask, labels=target)  # the labels are passed directly to the model
# outputs is a tuple-like SequenceClassifierOutput:
# SequenceClassifierOutput(
#     loss=loss,        # single_label_classification: CrossEntropyLoss(); multi_label_classification: BCEWithLogitsLoss()
#     logits=logits,
#     hidden_states=outputs.hidden_states,
#     attentions=outputs.attentions,
# )
loss = outputs[0]
y_pred = outputs[1]
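
For reference, here is a minimal, self-contained sketch of one training step with RobertaForSequenceClassification; the texts, labels and learning rate are made up for illustration only:

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# A tiny made-up batch, just to show the shapes involved (0 = fact, 1 = opinion).
texts = ["The report cites official statistics.", "I think this policy is a terrible idea."]
labels = torch.tensor([0, 1])

batch = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

outputs = model(**batch, labels=labels)  # the head computes the CrossEntropyLoss internally
outputs.loss.backward()
optimizer.step()
optimizer.zero_grad()

print(outputs.logits.argmax(dim=-1))     # predicted class per example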