Text cleaning
# Shared imports for the cleaning helpers below
import re
import string
import unicodedata

Remove URLs
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)
Remove emoji
def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
Remove HTML tags
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return html.sub('', text)
Remove punctuation
def remove_punct(text):
    # Strip every character in string.punctuation
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Alternative (renamed to avoid shadowing the function above):
# keep only ASCII letters and collapse whitespace
def remove_punct_regex(s):
    s = re.sub(r"([.!?])", r" ", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s
Character-encoding conversion (Unicode to ASCII)
def unicodeToAscii(s):
    # Decompose accented characters and drop the combining marks
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
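A quick sketch chaining the helpers above on a made-up string:
raw = "Check this out 😊 https://example.com <b>great</b> café!!!"
clean = remove_punct(remove_html(remove_emoji(remove_URL(unicodeToAscii(raw)))))
print(clean)  # roughly: "Check this out   great cafe"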
Remove low-frequency words
nltk
# Corpora and lexical resources
from nltk.corpus import stopwords, wordnet
# Lemmatization and tokenization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Frequency distributions and smoothed probabilities
from nltk.probability import FreqDist
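A minimal sketch of dropping low-frequency words with the FreqDist imported above (the sample tokens and the min_count threshold are illustrative; word_tokenize needs the nltk punkt data):
tokens = word_tokenize("the cat sat on the mat and the cat slept")
freq = FreqDist(tokens)
min_count = 2  # illustrative threshold
filtered = [w for w in tokens if freq[w] >= min_count]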
Data analysis and visualization
Word-frequency statistics over the dataset
from collections import Counter
c = Counter()
# most_common() returns (word, count) pairs sorted by descending count;
# ties keep first-seen (insertion) order
most = c.most_common()
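The counter has to be filled before most_common() is useful; a usage sketch over an assumed list of cleaned texts:
texts = ["the cat sat", "the dog sat"]  # illustrative corpus
for t in texts:
    c.update(t.split())
top10 = c.most_common(10)  # e.g. [('the', 2), ('sat', 2), ...]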
Relationship between two variables
import seaborn as sns  # iris below is assumed to be a pandas DataFrame of the Iris dataset
iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, height=5)
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
Pairwise relationships between variables
sns.pairplot(iris.drop("Id", axis=1), hue="Species", height=3)
Processing input data
bert
Tokenization and conversion
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
# Tokenize with the BERT wordpiece tokenizer
tokenizer.tokenize(combined[0])
# Convert tokens to ids (does not add [CLS], [SEP], etc.)
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(combined[0]))
# Encode (tokenize + convert to ids + add special tokens) in one call
tokenizer.encode(combined[0], max_length=512)
encoded_dict = tokenizer.encode_plus(
    text,                         # Sentence to encode.
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    truncation='longest_first',   # Activate and control truncation
    max_length=84,                # Max length according to our text data.
    padding='max_length',         # Pad all sentences to max_length (replaces the deprecated pad_to_max_length=True).
    return_attention_mask=True,   # Construct attention masks.
    return_tensors='pt',          # Return PyTorch tensors.
)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
Splitting the dataset
from torch.utils.data import TensorDataset, random_split
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
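train_size and val_size are not defined above; a common choice (assuming a 90/10 split) is:
train_size = int(0.9 * len(dataset))  # assumed 90/10 split
val_size = len(dataset) - train_size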
Wrapping the data
# TensorDataset packs tensors together (a and b must have the same first dimension)
from torch.utils.data import DataLoader
train_ids = TensorDataset(a, b)
for x_train, y_label in train_ids:
    print(x_train, y_label)
# DataLoader wraps the dataset for batching
train_loader = DataLoader(dataset=train_ids, batch_size=4, shuffle=True)
for i, data in enumerate(train_loader, 1):
    # enumerate returns two values: the batch index and the data (features plus labels)
    x_data, label = data
Tokenization and conversion
Building a vocabulary
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
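A short usage sketch (the language name and sentence are made up):
lang = Lang("eng")
lang.addSentence("the cat sat on the mat")
indexes = [lang.word2index[w] for w in "the cat sat".split(' ')]
print(lang.n_words, indexes)  # vocabulary size and the index sequence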
Packing the data
import numpy as np
import torch
input_ids, max_len = [], 0
for x in train:
    temp_ids = tokenizer.encode(x, add_special_tokens=True)
    max_len = max(max_len, len(temp_ids))
    input_ids.append(temp_ids)
# Pad to max_len to obtain input_ids and attention_masks, then convert to tensors
input_ids = np.array([i + [0] * (max_len - len(i)) for i in input_ids])
attention_masks = np.where(input_ids != 0, 1, 0)
dataset = TensorDataset(torch.tensor(input_ids),
                        torch.tensor(attention_masks),
                        torch.tensor(labels))
Wrapping the data
A custom Dataset must inherit from torch.utils.data.Dataset and implement two methods: __getitem__(), which returns one sample for an index in the range 0 to len(self) - 1, and __len__(), which returns the total size of the dataset.
from torch.utils.data import Dataset, DataLoader
class MRPCDataset(Dataset):
    def __init__(self, dataset):
        self.data = dataset

    def __getitem__(self, index):
        # Any per-sample preprocessing can go here
        return self.data[index][0], self.data[index][1], self.data[index][2]

    def __len__(self):
        return len(self.data)

# Instantiate and feed into a DataLoader (shuffle=True for random batching)
train_dataset = MRPCDataset(train_dataset)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
Defining and loading the model
bert
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',        # The 24-layer, 1024-hidden, 16-head, 340M-parameter BERT model with an uncased vocab.
    num_labels=2,                # Number of output labels: 2 for binary classification; increase for multi-class tasks.
    output_attentions=False,     # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden states.
)
Custom model
import torch
# Define the forward pass; backpropagation is handled automatically
class FCModel(torch.nn.Module):  # must inherit from torch.nn.Module
    def __init__(self):
        super(FCModel, self).__init__()  # initialize the parent class
        # Layers can be defined in several ways; here a single fully connected layer
        self.fc = torch.nn.Linear(in_features=768, out_features=1)

    def forward(self, input):
        # Apply the model's layers
        score = self.fc(input)
        result = torch.sigmoid(score)
        return result
GPU/CPU
# Pick the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the device
model = FCModel()  # instantiate the model
model = model.to(device)
# Move the data to the same device
input_ids = input_ids.to(device)
GPU settings
# Run on multiple GPUs in parallel
import torch.nn as nn
model = nn.DataParallel(model)
# Free cached GPU memory
torch.cuda.empty_cache()
Optimizer
from transformers import AdamW, get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(),
                  lr=6e-6,   # args.learning_rate
                  eps=1e-8   # args.adam_epsilon
                  )
# Learning-rate warm-up
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,              # Default value in run_glue.py
    num_training_steps=total_steps
)
RMSProp
Idea: for parameters whose gradients oscillate strongly, slow down the update; for parameters whose gradients oscillate little, speed it up.
RMSprop divides by a root mean square of past gradients, which mitigates Adagrad's rapidly decaying learning rate and works well for RNNs.
torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
Pros: mitigates Adagrad's rapidly decaying learning rate; the root-mean-square term damps oscillations, so it suits non-stationary objectives and works well for RNNs.
Cons: still depends on a global learning rate.
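A minimal usage sketch (reusing the FCModel instance from above; the hyperparameters are the defaults quoted in the signature):
rmsprop = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99, eps=1e-08)
# typical update: rmsprop.zero_grad(); loss.backward(); rmsprop.step()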
Adam
Adam combines the Momentum and RMSProp algorithms: it accumulates gradients with momentum for faster convergence and smaller oscillations, and it adds bias correction.
torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
Pros:
1. No stationarity requirement on the objective, i.e. the loss function may change over time.
2. Parameter updates are invariant to rescaling of the gradient.
3. The step size does not depend on the gradient magnitude, only on alpha, beta_1 and beta_2, which also bound it in theory.
4. The step size stays roughly within the initial learning rate.
5. Handles noisy samples well and naturally anneals the step size (automatic learning-rate adjustment).
6. Well suited to large-scale data and parameter settings, non-stationary objectives, and sparse or very noisy gradients.
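For comparison, a minimal Adam sketch with the default betas (the AdamW used in the fine-tuning code above is Adam with decoupled weight decay):
adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08)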
Training and evaluation
for epoch_i in range(0, epochs):
    # Switch to training mode (enables dropout, batch-norm updates, etc.)
    model.train()
    # model.eval() tells every layer it is in eval mode, so layers such as
    # BatchNorm and Dropout behave accordingly instead of in training mode
    # model.eval()
    for step, batch in enumerate(train_dataloader):
        # batch is assumed to hold (input_ids, attention_mask, labels)
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()  # clear gradients (equivalent to optimizer.zero_grad() here)
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
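The evaluation pass is only hinted at by the commented-out model.eval(); a minimal sketch, assuming a val_dataloader with the same batch layout and the same (loss, logits) return convention as the training loop:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == b_labels).sum().item()
        total += b_labels.size(0)
print('validation accuracy:', correct / total)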
Saving and loading models
Generic PyTorch model
# Save the whole model (not recommended for large models)
torch.save(model, "./model_fc.pth")
model = torch.load("./model_fc.pth")
# Save only the layer parameters (recommended for large models)
torch.save(model.state_dict(), "./model_fc.pt")
model = FCModel()  # a model instance must be constructed before loading
model.load_state_dict(torch.load("./model_fc.pt"))
huggingface
# Save (the tokenizer does not need to be saved if it was not modified)
bert_model.save_pretrained('./Fine_tune_BERT/')
# Load
bert_model = TFBertModel.from_pretrained('./Fine_tune_BERT/')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
Visualizing attention
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def showAttention(input_sentence, output_words, attentions):
    # Set up the figure with a colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)
    # Set up the axis labels
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)
    # Show a label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()

def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)