Text cleaning
# Shared imports for the cleaning helpers below
import re
import string
import unicodedata

Remove URLs
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)
Remove emoji
def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
Remove HTML tags
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return html.sub('', text)
Remove punctuation
def remove_punct(text):
    # Strip every character in string.punctuation
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Alternative (renamed to avoid shadowing the function above):
# keep only ASCII letters and collapse whitespace
def remove_punct_regex(s):
    s = re.sub(r"([.!?])", r" ", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s
Character-encoding conversion (Unicode to ASCII)
def unicodeToAscii(s):
    # Decompose accented characters and drop the combining marks
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
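A quick sketch chaining the helpers above on a made-up string:
raw = "Check this out 😊 https://example.com <b>great</b> café!!!"
clean = remove_punct(remove_html(remove_emoji(remove_URL(unicodeToAscii(raw)))))
print(clean)  # roughly: "Check this out   great cafe"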
Remove low-frequency words
nltk
# Corpora and lexical resources
from nltk.corpus import stopwords, wordnet
# Lemmatization and tokenization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Frequency distributions and smoothed probabilities
from nltk.probability import FreqDist
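A minimal sketch of dropping low-frequency words with the FreqDist imported above (the sample tokens and the min_count threshold are illustrative; word_tokenize needs the nltk punkt data):
tokens = word_tokenize("the cat sat on the mat and the cat slept")
freq = FreqDist(tokens)
min_count = 2  # illustrative threshold
filtered = [w for w in tokens if freq[w] >= min_count]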
Data analysis and visualization
Word-frequency statistics over the dataset
from collections import Counter
c = Counter()
# most_common() returns (word, count) pairs sorted by descending count;
# ties keep first-seen (insertion) order
most = c.most_common()
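The counter has to be filled before most_common() is useful; a usage sketch over an assumed list of cleaned texts:
texts = ["the cat sat", "the dog sat"]  # illustrative corpus
for t in texts:
    c.update(t.split())
top10 = c.most_common(10)  # e.g. [('the', 2), ('sat', 2), ...]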
Relationship between two variables
import seaborn as sns  # iris below is assumed to be a pandas DataFrame of the Iris dataset
iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, height=5)
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
Pairwise relationships between variables
sns.pairplot(iris.drop("Id", axis=1), hue="Species", height=3)
Processing input data
bert
Tokenization and conversion
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
# Tokenize with the BERT wordpiece tokenizer
tokenizer.tokenize(combined[0])
# Convert tokens to ids (does not add [CLS], [SEP], etc.)
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(combined[0]))
# Encode (tokenize + convert to ids + add special tokens) in one call
tokenizer.encode(combined[0], max_length=512)
encoded_dict = tokenizer.encode_plus(
    text,                         # Sentence to encode.
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    truncation='longest_first',   # Activate and control truncation
    max_length=84,                # Max length according to our text data.
    padding='max_length',         # Pad all sentences to max_length (replaces the deprecated pad_to_max_length=True).
    return_attention_mask=True,   # Construct attention masks.
    return_tensors='pt',          # Return PyTorch tensors.
)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
Splitting the dataset
from torch.utils.data import TensorDataset, random_split
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
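train_size and val_size are not defined above; a common choice (assuming a 90/10 split) is:
train_size = int(0.9 * len(dataset))  # assumed 90/10 split
val_size = len(dataset) - train_size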
Wrapping the data
# TensorDataset packs tensors together (a and b must have the same first dimension)
from torch.utils.data import DataLoader
train_ids = TensorDataset(a, b)
for x_train, y_label in train_ids:
    print(x_train, y_label)
# DataLoader wraps the dataset for batching
train_loader = DataLoader(dataset=train_ids, batch_size=4, shuffle=True)
for i, data in enumerate(train_loader, 1):
    # enumerate returns two values: the batch index and the data (features plus labels)
    x_data, label = data
Tokenization and conversion
Building a vocabulary
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
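A short usage sketch (the language name and sentence are made up):
lang = Lang("eng")
lang.addSentence("the cat sat on the mat")
indexes = [lang.word2index[w] for w in "the cat sat".split(' ')]
print(lang.n_words, indexes)  # vocabulary size and the index sequence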
Packing the data
import numpy as np
import torch
input_ids, max_len = [], 0
for x in train:
    temp_ids = tokenizer.encode(x, add_special_tokens=True)
    max_len = max(max_len, len(temp_ids))
    input_ids.append(temp_ids)
# Pad to max_len to obtain input_ids and attention_masks, then convert to tensors
input_ids = np.array([i + [0] * (max_len - len(i)) for i in input_ids])
attention_masks = np.where(input_ids != 0, 1, 0)
dataset = TensorDataset(torch.tensor(input_ids),
                        torch.tensor(attention_masks),
                        torch.tensor(labels))
Wrapping the data
A custom Dataset must inherit from torch.utils.data.Dataset and implement two methods: __getitem__(), which returns one sample for an index in the range 0 to len(self) - 1, and __len__(), which returns the total size of the dataset.
from torch.utils.data import Dataset, DataLoader
class MRPCDataset(Dataset):
    def __init__(self, dataset):
        self.data = dataset

    def __getitem__(self, index):
        # Any per-sample preprocessing can go here
        return self.data[index][0], self.data[index][1], self.data[index][2]

    def __len__(self):
        return len(self.data)

# Instantiate and feed into a DataLoader (shuffle=True for random batching)
train_dataset = MRPCDataset(train_dataset)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
Defining and loading the model
bert
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',        # The 24-layer, 1024-hidden, 16-head, 340M-parameter BERT model with an uncased vocab.
    num_labels=2,                # Number of output labels: 2 for binary classification; increase for multi-class tasks.
    output_attentions=False,     # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden states.
)
Custom model
import torch
# Define the forward pass; backpropagation is handled automatically
class FCModel(torch.nn.Module):  # must inherit from torch.nn.Module
    def __init__(self):
        super(FCModel, self).__init__()  # initialize the parent class
        # Layers can be defined in several ways; here a single fully connected layer
        self.fc = torch.nn.Linear(in_features=768, out_features=1)

    def forward(self, input):
        # Apply the model's layers
        score = self.fc(input)
        result = torch.sigmoid(score)
        return result
GPU/CPU
# Pick the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the device
model = FCModel()  # instantiate the model
model = model.to(device)
# Move the data to the same device
input_ids = input_ids.to(device)
GPU settings
# Run on multiple GPUs in parallel
import torch.nn as nn
model = nn.DataParallel(model)
# Free cached GPU memory
torch.cuda.empty_cache()
Optimizer
from transformers import AdamW, get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(),
                  lr=6e-6,   # args.learning_rate
                  eps=1e-8   # args.adam_epsilon
                  )
# Learning-rate warm-up
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,              # Default value in run_glue.py
    num_training_steps=total_steps
)
RMSProp
Idea: for parameters whose gradients oscillate strongly, slow down the update; for parameters whose gradients oscillate little, speed it up.
RMSprop divides by a root mean square of past gradients, which mitigates Adagrad's rapidly decaying learning rate and works well for RNNs.
torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
Pros: mitigates Adagrad's rapidly decaying learning rate; the root-mean-square term damps oscillations, so it suits non-stationary objectives and works well for RNNs.
Cons: still depends on a global learning rate.
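A minimal usage sketch (reusing the FCModel instance from above; the hyperparameters are the defaults quoted in the signature):
rmsprop = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99, eps=1e-08)
# typical update: rmsprop.zero_grad(); loss.backward(); rmsprop.step()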
Adam
Adam combines the Momentum and RMSProp algorithms: it accumulates gradients with momentum for faster convergence and smaller oscillations, and it adds bias correction.
torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
Pros:
1. No stationarity requirement on the objective, i.e. the loss function may change over time.
2. Parameter updates are invariant to rescaling of the gradient.
3. The step size does not depend on the gradient magnitude, only on alpha, beta_1 and beta_2, which also bound it in theory.
4. The step size stays roughly within the initial learning rate.
5. Handles noisy samples well and naturally anneals the step size (automatic learning-rate adjustment).
6. Well suited to large-scale data and parameter settings, non-stationary objectives, and sparse or very noisy gradients.
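For comparison, a minimal Adam sketch with the default betas (the AdamW used in the fine-tuning code above is Adam with decoupled weight decay):
adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08)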
Training and evaluation
for epoch_i in range(0, epochs):
    # Switch to training mode (enables dropout, batch-norm updates, etc.)
    model.train()
    # model.eval() tells every layer it is in eval mode, so layers such as
    # BatchNorm and Dropout behave accordingly instead of in training mode
    # model.eval()
    for step, batch in enumerate(train_dataloader):
        # batch is assumed to hold (input_ids, attention_mask, labels)
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()  # clear gradients (equivalent to optimizer.zero_grad() here)
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
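The evaluation pass is only hinted at by the commented-out model.eval(); a minimal sketch, assuming a val_dataloader with the same batch layout and the same (loss, logits) return convention as the training loop:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == b_labels).sum().item()
        total += b_labels.size(0)
print('validation accuracy:', correct / total)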
Saving and loading models
Generic PyTorch model
# Save the whole model (not recommended for large models)
torch.save(model, "./model_fc.pth")
model = torch.load("./model_fc.pth")
# Save only the layer parameters (recommended for large models)
torch.save(model.state_dict(), "./model_fc.pt")
model = FCModel()  # a model instance must be constructed before loading
model.load_state_dict(torch.load("./model_fc.pt"))
huggingface
# Save (the tokenizer does not need to be saved if it was not modified)
bert_model.save_pretrained('./Fine_tune_BERT/')
# Load
bert_model = TFBertModel.from_pretrained('./Fine_tune_BERT/')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
Visualizing attention
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def showAttention(input_sentence, output_words, attentions):
    # Set up the figure with a colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)
    # Set up the axis labels
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)
    # Show a label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()

def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)