Analyzing the FinBERT Tokenizer
Loading FinBERT
import torch
import torch.nn as nn
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

# Load the FinBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
tokenizer.vocab
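The vocabulary is a plain Python dict mapping token strings to integer ids. As a quick, minimal sanity check (the specific words are illustrative; their ids depend on the released vocabulary), you can inspect its size and look up a few finance-related entries:

# Inspect the vocabulary size and a few entries
print("Vocab size:", len(tokenizer.vocab))
for word in ["revenue", "earnings", "inflation"]:
    print(word, "->", tokenizer.vocab.get(word, "not in vocab"))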
Running the tokenizer and inspecting the tokens
# Tokenize the text and convert it to input tensors
# return_tensors='pt' returns PyTorch tensors
text = "The company's revenue exceeded expectations this quarter."
inputs = tokenizer(text, return_tensors='pt')

# Check the shapes
inputs.keys()
inputs['input_ids'].shape
inputs['input_ids']       # tensor of token ids
inputs['token_type_ids']  # tensor of segment ids
inputs['attention_mask']  # tensor marking real tokens vs. padding

# Print the input ids
print("Input IDs:", inputs['input_ids'])

# Convert the token ids back into tokens and print them
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0))
print("Tokens:", tokens)
Adding tokens to the tokenizer
# Add tokens only if they are not already in the vocabulary
new_tokens = ["new_token_1", "new_token_2", "new_token_3"]
for token in new_tokens:
    if token not in tokenizer.vocab:
        tokenizer.add_tokens([token])
    else:
        print(f"Token already exists: {token}")

# Print the tokenizer size after the additions
print(f"Total number of tokens after addition: {len(tokenizer)}")
Building a Model with FinBERT
Creating a FinBERT inference class. Its forward pass returns the latent variable from FinBERT's last hidden layer.
# Inference-only wrapper: returns the last-hidden-layer latent variable
class MyNeuralNetwork(nn.Module):
    def __init__(self, n_tokens):
        super(MyNeuralNetwork, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'yiyanghkust/finbert-tone',
            output_hidden_states=True, num_labels=3)
        for param in self.bert.parameters():
            param.requires_grad = False  # freeze the BERT parameters
        self.n_tokens = n_tokens

    def forward(self, x):
        x = self.bert(**x)
        # last hidden layer, last token position: shape (batch, hidden_size)
        x = x.hidden_states[-1][:, -1, :]
        return x

n_tokens = len(tokenizer)  # vocabulary size, including the tokens added above
model = MyNeuralNetwork(n_tokens)
model(inputs)
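Because the encoder is frozen, inference can also run under torch.no_grad(); the returned latent has shape (batch_size, hidden_size), i.e. (1, 768) here, assuming the BERT-base backbone used by finbert-tone:

# Frozen-encoder inference without gradient tracking
with torch.no_grad():
    latent = model(inputs)
print(latent.shape)  # torch.Size([1, 768])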
Next we build a simple LoRA-style model. First, the BERT parameters are frozen with param.requires_grad = False. In the bypass branch, a custom embedding is computed for the tokens and averaged, then concatenated with the BERT embedding. The result passes through an FFN to produce a 5-dimensional latent variable.
# LoRA-style model: frozen BERT plus a trainable bypass branch
class MyNeuralNetwork(nn.Module):
    def __init__(self, n_tokens):
        super(MyNeuralNetwork, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'yiyanghkust/finbert-tone',
            output_hidden_states=True, num_labels=3)
        self.output_dim = self.bert.config.hidden_size
        for param in self.bert.parameters():
            param.requires_grad = False  # freeze the BERT parameters
        self.n_tokens = n_tokens
        # bypass branch: trainable custom token embeddings
        self.layer1 = nn.Embedding(n_tokens, self.output_dim)
        # FFN mapping the concatenated features to a 5-dimensional latent
        self.layer2 = nn.Sequential(nn.Linear(2 * self.output_dim, 10),
                                    nn.ReLU(),
                                    nn.Linear(10, 5))

    def forward(self, x):
        z1 = self.bert(**x)
        z1 = z1.hidden_states[-1][:, -1, :]           # frozen BERT feature
        z2 = self.layer1(x['input_ids']).mean(dim=1)  # averaged custom embedding
        z = torch.cat((z1, z2), dim=1)                # merge the two branches
        z = self.layer2(z)
        return z
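Only the bypass embedding (layer1) and the FFN head (layer2) are trainable; everything inside BERT stays frozen. A quick sketch to verify the split:

# Count trainable vs. total parameters (only layer1/layer2 should train)
m = MyNeuralNetwork(len(tokenizer))
trainable = sum(p.numel() for p in m.parameters() if p.requires_grad)
total = sum(p.numel() for p in m.parameters())
print(f"trainable: {trainable:,} / total: {total:,}")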
Here are the example sentences.
# Example sentences
news_articles = ["The stock market surged today as investors reacted positively to the latest earnings reports.",
"Inflation rates continue to climb, raising concerns about the purchasing power of consumers.",
"The Federal Reserve announced a plan to increase interest rates in an effort to curb inflation.",
"Tech companies are leading the charge in the market, with several hitting all-time highs.",
"Global supply chain disruptions are causing significant delays and increasing costs for manufacturers.",
"Unemployment rates have fallen to their lowest levels since the start of the pandemic.",
"The housing market remains hot, with home prices continuing to rise at a rapid pace.",
"Analysts predict a slowdown in economic growth due to ongoing geopolitical tensions.",
"The cryptocurrency market experienced a sharp decline following new regulatory announcements.",
"Consumer confidence is at an all-time high, buoyed by strong job growth and wage increases."]
n = len(news_articles)
# random 5-class labels, used only to demonstrate fitting
target = torch.tensor(np.random.randint(0, 5, size=n), dtype=torch.int64)
Let's feed inputs through the model to check that it works.
# Check the model on a batch
inputs = tokenizer(news_articles, return_tensors='pt',
                   padding=True, truncation=True)
model = MyNeuralNetwork(n_tokens)
model(inputs).shape  # torch.Size([10, 5]): one 5-dim output per article
Now we fit the model.
learning_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
cross_entropy = nn.CrossEntropyLoss()
num_B = 100
loss_vec = np.zeros(num_B)
model.train()
val_vec = []
for i in range(num_B):
    optimizer.zero_grad()
    output = model(inputs)
    loss = cross_entropy(output, target)
    loss.backward()
    optimizer.step()
    loss_vec[i] = loss.item()  # record the training loss
    predicted = torch.argmax(output.detach(), 1)
    if i % 10 == 0:
        v = ((predicted == target).sum()).item() / len(target)
        val_vec.append(v)
print("Accuracy: {:.4f}".format(np.array(val_vec).mean()))