emotion-chatbot / scripts /train_model.py
kootaeng2
Initial commit with final, clean project files
e850536
# train_model.py
# Script that trains the AI model. It is reusable -- do not delete.
import pandas as pd
import json
import re
import sys
import transformers
import torch
from transformers import AutoTokenizer
# --- 1. Data loading and preprocessing ---
print("--- [Phase 1] 데이터 로딩 및 전처리 시작 ---")

# Directory that holds the label files. The trailing slash matters because
# the file names are appended by plain string concatenation below.
file_path = './data/'

# Load the raw training / validation label files (UTF-8 encoded JSON).
with open(file_path + 'training-label.json', 'r', encoding='utf-8') as file:
    training_data_raw = json.load(file)
with open(file_path + 'validation-label.json', 'r', encoding='utf-8') as file:
    validation_data_raw = json.load(file)
# DataFrame construction (wrapped in a function to keep the script tidy)
def create_dataframe(data_raw):
    """Flatten raw dialogue records into a ``text`` / ``emotion`` DataFrame.

    For every record the emotion label is read from
    ``record['profile']['emotion']['type']`` and the utterances from the
    values of ``record['talk']['content']``; the utterances are joined with
    single spaces in the mapping's own order.

    Records that lack any of those keys, or whose joined text or emotion
    label is empty, are skipped.

    Args:
        data_raw: Iterable of dialogue records (nested dicts).

    Returns:
        A ``pandas.DataFrame`` with exactly the columns ``text`` and
        ``emotion``. The columns are present even when no record qualifies,
        so callers can always index them.
    """
    extracted_data = []
    for dialogue in data_raw:
        # Only the key lookups are guarded; nothing else here raises KeyError.
        try:
            emotion_type = dialogue['profile']['emotion']['type']
            dialogue_content = dialogue['talk']['content']
        except KeyError:
            # Incomplete record: skip it instead of aborting the whole load.
            continue
        full_text = " ".join(dialogue_content.values())
        if full_text and emotion_type:
            extracted_data.append({'text': full_text, 'emotion': emotion_type})
    # Naming the columns keeps the schema stable for empty input; without it
    # an empty list yields a column-less frame and df['text'] raises KeyError.
    return pd.DataFrame(extracted_data, columns=['text', 'emotion'])
# Flatten both raw splits with the same helper: training first, then validation.
df_train = create_dataframe(data_raw=training_data_raw)
df_val = create_dataframe(data_raw=validation_data_raw)
# Text cleaning
def clean_text(text):
    """Strip every character that is not Hangul, an ASCII letter, a digit or a space.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all characters outside ``가-힣``, ``a-z``, ``A-Z``,
        ``0-9`` and the plain space removed. Removed characters are dropped,
        not replaced, so surrounding spaces are left as they were.
    """
    # The Hangul syllable range must be spelled with the real code points
    # (U+AC00 to U+D7A3). A mis-encoded copy of this range is not a valid
    # character range and makes re.sub raise re.error.
    return re.sub(r'[^가-힣a-zA-Z0-9 ]', '', text)
# Store a cleaned copy of every dialogue text next to the raw text in both splits.
df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_val['cleaned_text'] = df_val['text'].apply(clean_text)
print("✅ 데이터 로딩 및 전처리 완료!")
# --- 2. Modelling preparation ---
print("\n--- [Phase 2] AI 모델링 준비 시작 ---")

# Load the tokenizer that belongs to the pretrained checkpoint.
MODEL_NAME = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize both splits up front: truncated to at most 128 tokens, padded,
# and returned as PyTorch tensors.
train_tokenized = tokenizer(list(df_train['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)
val_tokenized = tokenizer(list(df_val['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)

# Label encoding: the id of a label is its position in the sorted list of
# emotions that occur in the training split.
unique_labels = sorted(df_train['emotion'].unique())
label_to_id = {label: label_id for label_id, label in enumerate(unique_labels)}
id_to_label = {label_id: label for label, label_id in label_to_id.items()}

# Fail fast on validation emotions the training split never saw. Series.map
# would turn them into NaN, which silently makes the whole label column
# float and only breaks much later, when the labels are fed to the model.
unseen_labels = set(df_val['emotion']) - set(label_to_id)
if unseen_labels:
    raise ValueError(
        "validation data contains emotion labels missing from the training data: "
        f"{sorted(unseen_labels, key=str)}"
    )

df_train['label'] = df_train['emotion'].map(label_to_id)
df_val['label'] = df_val['emotion'].map(label_to_id)
print("✅ 토큰화 및 라벨 인코딩 완료!")
print("이제 모델 훈련을 위한 모든 준비가 끝났습니다.")
# NOTE: the code below replaces the previous [Phase 3] implementation.
# -----------------------------------------------------------
# --- [Phase 3] Model training and evaluation (minimal-feature version) ---
# -----------------------------------------------------------
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
print("\n--- [Phase 3] 모델 학습 및 평가 시작 ---")
# 1. PyTorch Dataset that pairs the tokenized inputs with their labels
class EmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: Mapping from feature name to a tensor that is indexed by
            sample position (in this script: the tokenizer output created
            with ``return_tensors="pt"``).
        labels: Sequence of class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus a ``labels`` tensor."""
        # clone().detach() hands out an independent copy, so changes made to
        # a returned item never write through to the stored encodings.
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        # Class ids must be int64. With the dtype left to inference, labels
        # that arrive as floats (e.g. from a pandas column promoted to
        # float64) would become a float tensor, which classification losses
        # do not accept as class indices.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)
# Wrap the tokenized splits and their encoded labels as PyTorch datasets.
train_dataset = EmotionDataset(train_tokenized, df_train['label'].tolist())
val_dataset = EmotionDataset(val_tokenized, df_val['label'].tolist())
print("✅ PyTorch 데이터셋 생성이 완료되었습니다.")

# 2. Load the pretrained model with a classification head sized to the label
#    set; the id <-> label maps are handed over so they become part of the
#    model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"✅ 모델 로딩 완료! 모델은 {device}에서 실행됩니다.")
# 3. Metric function used to evaluate model performance
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (the reference class ids) and
            ``predictions`` (per-class scores). The arg-max over the last
            axis of ``predictions`` is taken as the predicted class.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        The last three are support-weighted averages, with undefined values
        reported as 0.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(
        y_true,
        y_pred,
        average='weighted',
        zero_division=0,
    )
    # The fourth element (support) is not reported.
    weighted_precision, weighted_recall, weighted_f1 = scores[:3]
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_f1,
        'precision': weighted_precision,
        'recall': weighted_recall,
    }
# 4. Training configuration. Only the essentials are set; every evaluation /
#    checkpoint option is left at the library default.
training_args = TrainingArguments(
    output_dir='./results',          # where the Trainer writes its output (required)
    num_train_epochs=3,              # number of passes over the training set
    per_device_train_batch_size=16,  # training batch size per device
)

# 5. Trainer. No eval_dataset is registered here: the validation split is
#    handed to evaluate() explicitly once training has finished.
#    compute_metrics IS registered, so that final evaluate() call reports
#    accuracy / precision / recall / F1 instead of the loss alone.
#    NOTE(review): this relies on the TrainingArguments defaults not
#    scheduling any evaluation during training -- confirm for the installed
#    transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

# 6. Train the model.
print("\n🔥 AI 모델 훈련을 시작합니다...")
trainer.train()
print("\n🎉 모델 훈련 완료!")

# Persist the final weights and the tokenizer into output_dir. Without this
# the folder only holds whatever periodic checkpoints the Trainer wrote,
# which need not include the state reached at the end of training.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

# 7. Final evaluation, run separately after training on the validation split.
print("\n--- 최종 모델 성능 평가 ---")
final_evaluation = trainer.evaluate(eval_dataset=val_dataset)
print(final_evaluation)
print("\n모든 과정이 성공적으로 끝났습니다! results 폴더에서 훈련된 모델을 확인하세요.")