# NOTE(review): removed scraper artifacts ("Spaces:" / "Runtime error" banners)
# that preceded the script and were not part of the Python source.
# train_model.py
# Script that trains the AI model; not yet refactored for reuse.
# (NOTE(review): original comments were mojibake-encoded Korean; translated.)
import pandas as pd
import json
import re
import sys
import transformers
import torch
from transformers import AutoTokenizer

# --- 1. Data loading & preprocessing ---
print("--- [Phase 1] λ°μ΄ν° λ‘λ© λ° μ μ²λ¦¬ μμ ---")

# Base directory holding the JSON label files.
file_path = './data/'

# Load the raw training / validation label files (same schema assumed).
with open(file_path + 'training-label.json', 'r', encoding='utf-8') as file:
    training_data_raw = json.load(file)
with open(file_path + 'validation-label.json', 'r', encoding='utf-8') as file:
    validation_data_raw = json.load(file)
# Helper that flattens the raw label JSON into a tidy DataFrame.
def create_dataframe(data_raw):
    """Build a DataFrame with 'text'/'emotion' columns from raw dialogue records.

    Records missing any expected key are skipped (KeyError is swallowed on
    purpose); records whose joined text or emotion type is empty are skipped
    as well.
    """
    rows = []
    for record in data_raw:
        try:
            label = record['profile']['emotion']['type']
            utterances = record['talk']['content']
            joined = " ".join(list(utterances.values()))
            if joined and label:
                rows.append({'text': joined, 'emotion': label})
        except KeyError:
            # Malformed record: silently drop it, matching best-effort intent.
            continue
    return pd.DataFrame(rows)
# Materialize the train / validation frames from the raw JSON structures.
df_train = create_dataframe(training_data_raw)
df_val = create_dataframe(validation_data_raw)
# --- Text cleaning ---
def clean_text(text):
    """Strip every character that is not a Hangul syllable (가-힣),
    an ASCII letter, a digit, or a space.

    Bug fix: the original pattern contained 'κ°-ν£' — the UTF-8 bytes of
    '가-힣' mis-decoded as ISO-8859-7 — so the character class did not
    cover the Hangul Syllables block (U+AC00-U+D7A3) and this function
    stripped ALL Korean text from the dataset. The intended range is
    restored here.
    """
    return re.sub(r'[^가-힣a-zA-Z0-9 ]', '', text)
# Apply the cleaning step to both splits and keep the result in a new column.
df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_val['cleaned_text'] = df_val['text'].apply(clean_text)
print("β λ°μ΄ν° λ‘λ© λ° μ μ²λ¦¬ μλ£!")
# --- 2. Model preparation ---
print("\n--- [Phase 2] AI λͺ¨λΈλ§ μ€λΉ μμ ---")

# Pretrained Korean encoder checkpoint used for both tokenizer and model.
MODEL_NAME = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize the cleaned text, padded/truncated to 128 tokens, as PyTorch tensors.
train_tokenized = tokenizer(list(df_train['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)
val_tokenized = tokenizer(list(df_val['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)

# Label encoding: sort for a stable order, then map names to contiguous ids.
unique_labels = sorted(df_train['emotion'].unique())
label_to_id = {name: idx for idx, name in enumerate(unique_labels)}
id_to_label = {idx: name for name, idx in label_to_id.items()}
df_train['label'] = df_train['emotion'].map(label_to_id)
df_val['label'] = df_val['emotion'].map(label_to_id)

print("β ν ν°ν λ° λΌλ²¨ μΈμ½λ© μλ£!")
print("μ΄μ λͺ¨λΈ νλ ¨μ μν λͺ¨λ μ€λΉκ° λλ¬μ΅λλ€.")
# NOTE: [Phase 3] below replaces the earlier placeholder code.
# -----------------------------------------------------------
# --- [Phase 3] Model training & evaluation (minimal version) ---
# -----------------------------------------------------------
import torch  # NOTE(review): redundant; torch is already imported at the top.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

print("\n--- [Phase 3] λͺ¨λΈ νμ΅ λ° νκ° μμ ---")
# 1. Thin PyTorch Dataset wrapper over tokenizer output + integer labels.
class EmotionDataset(torch.utils.data.Dataset):
    """Pairs each tokenized sample with its label for the HF Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Detach copies so the Trainer never shares storage with the originals.
        sample = {name: tensor[idx].clone().detach() for name, tensor in self.encodings.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap both tokenized splits for consumption by the Trainer.
train_dataset = EmotionDataset(train_tokenized, df_train['label'].tolist())
val_dataset = EmotionDataset(val_tokenized, df_val['label'].tolist())
print("β PyTorch λ°μ΄ν°μ μμ±μ΄ μλ£λμμ΅λλ€.")
# 2. Load the pretrained encoder with a fresh classification head sized to
#    the label set; wire up the id<->label maps for readable predictions.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Prefer GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"β λͺ¨λΈ λ‘λ© μλ£! λͺ¨λΈμ {device}μμ μ€νλ©λλ€.")
# 3. Metric function for Trainer evaluation.
def compute_metrics(pred):
    """Return accuracy plus weighted precision/recall/F1 for an EvalPrediction."""
    y_true = pred.label_ids
    # Class with the highest logit wins.
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
# 4. Minimal training configuration (all optional eval/save options removed).
training_args = TrainingArguments(
    output_dir='./results',           # where checkpoints are written (required)
    num_train_epochs=3,               # full passes over the training set
    per_device_train_batch_size=16,   # per-device training batch size
)
# 5. Trainer with in-loop evaluation deliberately disabled:
#    eval_dataset / compute_metrics are omitted here and evaluation is run
#    exactly once, explicitly, after training finishes.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
# 6. Train the model.
print("\nπ₯ AI λͺ¨λΈ νλ ¨μ μμν©λλ€...")
trainer.train()
print("\nπ λͺ¨λΈ νλ ¨ μλ£!")

# 7. Single post-hoc evaluation: the validation set (not registered on the
#    Trainer above) is passed directly to evaluate().
print("\n--- μ΅μ’ λͺ¨λΈ μ±λ₯ νκ° ---")
final_evaluation = trainer.evaluate(eval_dataset=val_dataset)
print(final_evaluation)
print("\nλͺ¨λ κ³Όμ μ΄ μ±κ³΅μ μΌλ‘ λλ¬μ΅λλ€! results ν΄λμμ νλ ¨λ λͺ¨λΈμ νμΈνμΈμ.")