# Hosting-page banner captured together with this file -- Spaces: Runtime error (status shown twice).
# Dataset usage survey/report graph (heading reconstructed from a mis-encoded Korean comment; confirm wording).
import json
import platform
import re

import pandas as pd
# Directory that holds the label JSON files.
file_path = './data/'


def _extract_dialogues(raw_dialogues):
    """Flatten raw dialogue records into ``{'text', 'emotion'}`` rows.

    The emotion label is read from ``record['profile']['emotion']['type']``
    and the utterances from ``record['talk']['content']``. The values of
    ``content`` are joined with a single space after dropping blank entries.

    Records that lack any of those keys are skipped, as are records whose
    joined text or emotion label is empty.

    Args:
        raw_dialogues: Iterable of dialogue records as parsed from the label
            JSON file.

    Returns:
        A list of dicts with the keys ``'text'`` and ``'emotion'``.
    """
    rows = []
    for record in raw_dialogues:
        # Only the key lookups can raise KeyError, so only they are guarded.
        try:
            emotion_type = record['profile']['emotion']['type']
            utterances = record['talk']['content']
        except KeyError:
            # Best-effort load: ignore records missing an expected key.
            continue

        full_text = " ".join(
            text for text in utterances.values() if text.strip()
        )
        if full_text and emotion_type:
            rows.append({'text': full_text, 'emotion': emotion_type})
    return rows


# --- Training data -------------------------------------------------
# Load the training label JSON file.
with open(file_path + 'training-label.json', 'r', encoding='utf-8') as file:
    training_data_raw = json.load(file)

# Keep only the fields that are needed and build the training frame.
# The explicit column list keeps 'text' / 'emotion' present even when no
# record survives extraction.
extracted_data = _extract_dialogues(training_data_raw)
df_train = pd.DataFrame(extracted_data, columns=['text', 'emotion'])

# Inspect the extracted training data.
print("--- 추출된 훈련 데이터프레임의 첫 5줄 ---")
print(df_train.head())
print("\n--- 데이터프레임 크기 ---")
print(f"훈련 데이터: {df_train.shape}")

# --- Validation data -----------------------------------------------
# Same pipeline as the training data, applied to the validation labels.
with open(file_path + 'validation-label.json', 'r', encoding='utf-8') as file:
    validation_data_raw = json.load(file)

extracted_val_data = _extract_dialogues(validation_data_raw)
df_val = pd.DataFrame(extracted_val_data, columns=['text', 'emotion'])

# Inspect the extracted validation data.
print("\n--- 추출된 검증 데이터프레임의 첫 5줄 ---")
print(df_val.head())
print("\n--- 검증 데이터프레임 크기 ---")
print(f"검증 데이터: {df_val.shape}")
# Appended below the existing code in main.py.
# -----------------------------------------------------------
# --- [Phase 1] Data exploration and preprocessing ---
# -----------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data exploration and visualisation
print("\n--- [Phase 1-1] 데이터 탐색 및 시각화 시작 ---")

# Use a Korean-capable font so the Hangul title and axis labels render
# (Windows: Malgun Gothic, macOS: AppleGothic). Any other platform keeps the
# original 'Malgun Gothic' setting.
plt.rcParams['font.family'] = (
    'AppleGothic' if platform.system() == 'Darwin' else 'Malgun Gothic'
)
# Keep the minus sign from rendering as a broken glyph with these fonts.
plt.rcParams['axes.unicode_minus'] = False

# Emotion label distribution of the training data; computed once and reused
# for both the printout and the bar order of the plot.
emotion_counts = df_train['emotion'].value_counts()
print("\n--- 훈련 데이터 감정 분포 ---")
print(emotion_counts)

# Horizontal count plot, most frequent emotion first.
plt.figure(figsize=(10, 6))
sns.countplot(data=df_train, y='emotion', order=emotion_counts.index)
plt.title('훈련 데이터 감정 분포 시각화', fontsize=15)
plt.xlabel('개수', fontsize=12)
plt.ylabel('감정', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()  # display the figure window
print("\n시각화 완료. 그래프 창을 닫으면 다음 단계가 진행됩니다.")
# 2. Text cleaning
print("\n--- [Phase 1-2] 텍스트 정제 시작 ---")

# Matches every character that is NOT a Hangul syllable (가-힣), an ASCII
# letter, an ASCII digit or a space. Compiled once because clean_text is
# applied row by row. (`re` is imported at the top of the file.)
_NON_TEXT_PATTERN = re.compile(r'[^가-힣a-zA-Z0-9 ]')


def clean_text(text):
    """Strip everything but Hangul, ASCII letters, digits and spaces.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters (punctuation, symbols, tabs,
        newlines, ...) removed. Nothing is inserted in their place.
    """
    return _NON_TEXT_PATTERN.sub('', text)
# Apply the cleaning function to the training and validation frames; the
# result is stored next to the raw text in a new 'cleaned_text' column.
df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_val['cleaned_text'] = df_val['text'].apply(clean_text)
print("텍스트 정제 완료.")
# Show raw and cleaned text side by side for a quick sanity check.
print(df_train[['text', 'cleaned_text']].head())