Spaces:
Runtime error
Runtime error
File size: 4,658 Bytes
e850536 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# Dataset usage survey: report and graphs
import pandas as pd
import json
import re
# File path settings
file_path = './data/'


def _extract_emotion_records(dialogues):
    """Flatten raw label entries into ``{'text', 'emotion'}`` records.

    The emotion label is read from ``entry['profile']['emotion']['type']``
    and the utterances from the values of ``entry['talk']['content']``.

    Args:
        dialogues: Iterable of decoded JSON entries (the label files are
            iterated directly as a list).

    Returns:
        A list of dicts with the keys ``'text'`` (every non-blank utterance
        joined by a single space) and ``'emotion'``. Entries that lack the
        expected keys or structure, or whose joined text or label is empty,
        are skipped.
    """
    records = []
    for dialogue in dialogues:
        try:
            # 1. The emotion label lives under the 'profile' key.
            emotion_type = dialogue['profile']['emotion']['type']
            # 2. The utterances are the values of the dict under 'talk'.
            utterances = dialogue['talk']['content'].values()
        except (KeyError, TypeError, AttributeError):
            # A missing key or an unexpected shape (e.g. a null node) must
            # not abort the whole load: skip just this entry.
            continue

        # 3. Merge all utterances into one string, dropping blank ones.
        #    Non-string values are ignored rather than crashing on .strip().
        full_text = " ".join(
            text for text in utterances
            if isinstance(text, str) and text.strip()
        )

        # 4. Keep the entry only when both text and label are non-empty.
        if full_text and emotion_type:
            records.append({'text': full_text, 'emotion': emotion_type})
    return records


# Load the training label JSON file
with open(file_path + 'training-label.json', 'r', encoding='utf-8') as file:
    training_data_raw = json.load(file)

# Keep only the fields that are needed
extracted_data = _extract_emotion_records(training_data_raw)

# Explicit columns keep the frame usable even if nothing was extracted
df_train = pd.DataFrame(extracted_data, columns=['text', 'emotion'])

# Inspect the merged training data
print("--- 추출된 훈련 데이터프레임의 첫 5줄 ---")
print(df_train.head())
print("\n--- 데이터프레임 크기 ---")
print(f"훈련 데이터: {df_train.shape}")

# ------------------------------------------------------------------
# Validation data: same procedure as for the training labels
with open(file_path + 'validation-label.json', 'r', encoding='utf-8') as file:
    validation_data_raw = json.load(file)

extracted_val_data = _extract_emotion_records(validation_data_raw)
df_val = pd.DataFrame(extracted_val_data, columns=['text', 'emotion'])

# Inspect the validation data
print("\n--- 추출된 검증 데이터프레임의 첫 5줄 ---")
print(df_val.head())
print("\n--- 검증 데이터프레임 크기 ---")
print(f"검증 데이터: {df_val.shape}")
# Continues below the existing data-loading code of main.py.
# -----------------------------------------------------------
# --- [Phase 1] Data exploration and preprocessing ---
# -----------------------------------------------------------
import platform

import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data exploration and visualisation
print("\n--- [Phase 1-1] 데이터 탐색 및 시각화 시작 ---")

# Korean-capable font (Windows: Malgun Gothic, macOS: AppleGothic).
# Every platform other than macOS keeps the original Malgun Gothic setting.
plt.rcParams['font.family'] = (
    'AppleGothic' if platform.system() == 'Darwin' else 'Malgun Gothic'
)
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a broken glyph

# Emotion distribution of the training data (computed once, reused for the plot order)
emotion_counts = df_train['emotion'].value_counts()
print("\n--- 훈련 데이터 감정 분포 ---")
print(emotion_counts)

# Horizontal bar chart of the emotion distribution, most frequent label first
plt.figure(figsize=(10, 6))
sns.countplot(data=df_train, y='emotion', order=emotion_counts.index)
plt.title('훈련 데이터 감정 분포 시각화', fontsize=15)
plt.xlabel('개수', fontsize=12)
plt.ylabel('감정', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()  # display the plot window

print("\n시각화 완료. 그래프 창을 닫으면 다음 단계가 진행됩니다.")
# 2. Text cleaning
print("\n--- [Phase 1-2] 텍스트 정제 시작 ---")
# The re module is already imported at the top of the file.
# Whitelist: precomposed Hangul syllables (가-힣), ASCII letters, digits and
# the plain space character. Compiled once because it is applied per row.
_NON_TEXT_PATTERN = re.compile(r'[^가-힣a-zA-Z0-9 ]')


def clean_text(text):
    """Remove every character that is not Hangul, English, a digit or a space.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters deleted. Punctuation, symbols,
        tabs and newlines are removed, and so are standalone Hangul jamo
        (e.g. ``ㅋ``), because they fall outside the ``가-힣`` range.
        Nothing is inserted in place of a removed character.
    """
    return _NON_TEXT_PATTERN.sub('', text)
# Apply the cleaning function to the training and validation data
df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_val['cleaned_text'] = df_val['text'].apply(clean_text)
print("텍스트 정제 완료.")
# Show the raw and cleaned text side by side for a quick sanity check
print(df_train[['text', 'cleaned_text']].head())