Spaces:

koons
/

emotion-chatbot

Runtime error

emotion-chatbot / notebooks /explore_data.py

kootaeng2

Initial commit with final, clean project files

e850536 5 months ago

4.66 kB

	# 데이터셋의 사용 조사 보고 그래프

	import pandas as pd
	import json
	import re

	# 파일 경로 설정
	file_path = './data/'

	# 훈련 라벨 JSON 파일 불러오기
	with open(file_path + 'training-label.json', 'r', encoding='utf-8') as file:
	training_data_raw = json.load(file)

	# 필요한 데이터만 추출하여 리스트에 저장
	extracted_data = []

	# 데이터는 리스트이므로 바로 순회합니다.
	for dialogue in training_data_raw:
	try:
	# 1. 감정 라벨 추출 (emotion 키는 profile 안에 있습니다)
	emotion_type = dialogue['profile']['emotion']['type']

	# 2. 대화 텍스트 추출 (talk 키 안에 content가 있습니다)
	dialogue_content = dialogue['talk']['content']

	# 3. 딕셔너리의 value들(텍스트)만 추출합니다.
	texts = list(dialogue_content.values())

	# 4. 모든 텍스트를 하나의 문자열로 합칩니다.
	# 빈 문자열을 제거하고 합치는 것이 좋습니다.
	full_text = " ".join([text for text in texts if text.strip()])

	# 5. 합쳐진 텍스트와 감정 라벨이 모두 유효할 경우에만 추가합니다.
	if full_text and emotion_type:
	extracted_data.append({'text': full_text, 'emotion': emotion_type})

	except KeyError:
	# 'profile', 'emotion', 'talk', 'content' 등의 키가 없는 항목은 건너뜁니다.
	continue

	# 새로운 데이터프레임 생성
	df_train = pd.DataFrame(extracted_data)

	# 6. 합쳐진 데이터 확인
	print("--- 추출된 훈련 데이터프레임의 첫 5줄 ---")
	print(df_train.head())

	print("\n--- 데이터프레임 크기 ---")
	print(f"훈련 데이터: {df_train.shape}")

	# 기존 훈련 데이터 로드 코드 아래에 이어서 작성해 주세요.
	# ------------------------------------------------------------------

	# 1. 검증 라벨 JSON 파일 불러오기
	with open(file_path + 'validation-label.json', 'r', encoding='utf-8') as file:
	validation_data_raw = json.load(file)

	# 2. 검증 데이터 추출
	extracted_val_data = []

	for dialogue in validation_data_raw:
	try:
	emotion_type = dialogue['profile']['emotion']['type']
	dialogue_content = dialogue['talk']['content']
	texts = list(dialogue_content.values())
	full_text = " ".join([text for text in texts if text.strip()])

	if full_text and emotion_type:
	extracted_val_data.append({'text': full_text, 'emotion': emotion_type})

	except KeyError:
	continue

	# 3. 새로운 데이터프레임 생성
	df_val = pd.DataFrame(extracted_val_data)

	# 4. 검증 데이터 확인
	print("\n--- 추출된 검증 데이터프레임의 첫 5줄 ---")
	print(df_val.head())

	print("\n--- 검증 데이터프레임 크기 ---")
	print(f"검증 데이터: {df_val.shape}")

	# main.py의 기존 코드 맨 아래에 이어서 작성합니다.
	# -----------------------------------------------------------
	# --- [Phase 1] 데이터 탐색 및 전처리 ---
	# -----------------------------------------------------------
	import matplotlib.pyplot as plt
	import seaborn as sns

	# 1. 데이터 탐색 및 시각화
	print("\n--- [Phase 1-1] 데이터 탐색 및 시각화 시작 ---")

	# 한글 폰트 설정 (Windows: Malgun Gothic, Mac: AppleGothic)
	plt.rcParams['font.family'] = 'Malgun Gothic'
	plt.rcParams['axes.unicode_minus'] = False # 마이너스 기호 깨짐 방지

	# 훈련 데이터의 감정 분포 확인
	print("\n--- 훈련 데이터 감정 분포 ---")
	print(df_train['emotion'].value_counts())

	# 감정 분포 시각화
	plt.figure(figsize=(10, 6))
	sns.countplot(data=df_train, y='emotion', order=df_train['emotion'].value_counts().index)
	plt.title('훈련 데이터 감정 분포 시각화', fontsize=15)
	plt.xlabel('개수', fontsize=12)
	plt.ylabel('감정', fontsize=12)
	plt.grid(axis='x', linestyle='--', alpha=0.7)
	plt.show() # 그래프 창 보여주기

	print("\n시각화 완료. 그래프 창을 닫으면 다음 단계가 진행됩니다.")

	# 2. 텍스트 정제
	print("\n--- [Phase 1-2] 텍스트 정제 시작 ---")
	# 이미 re 모듈은 위에서 import 했습니다.

	def clean_text(text):
	# 정규표현식을 사용하여 한글, 영어, 숫자, 공백을 제외한 모든 문자 제거
	return re.sub(r'[^가-힣a-zA-Z0-9 ]', '', text)

	# 훈련/검증 데이터에 정제 함수 적용
	df_train['cleaned_text'] = df_train['text'].apply(clean_text)
	df_val['cleaned_text'] = df_val['text'].apply(clean_text)

	print("텍스트 정제 완료.")
	print(df_train[['text', 'cleaned_text']].head())