# lfm_complete_code / data_preparation.py
# Uploaded to the Hugging Face Hub via huggingface_hub (commit 0446288).
# data_preparation.py
import json
import os
from pathlib import Path
import pandas as pd
from typing import List, Dict, Tuple
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
class KokoroChatProcessor:
    """Convert raw KokoroChat counseling-dialogue JSON files into
    instruction-tuning examples and train/val/test splits.

    Typical usage:
        processor = KokoroChatProcessor('KokoroChat/data')
        processor.load_all_conversations()
        processor.create_training_examples()
        train, val, test = processor.prepare_for_finetuning()
    """

    def __init__(self, data_path: str):
        self.data_path = Path(data_path)      # root dir, searched recursively for *.json
        self.conversations: List[Dict] = []   # raw conversation dicts as loaded from disk
        self.processed_data: List[Dict] = []  # flat list of training-example dicts

    def load_all_conversations(self) -> List[Dict]:
        """Load every JSON file under ``data_path`` (recursive glob).

        Best-effort: a file that cannot be read or parsed is reported and
        skipped instead of aborting the whole run.

        Returns:
            The accumulated list of raw conversation dicts.
        """
        json_files = list(self.data_path.glob("**/*.json"))
        print(f"Found {len(json_files)} conversation files")
        for json_file in tqdm(json_files, desc="Loading conversations"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    self.conversations.append(json.load(f))
            except (OSError, json.JSONDecodeError) as e:
                # Narrow catch: only I/O and parse failures are expected here.
                print(f"Error loading {json_file}: {e}")
        return self.conversations

    def create_training_examples(self) -> List[Dict]:
        """Turn each (client utterance -> counselor reply) pair into a
        supervised example.

        Fixes vs. the previous version:
          * The old code paired a counselor message with the *following*
            client message, so the target was the counselor turn spoken
            BEFORE the client's input. Each client turn is now paired with
            the counselor reply that follows it, matching the instruction.
          * The old context was built from ``dialogue[:i+1]``, which
            included the target counselor utterance itself (answer leaked
            into the prompt). Context now ends before the client turn
            being answered.

        Returns:
            The accumulated list of training-example dicts.
        """
        for conv_data in tqdm(self.conversations, desc="Processing conversations"):
            dialogue = conv_data.get('dialogue', [])
            topic = conv_data.get('topic', {}) or {}
            review = conv_data.get('review_by_client_jp', {})

            for i in range(len(dialogue) - 1):
                client_msg = dialogue[i]
                counselor_msg = dialogue[i + 1]
                # Only adjacent client -> counselor pairs become examples.
                if client_msg.get('role') != 'client' or counselor_msg.get('role') != 'counselor':
                    continue
                # History strictly before the client turn being answered, so
                # neither the input nor the target leaks into the context.
                context = self._build_context(dialogue[:i])
                self.processed_data.append({
                    'instruction': "あなたは共感的で専門的な心理カウンセラーです。クライアントの悩みに寄り添い、適切なサポートを提供してください。",
                    'input': f"クライアント: {client_msg['utterance']}",
                    'output': counselor_msg['utterance'],
                    'context': context,
                    'topic': topic.get('main_jp', ''),
                    'quality_score': self._calculate_quality_score(review),
                })
        return self.processed_data

    def _build_context(self, dialogue_history: List[Dict], max_turns: int = 5) -> str:
        """Render the last ``max_turns`` exchanges (2 messages each) as
        newline-joined '<role>: <utterance>' lines."""
        start_idx = max(0, len(dialogue_history) - max_turns * 2)
        context_parts = [
            f"{'カウンセラー' if msg['role'] == 'counselor' else 'クライアント'}: {msg['utterance']}"
            for msg in dialogue_history[start_idx:]
        ]
        return "\n".join(context_parts)

    def _calculate_quality_score(self, review: Dict) -> float:
        """Normalize the client's review score ('点数', assumed out of 100)
        to [0, 1]; return the neutral 0.5 when no score is available."""
        if not review or review.get('点数') is None:
            return 0.5  # default middle score for unreviewed sessions
        return review['点数'] / 100.0

    def prepare_for_finetuning(self, test_size: float = 0.1, val_size: float = 0.1):
        """Filter to high-quality examples (score > 0.6), split, and format
        as Alpaca-style Japanese prompts.

        Args:
            test_size: Fraction held out for the test set.
            val_size: Fraction of the *remaining* training data held out for
                validation (effective val share is slightly below
                ``val_size`` of the whole).

        Returns:
            (train, val, test) lists of ``{'text': prompt}`` dicts.
        """
        high_quality = [ex for ex in self.processed_data if ex['quality_score'] > 0.6]
        print(f"Selected {len(high_quality)} high-quality examples")

        # Fixed seed so the splits are reproducible across runs.
        train_data, test_data = train_test_split(high_quality, test_size=test_size, random_state=42)
        train_data, val_data = train_test_split(train_data, test_size=val_size, random_state=42)

        def format_example(ex: Dict) -> Dict[str, str]:
            # The final section ("### 応答:") is the generation target.
            prompt = f"""### 指示:
{ex['instruction']}
### コンテキスト:
{ex['context']}
### 入力:
{ex['input']}
### 応答:
{ex['output']}"""
            return {'text': prompt}

        return (
            [format_example(ex) for ex in train_data],
            [format_example(ex) for ex in val_data],
            [format_example(ex) for ex in test_data],
        )
def main() -> None:
    """Run the full pipeline: load conversations, build examples, split,
    and pickle the formatted train/val/test sets to processed_data.pkl."""
    processor = KokoroChatProcessor('KokoroChat/data')
    processor.load_all_conversations()
    processor.create_training_examples()
    train_data, val_data, test_data = processor.prepare_for_finetuning()

    # Persist the formatted splits for the fine-tuning stage.
    import pickle
    with open('processed_data.pkl', 'wb') as f:
        pickle.dump({
            'train': train_data,
            'val': val_data,
            'test': test_data
        }, f)

    print(f"Training examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")
    print(f"Test examples: {len(test_data)}")


# Entry-point guard so importing this module does not trigger the pipeline.
if __name__ == "__main__":
    main()