# lfm_complete_code / data_preparation.py
# Uploaded to the Hugging Face Hub via huggingface_hub (commit 0446288).
# data_preparation.py
import json
import os
from pathlib import Path
import pandas as pd
from typing import List, Dict, Tuple
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
class KokoroChatProcessor:
    """Convert raw KokoroChat counseling-dialogue JSON files into
    instruction-tuning examples and train/val/test splits.

    Typical usage:
        processor = KokoroChatProcessor('KokoroChat/data')
        processor.load_all_conversations()
        processor.create_training_examples()
        train, val, test = processor.prepare_for_finetuning()
    """

    def __init__(self, data_path: str):
        self.data_path = Path(data_path)      # root dir, searched recursively for *.json
        self.conversations: List[Dict] = []   # raw conversation dicts as loaded from disk
        self.processed_data: List[Dict] = []  # flat list of training-example dicts

    def load_all_conversations(self) -> List[Dict]:
        """Load every JSON file under ``data_path`` (recursive glob).

        Best-effort: a file that cannot be read or parsed is reported and
        skipped instead of aborting the whole run.

        Returns:
            The accumulated list of raw conversation dicts.
        """
        json_files = list(self.data_path.glob("**/*.json"))
        print(f"Found {len(json_files)} conversation files")
        for json_file in tqdm(json_files, desc="Loading conversations"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    self.conversations.append(json.load(f))
            except (OSError, json.JSONDecodeError) as e:
                # Narrow catch: only I/O and parse failures are expected here.
                print(f"Error loading {json_file}: {e}")
        return self.conversations

    def create_training_examples(self) -> List[Dict]:
        """Turn each (client utterance -> counselor reply) pair into a
        supervised example.

        Fixes vs. the previous version:
          * The old code paired a counselor message with the *following*
            client message, so the target was the counselor turn spoken
            BEFORE the client's input. Each client turn is now paired with
            the counselor reply that follows it, matching the instruction.
          * The old context was built from ``dialogue[:i+1]``, which
            included the target counselor utterance itself (answer leaked
            into the prompt). Context now ends before the client turn
            being answered.

        Returns:
            The accumulated list of training-example dicts.
        """
        for conv_data in tqdm(self.conversations, desc="Processing conversations"):
            dialogue = conv_data.get('dialogue', [])
            topic = conv_data.get('topic', {}) or {}
            review = conv_data.get('review_by_client_jp', {})

            for i in range(len(dialogue) - 1):
                client_msg = dialogue[i]
                counselor_msg = dialogue[i + 1]
                # Only adjacent client -> counselor pairs become examples.
                if client_msg.get('role') != 'client' or counselor_msg.get('role') != 'counselor':
                    continue
                # History strictly before the client turn being answered, so
                # neither the input nor the target leaks into the context.
                context = self._build_context(dialogue[:i])
                self.processed_data.append({
                    'instruction': "あなたは共感的で専門的な心理カウンセラーです。クライアントの悩みに寄り添い、適切なサポートを提供してください。",
                    'input': f"クライアント: {client_msg['utterance']}",
                    'output': counselor_msg['utterance'],
                    'context': context,
                    'topic': topic.get('main_jp', ''),
                    'quality_score': self._calculate_quality_score(review),
                })
        return self.processed_data

    def _build_context(self, dialogue_history: List[Dict], max_turns: int = 5) -> str:
        """Render the last ``max_turns`` exchanges (2 messages each) as
        newline-joined '<role>: <utterance>' lines."""
        start_idx = max(0, len(dialogue_history) - max_turns * 2)
        context_parts = [
            f"{'カウンセラー' if msg['role'] == 'counselor' else 'クライアント'}: {msg['utterance']}"
            for msg in dialogue_history[start_idx:]
        ]
        return "\n".join(context_parts)

    def _calculate_quality_score(self, review: Dict) -> float:
        """Normalize the client's review score ('点数', assumed out of 100)
        to [0, 1]; return the neutral 0.5 when no score is available."""
        if not review or review.get('点数') is None:
            return 0.5  # default middle score for unreviewed sessions
        return review['点数'] / 100.0

    def prepare_for_finetuning(self, test_size: float = 0.1, val_size: float = 0.1):
        """Filter to high-quality examples (score > 0.6), split, and format
        as Alpaca-style Japanese prompts.

        Args:
            test_size: Fraction held out for the test set.
            val_size: Fraction of the *remaining* training data held out for
                validation (effective val share is slightly below
                ``val_size`` of the whole).

        Returns:
            (train, val, test) lists of ``{'text': prompt}`` dicts.
        """
        high_quality = [ex for ex in self.processed_data if ex['quality_score'] > 0.6]
        print(f"Selected {len(high_quality)} high-quality examples")

        # Fixed seed so the splits are reproducible across runs.
        train_data, test_data = train_test_split(high_quality, test_size=test_size, random_state=42)
        train_data, val_data = train_test_split(train_data, test_size=val_size, random_state=42)

        def format_example(ex: Dict) -> Dict[str, str]:
            # The final section ("### 応答:") is the generation target.
            prompt = f"""### 指示:
{ex['instruction']}
### コンテキスト:
{ex['context']}
### 入力:
{ex['input']}
### 応答:
{ex['output']}"""
            return {'text': prompt}

        return (
            [format_example(ex) for ex in train_data],
            [format_example(ex) for ex in val_data],
            [format_example(ex) for ex in test_data],
        )
def main() -> None:
    """Run the full pipeline: load conversations, build examples, split,
    and pickle the formatted train/val/test sets to processed_data.pkl."""
    processor = KokoroChatProcessor('KokoroChat/data')
    processor.load_all_conversations()
    processor.create_training_examples()
    train_data, val_data, test_data = processor.prepare_for_finetuning()

    # Persist the formatted splits for the fine-tuning stage.
    import pickle
    with open('processed_data.pkl', 'wb') as f:
        pickle.dump({
            'train': train_data,
            'val': val_data,
            'test': test_data
        }, f)

    print(f"Training examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")
    print(f"Test examples: {len(test_data)}")


# Entry-point guard so importing this module does not trigger the pipeline.
if __name__ == "__main__":
    main()