# data_preparation.py
import json
import os
import pickle
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
class KokoroChatProcessor:
    """Load KokoroChat counseling dialogues and convert them into
    instruction-tuning examples with quality-filtered train/val/test splits.

    Pipeline: ``load_all_conversations`` -> ``create_training_examples``
    -> ``prepare_for_finetuning``.
    """

    def __init__(self, data_path: str):
        self.data_path = Path(data_path)
        self.conversations: List[Dict] = []   # raw per-file JSON objects
        self.processed_data: List[Dict] = []  # flattened training examples

    def load_all_conversations(self) -> List[Dict]:
        """Load every ``*.json`` file under ``data_path`` (recursively).

        Unreadable or malformed files are reported and skipped so a single
        corrupt file does not abort the whole run.

        Returns:
            The accumulated list of raw conversation dicts.
        """
        json_files = list(self.data_path.glob("**/*.json"))
        print(f"Found {len(json_files)} conversation files")
        for json_file in tqdm(json_files, desc="Loading conversations"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    self.conversations.append(json.load(f))
            except (OSError, json.JSONDecodeError) as e:
                # Best-effort load: report and continue with the rest.
                print(f"Error loading {json_file}: {e}")
        return self.conversations

    def create_training_examples(self) -> List[Dict]:
        """Convert loaded conversations into (client -> counselor) pairs.

        Fixes two defects in the previous implementation:
        * It paired the counselor message at index ``i`` (the *earlier*
          turn) as the "output" for the client message at ``i + 1`` (the
          *later* turn), i.e. the supervision was temporally inverted.
        * The context was built from ``dialogue[:i+1]``, which included the
          target counselor utterance itself — label leakage.

        Now each client turn is paired with the counselor turn that
        immediately FOLLOWS it, and the context stops strictly before the
        client turn.

        Returns:
            The accumulated list of training-example dicts.
        """
        for conv_data in tqdm(self.conversations, desc="Processing conversations"):
            dialogue = conv_data.get('dialogue', [])
            topic = conv_data.get('topic', {})
            review = conv_data.get('review_by_client_jp', {})
            # Scan every adjacent pair; keep only client -> counselor turns.
            for i in range(len(dialogue) - 1):
                client_msg = dialogue[i]
                counselor_msg = dialogue[i + 1]
                if client_msg.get('role') == 'client' and counselor_msg.get('role') == 'counselor':
                    # History strictly before the client turn — never contains the target.
                    context = self._build_context(dialogue[:i])
                    self.processed_data.append({
                        'instruction': "あなたは共感的で専門的な心理カウンセラーです。クライアントの悩みに寄り添い、適切なサポートを提供してください。",
                        'input': f"クライアント: {client_msg['utterance']}",
                        'output': counselor_msg['utterance'],
                        'context': context,
                        'topic': topic.get('main_jp', ''),
                        'quality_score': self._calculate_quality_score(review),
                    })
        return self.processed_data

    def _build_context(self, dialogue_history: List[Dict], max_turns: int = 5) -> str:
        """Render the last ``max_turns`` exchange pairs as "role: utterance" lines."""
        start_idx = max(0, len(dialogue_history) - max_turns * 2)
        context_parts = [
            f"{'カウンセラー' if msg['role'] == 'counselor' else 'クライアント'}: {msg['utterance']}"
            for msg in dialogue_history[start_idx:]
        ]
        return "\n".join(context_parts)

    def _calculate_quality_score(self, review: Dict) -> float:
        """Normalize the client review score ('点数', assumed 0-100) to [0, 1].

        Returns 0.5 (neutral) when no review or no score is present.
        """
        if not review or review.get('点数') is None:
            return 0.5  # Default middle score
        # Key is known present and non-None here; normalize assuming max 100.
        return review['点数'] / 100.0

    def prepare_for_finetuning(
        self, test_size: float = 0.1, val_size: float = 0.1
    ) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        """Filter high-quality examples and produce formatted splits.

        Args:
            test_size: fraction of high-quality examples held out for test.
            val_size: fraction of the remaining train pool held out for
                validation (note: a fraction of *train*, not of the total).

        Returns:
            (train, val, test) lists of ``{'text': prompt}`` dicts.
        """
        # Keep only examples the client rated above 60/100.
        high_quality = [ex for ex in self.processed_data if ex['quality_score'] > 0.6]
        print(f"Selected {len(high_quality)} high-quality examples")
        # Fixed seeds keep the splits reproducible across runs.
        train_data, test_data = train_test_split(high_quality, test_size=test_size, random_state=42)
        train_data, val_data = train_test_split(train_data, test_size=val_size, random_state=42)

        def format_example(ex: Dict) -> Dict:
            # Alpaca-style Japanese prompt; sections: 指示/コンテキスト/入力/応答.
            prompt = f"""### 指示:
{ex['instruction']}
### コンテキスト:
{ex['context']}
### 入力:
{ex['input']}
### 応答:
{ex['output']}"""
            return {'text': prompt}

        return (
            [format_example(ex) for ex in train_data],
            [format_example(ex) for ex in val_data],
            [format_example(ex) for ex in test_data],
        )
def main() -> None:
    """Run the end-to-end KokoroChat data-preparation pipeline.

    Loads raw conversations, builds training examples, creates the
    train/val/test splits, and pickles them to ``processed_data.pkl``.
    """
    processor = KokoroChatProcessor('KokoroChat/data')
    processor.load_all_conversations()
    processor.create_training_examples()
    train_data, val_data, test_data = processor.prepare_for_finetuning()

    # Persist the formatted splits for the fine-tuning stage.
    # (pickle is fine here: the data is produced by this script, not untrusted.)
    with open('processed_data.pkl', 'wb') as f:
        pickle.dump({
            'train': train_data,
            'val': val_data,
            'test': test_data,
        }, f)

    print(f"Training examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")
    print(f"Test examples: {len(test_data)}")


# Guarded entry point: previously this I/O-heavy pipeline ran on import.
if __name__ == "__main__":
    main()
|