# prepare_dataset_multiturn.py
"""Build a multi-turn SFT dataset from KokoroChat counseling dialogues.

Extracts client -> counselor exchanges together with a sliding window of
preceding conversation turns, formats each example with the LFM2 ChatML
template, and saves a 90/10 train/test split to disk.
"""

import json
import os
from pathlib import Path

import pandas as pd
from datasets import Dataset, Features, Value


def parse_kokorochat_with_context(json_file_path, context_window=4, max_history_tokens=1500):
    """
    Parse one KokoroChat dialogue file into training examples with history.

    Args:
        json_file_path: Path to a single dialogue JSON file.
        context_window: Number of previous turns to include as history
            (default: 4, i.e. two client/counselor exchanges).
        max_history_tokens: Approximate token budget for the history text;
            examples whose history exceeds it are dropped to keep
            sequences short.

    Returns:
        Tuple of (list of example dicts, client review score for the file).
        Unreadable or malformed JSON files yield ([], 0) — best-effort by
        design, since the caller iterates thousands of files.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Narrowed from a blanket `except Exception`: only I/O and JSON
        # decoding problems are expected and safely skippable here.
        return [], 0

    conversations = []
    dialogue = data.get('dialogue', [])

    # Client-side review score; used downstream as the quality filter.
    review_en = data.get('review_by_client_en', {})
    total_score = review_en.get('score', 0)

    # Topic metadata (main topic has an English field, subtopic does not).
    topic = data.get('topic', {})
    main_topic = topic.get('main_en', '')
    sub_topic = topic.get('sub', '')

    # Walk adjacent turn pairs looking for client -> counselor exchanges.
    for i in range(len(dialogue) - 1):
        current = dialogue[i]
        next_turn = dialogue[i + 1]

        if current['role'] == 'client' and next_turn['role'] == 'counselor':
            client_msg = current['utterance'].strip()
            counselor_msg = next_turn['utterance'].strip()

            # Skip degenerate, near-empty turns.
            if len(client_msg) > 5 and len(counselor_msg) > 5:
                # History = up to `context_window` turns before the pair.
                start_idx = max(0, i - context_window)
                history = dialogue[start_idx:i]

                # Rough heuristic: ~3 characters per token for Japanese.
                history_text = ''.join(h['utterance'] for h in history)
                if len(history_text) < max_history_tokens * 3:
                    conversations.append({
                        'history': history,
                        'client': client_msg,
                        'counselor': counselor_msg,
                        'quality_score': total_score,
                        'topic_main': main_topic,
                        'topic_sub': sub_topic,
                        'dialogue_id': Path(json_file_path).stem,
                    })

    return conversations, total_score


def format_conversation_for_lfm2(conversation):
    """
    Format one example (history + current exchange) as an LFM2 ChatML string.

    The system prompt and all ChatML control tokens are emitted exactly as
    required by the LFM2 chat template; the final assistant turn is closed
    with <|endoftext|> to mark the training target boundary.
    """
    # Collect parts and join once instead of repeated string concatenation.
    parts = [
        "<|im_start|>system\n",
        "あなたは経験豊富な心理カウンセラーです。クライアントの話を傾聴し、共感的で支援的な応答をしてください。<|im_end|>\n",
    ]

    # Replay the conversation history as alternating user/assistant turns.
    for turn in conversation['history']:
        if turn['role'] == 'client':
            parts.append(f"<|im_start|>user\n{turn['utterance']}<|im_end|>\n")
        elif turn['role'] == 'counselor':
            parts.append(f"<|im_start|>assistant\n{turn['utterance']}<|im_end|>\n")

    # Current exchange — the pair the model is actually trained on.
    parts.append(f"<|im_start|>user\n{conversation['client']}<|im_end|>\n")
    parts.append(f"<|im_start|>assistant\n{conversation['counselor']}<|im_end|><|endoftext|>")

    return ''.join(parts)


def create_training_dataset_multiturn(
    data_dir="./KokoroChat/data",
    min_score=70,
    context_window=4
):
    """
    Create the multi-turn training dataset and save it to disk.

    Args:
        data_dir: Directory containing the dialogue JSON files (searched
            recursively).
        min_score: Minimum client review score (0-100; recommend 85 for
            top quality).
        context_window: Number of previous turns to include per example.

    Returns:
        A DatasetDict with 'train'/'test' splits, or None when no
        conversations pass the quality filter.
    """
    json_files = list(Path(data_dir).rglob("*.json"))
    print(f"Found {len(json_files)} JSON files")

    all_conversations = []
    score_distribution = []

    print("\nProcessing files with multi-turn context...")
    for idx, json_file in enumerate(json_files):
        if idx % 1000 == 0:
            print(f"Processed {idx}/{len(json_files)} files...")

        try:
            convs, score = parse_kokorochat_with_context(
                json_file, context_window=context_window
            )
            score_distribution.append(score)
            if score >= min_score:
                all_conversations.extend(convs)
        except Exception:
            # Best-effort: a single malformed file must not abort the run.
            continue

    print(f"\n=== Processing Results ===")
    print(f"High-quality files (>= {min_score}): "
          f"{sum(1 for s in score_distribution if s >= min_score)}")
    print(f"Total conversation examples: {len(all_conversations)}")

    if len(all_conversations) == 0:
        print(f"❌ No conversations found! Try lowering min_score (current: {min_score})")
        return None

    # Format every example for LFM2.
    formatted_data = []
    for conv in all_conversations:
        formatted_text = format_conversation_for_lfm2(conv)
        formatted_data.append({
            'text': formatted_text,
            'quality_score': conv['quality_score'],
            'topic_main': conv['topic_main'],
            'topic_sub': conv['topic_sub'],
            'has_context': len(conv['history']) > 0,
        })

    # Explicit schema keeps column dtypes stable across runs.
    features = Features({
        'text': Value('string'),
        'quality_score': Value('int64'),
        'topic_main': Value('string'),
        'topic_sub': Value('string'),
        'has_context': Value('bool'),
    })

    df = pd.DataFrame(formatted_data)
    dataset = Dataset.from_pandas(df, features=features)
    dataset = dataset.train_test_split(test_size=0.1, seed=42)

    print(f"\n=== Final Dataset ===")
    print(f"Training samples: {len(dataset['train'])}")
    print(f"Validation samples: {len(dataset['test'])}")
    print(f"Examples with context: {sum(df['has_context'])}")

    dataset.save_to_disk("./kokorochat_processed_multiturn")
    print("\n✅ Multi-turn dataset saved to ./kokorochat_processed_multiturn")

    # Show a sample; clamp the index so tiny datasets don't raise IndexError
    # (the original hard-coded index 5).
    if len(dataset['train']) > 0:
        print("\n=== Sample Training Example (with context) ===")
        sample = dataset['train'][min(5, len(dataset['train']) - 1)]['text']
        print(sample[:1000] + "\n..." if len(sample) > 1000 else sample)

    return dataset


if __name__ == "__main__":
    dataset = create_training_dataset_multiturn(
        data_dir="./KokoroChat/kokorochat_dialogues",
        min_score=60,       # quality threshold (docstring recommends 85 for top quality)
        context_window=4    # include 4 previous turns
    )