# prepare_dataset_multiturn.py
import json
import os
from datasets import Dataset, Features, Value
import pandas as pd
from pathlib import Path
def parse_kokorochat_with_context(json_file_path, context_window=4, max_history_tokens=1500):
    """
    Parse one KokoroChat dialogue JSON file into (client, counselor)
    training pairs, each bundled with the preceding conversation history.

    Args:
        json_file_path: Path to a single dialogue JSON file.
        context_window: Number of previous turns to include as history
            (default: 4 = 2 client/counselor exchanges).
        max_history_tokens: Approximate token budget for the history
            (rough heuristic: ~3 characters per token for Japanese).

    Returns:
        Tuple of (conversations, total_score). `conversations` is a list of
        dicts with keys history/client/counselor/quality_score/topic_main/
        topic_sub/dialogue_id; `total_score` is the client's review score
        (0 when missing or when the file is unreadable).
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Unreadable or malformed file: skip it rather than abort the run.
        return [], 0

    dialogue = data.get('dialogue', [])

    # Overall quality score from the client's post-session review.
    total_score = data.get('review_by_client_en', {}).get('score', 0)

    # Topic metadata (main category in English, sub-category as stored).
    topic = data.get('topic', {})
    main_topic = topic.get('main_en', '')
    sub_topic = topic.get('sub', '')

    conversations = []
    # Hoist loop invariants: the stem and the char budget never change.
    dialogue_id = Path(json_file_path).stem
    history_char_budget = max_history_tokens * 3  # ~3 chars/token for Japanese

    for i in range(len(dialogue) - 1):
        current = dialogue[i]
        next_turn = dialogue[i + 1]

        # Only train on adjacent client -> counselor pairs; tolerate turns
        # with a missing 'role' key instead of raising KeyError.
        if current.get('role') != 'client' or next_turn.get('role') != 'counselor':
            continue

        client_msg = current['utterance'].strip()
        counselor_msg = next_turn['utterance'].strip()

        # Drop trivially short utterances (greetings, fillers).
        if len(client_msg) <= 5 or len(counselor_msg) <= 5:
            continue

        # Previous turns used as conversational context.
        start_idx = max(0, i - context_window)
        history = dialogue[start_idx:i]

        # Enforce a rough length cap so training sequences stay reasonable.
        history_chars = sum(len(h['utterance']) for h in history)
        if history_chars < history_char_budget:
            conversations.append({
                'history': history,
                'client': client_msg,
                'counselor': counselor_msg,
                'quality_score': total_score,
                'topic_main': main_topic,
                'topic_sub': sub_topic,
                'dialogue_id': dialogue_id,
            })

    return conversations, total_score
def format_conversation_for_lfm2(conversation):
    """
    Render one conversation example (history + target exchange) as a
    single LFM2 ChatML training string, terminated with <|endoftext|>.
    """
    role_map = {'client': 'user', 'counselor': 'assistant'}
    parts = [
        "<|im_start|>system\n"
        "あなたは経験豊富な心理カウンセラーです。クライアントの話を傾聴し、共感的で支援的な応答をしてください。<|im_end|>\n"
    ]
    # Replay prior turns so the model sees the dialogue context; turns
    # with any other role are skipped.
    for turn in conversation['history']:
        speaker = role_map.get(turn['role'])
        if speaker is not None:
            parts.append(f"<|im_start|>{speaker}\n{turn['utterance']}<|im_end|>\n")
    # The exchange actually being trained on; <|endoftext|> closes the sample.
    parts.append(f"<|im_start|>user\n{conversation['client']}<|im_end|>\n")
    parts.append(f"<|im_start|>assistant\n{conversation['counselor']}<|im_end|><|endoftext|>")
    return ''.join(parts)
def create_training_dataset_multiturn(
    data_dir="./KokoroChat/data",
    min_score=70,
    context_window=4
):
    """
    Build a multi-turn LFM2 training dataset from KokoroChat JSON files.

    Args:
        data_dir: Directory containing the dialogue JSON files (searched
            recursively).
        min_score: Minimum client review score (0-100) a file must reach to
            be included (docstring above recommends 85 for top quality).
        context_window: Number of previous turns to include as context.

    Returns:
        A train/test DatasetDict (90/10 split, seed 42), or None when no
        qualifying conversations were found. As a side effect, saves the
        dataset to ./kokorochat_processed_multiturn.
    """
    json_files = list(Path(data_dir).rglob("*.json"))
    print(f"Found {len(json_files)} JSON files")

    all_conversations = []
    score_distribution = []
    skipped = 0  # files that raised unexpectedly during parsing

    print("\nProcessing files with multi-turn context...")
    for idx, json_file in enumerate(json_files):
        if idx % 1000 == 0:
            print(f"Processed {idx}/{len(json_files)} files...")
        try:
            convs, score = parse_kokorochat_with_context(
                json_file,
                context_window=context_window
            )
        except Exception:
            # Defensive: one malformed file must not abort the whole run,
            # but count it instead of failing silently.
            skipped += 1
            continue
        score_distribution.append(score)
        if score >= min_score:
            all_conversations.extend(convs)

    print(f"\n=== Processing Results ===")
    if skipped:
        print(f"Skipped {skipped} unparseable files")
    print(f"High-quality files (>= {min_score}): {sum(1 for s in score_distribution if s >= min_score)}")
    print(f"Total conversation examples: {len(all_conversations)}")
    if not all_conversations:
        print(f"❌ No conversations found! Try lowering min_score (current: {min_score})")
        return None

    # Format each example into a single ChatML 'text' field plus metadata.
    formatted_data = [
        {
            'text': format_conversation_for_lfm2(conv),
            'quality_score': conv['quality_score'],
            'topic_main': conv['topic_main'],
            'topic_sub': conv['topic_sub'],
            'has_context': len(conv['history']) > 0,
        }
        for conv in all_conversations
    ]

    # Explicit schema so Arrow does not have to infer column types.
    features = Features({
        'text': Value('string'),
        'quality_score': Value('int64'),
        'topic_main': Value('string'),
        'topic_sub': Value('string'),
        'has_context': Value('bool')
    })
    df = pd.DataFrame(formatted_data)
    dataset = Dataset.from_pandas(df, features=features)
    dataset = dataset.train_test_split(test_size=0.1, seed=42)

    print(f"\n=== Final Dataset ===")
    print(f"Training samples: {len(dataset['train'])}")
    print(f"Validation samples: {len(dataset['test'])}")
    print(f"Examples with context: {sum(df['has_context'])}")

    # Save
    dataset.save_to_disk("./kokorochat_processed_multiturn")
    print("\n✅ Multi-turn dataset saved to ./kokorochat_processed_multiturn")

    # Show a sample; clamp the index so a tiny train split (< 6 rows)
    # does not raise IndexError.
    print("\n=== Sample Training Example (with context) ===")
    sample = dataset['train'][min(5, len(dataset['train']) - 1)]['text']
    print(sample[:1000] + "\n..." if len(sample) > 1000 else sample)

    return dataset
if __name__ == "__main__":
    # Build the multi-turn dataset from the KokoroChat dialogue dump.
    dataset = create_training_dataset_multiturn(
        data_dir="./KokoroChat/kokorochat_dialogues",
        min_score=60,        # minimum client review score to keep a file
        context_window=4,    # previous turns carried along as context
    )