# lfm_complete_code / dataprocessing_multiturn.py
# (Techiiot — upload folder using huggingface_hub, commit 0446288 verified)
# prepare_dataset_multiturn.py
import json
import os
from datasets import Dataset, Features, Value
import pandas as pd
from pathlib import Path
def parse_kokorochat_with_context(json_file_path, context_window=4, max_history_tokens=1500):
    """
    Parse a single KokoroChat dialogue JSON file into training examples,
    each carrying a window of preceding turns as conversation history.

    Args:
        json_file_path: Path to a KokoroChat dialogue JSON file.
        context_window: Number of previous turns to include (default: 4 = 2 exchanges).
        max_history_tokens: Approximate token budget for the history
            (rough heuristic: ~3 chars per token for Japanese text).

    Returns:
        (conversations, total_score) where `conversations` is a list of dicts
        with keys 'history', 'client', 'counselor', 'quality_score',
        'topic_main', 'topic_sub', 'dialogue_id', and `total_score` is the
        client review score (0 when absent). Returns ([], 0) when the file
        cannot be read or parsed (deliberate best-effort behavior: one bad
        file must not abort the whole corpus scan).
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError, UnicodeDecodeError):
        # Unreadable or malformed file — skip it silently (caller iterates
        # thousands of files and tolerates individual failures).
        return [], 0
    conversations = []
    dialogue = data.get('dialogue') or []
    # Quality score from the English client review; `or {}` guards against
    # the key being present but null in the JSON.
    review_en = data.get('review_by_client_en') or {}
    total_score = review_en.get('score', 0)
    # Topic metadata (main topic in English, sub-topic as-is).
    topic = data.get('topic') or {}
    main_topic = topic.get('main_en', '')
    sub_topic = topic.get('sub', '')
    # Extract client -> counselor adjacent pairs, attaching prior context.
    for i in range(len(dialogue) - 1):
        current = dialogue[i]
        next_turn = dialogue[i + 1]
        # Use .get so a malformed turn (missing 'role'/'utterance') is
        # skipped instead of raising KeyError.
        if current.get('role') == 'client' and next_turn.get('role') == 'counselor':
            client_msg = (current.get('utterance') or '').strip()
            counselor_msg = (next_turn.get('utterance') or '').strip()
            # Drop trivially short messages (greetings, noise).
            if len(client_msg) > 5 and len(counselor_msg) > 5:
                # Conversation history: up to `context_window` turns before i.
                start_idx = max(0, i - context_window)
                history = dialogue[start_idx:i]
                # Enforce the approximate token budget on the history text.
                history_text = ''.join(h.get('utterance') or '' for h in history)
                if len(history_text) < max_history_tokens * 3:
                    conversations.append({
                        'history': history,
                        'client': client_msg,
                        'counselor': counselor_msg,
                        'quality_score': total_score,
                        'topic_main': main_topic,
                        'topic_sub': sub_topic,
                        'dialogue_id': Path(json_file_path).stem
                    })
    return conversations, total_score
def format_conversation_for_lfm2(conversation):
    """
    Render one parsed conversation example into the LFM2 ChatML template:
    a fixed Japanese counselor system prompt, the prior turns mapped to
    user/assistant roles, then the current client/counselor exchange
    terminated with <|endoftext|>.
    """
    role_to_tag = {'client': 'user', 'counselor': 'assistant'}
    parts = [
        "<|im_start|>system\n"
        "あなたは経験豊富な心理カウンセラーです。クライアントの話を傾聴し、共感的で支援的な応答をしてください。<|im_end|>\n"
    ]
    # Replay the history with roles translated to ChatML tags; turns with
    # any other role are skipped, matching the client/counselor-only schema.
    for turn in conversation['history']:
        tag = role_to_tag.get(turn['role'])
        if tag is not None:
            parts.append(f"<|im_start|>{tag}\n{turn['utterance']}<|im_end|>\n")
    # Current exchange — the pair the model is actually trained on.
    parts.append(f"<|im_start|>user\n{conversation['client']}<|im_end|>\n")
    parts.append(f"<|im_start|>assistant\n{conversation['counselor']}<|im_end|><|endoftext|>")
    return ''.join(parts)
def create_training_dataset_multiturn(
    data_dir="./KokoroChat/data",
    min_score=70,
    context_window=4
):
    """
    Build, split, and save a multi-turn training dataset from KokoroChat JSON files.

    Args:
        data_dir: Directory scanned recursively for dialogue JSON files.
        min_score: Minimum client-review quality score (0-100; recommend 85
            for top quality) a file must reach to be included.
        context_window: Number of previous turns to include per example.

    Returns:
        A `datasets.DatasetDict` with 'train'/'test' splits (90/10, seed 42),
        also persisted to ./kokorochat_processed_multiturn; or None when no
        qualifying conversations were found.
    """
    json_files = list(Path(data_dir).rglob("*.json"))
    print(f"Found {len(json_files)} JSON files")
    all_conversations = []
    score_distribution = []
    print("\nProcessing files with multi-turn context...")
    for idx, json_file in enumerate(json_files):
        if idx % 1000 == 0:
            print(f"Processed {idx}/{len(json_files)} files...")
        try:
            convs, score = parse_kokorochat_with_context(
                json_file,
                context_window=context_window
            )
            score_distribution.append(score)
            if score >= min_score:
                all_conversations.extend(convs)
        except Exception:
            # Best-effort corpus scan: one unexpected bad file must not
            # abort the whole run.
            continue
    print(f"\n=== Processing Results ===")
    print(f"High-quality files (>= {min_score}): {sum(1 for s in score_distribution if s >= min_score)}")
    print(f"Total conversation examples: {len(all_conversations)}")
    if len(all_conversations) == 0:
        print(f"❌ No conversations found! Try lowering min_score (current: {min_score})")
        return None
    # Format each example into the LFM2 ChatML text plus metadata columns.
    formatted_data = []
    for conv in all_conversations:
        formatted_text = format_conversation_for_lfm2(conv)
        formatted_data.append({
            'text': formatted_text,
            'quality_score': conv['quality_score'],
            'topic_main': conv['topic_main'],
            'topic_sub': conv['topic_sub'],
            'has_context': len(conv['history']) > 0
        })
    # Explicit schema so Arrow types are stable across runs.
    features = Features({
        'text': Value('string'),
        'quality_score': Value('int64'),
        'topic_main': Value('string'),
        'topic_sub': Value('string'),
        'has_context': Value('bool')
    })
    df = pd.DataFrame(formatted_data)
    dataset = Dataset.from_pandas(df, features=features)
    # Fixed seed keeps the split reproducible between runs.
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
    print(f"\n=== Final Dataset ===")
    print(f"Training samples: {len(dataset['train'])}")
    print(f"Validation samples: {len(dataset['test'])}")
    print(f"Examples with context: {sum(df['has_context'])}")
    # Save
    dataset.save_to_disk("./kokorochat_processed_multiturn")
    print("\n✅ Multi-turn dataset saved to ./kokorochat_processed_multiturn")
    # Show a sample, clamping the index so small train splits don't raise
    # IndexError (the original hard-coded index 5).
    print("\n=== Sample Training Example (with context) ===")
    sample_idx = min(5, len(dataset['train']) - 1)
    sample = dataset['train'][sample_idx]['text']
    print((sample[:1000] + "\n...") if len(sample) > 1000 else sample)
    return dataset
# Script entry point: build and save the multi-turn dataset with the
# project's chosen thresholds.
if __name__ == "__main__":
    dataset = create_training_dataset_multiturn(
        data_dir="./KokoroChat/kokorochat_dialogues",
        min_score=60,  # quality-score cutoff (docstring recommends 85 for top quality)
        context_window=4  # Include 4 previous turns
    )