File size: 5,309 Bytes
0446288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# data_preparation.py
import json
import os
from pathlib import Path
import pandas as pd
from typing import List, Dict, Tuple
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

class KokoroChatProcessor:
    """Load KokoroChat counseling dialogues and turn them into
    instruction-tuning examples with train/val/test splits.

    Pipeline: ``load_all_conversations()`` -> ``create_training_examples()``
    -> ``prepare_for_finetuning()``.
    """

    def __init__(self, data_path: str):
        """
        Args:
            data_path: Root directory of the KokoroChat dataset; searched
                recursively for ``*.json`` conversation files.
        """
        self.data_path = Path(data_path)
        # Raw conversation dicts as loaded from JSON.
        self.conversations: List[Dict] = []
        # Flattened instruction/input/output training examples.
        self.processed_data: List[Dict] = []

    def load_all_conversations(self) -> List[Dict]:
        """Load all JSON files from the KokoroChat dataset.

        Files that fail to parse are reported and skipped rather than
        aborting the whole load (best-effort ingestion).

        Returns:
            The accumulated list of conversation dicts (also kept on
            ``self.conversations``).
        """
        json_files = list(self.data_path.glob("**/*.json"))
        print(f"Found {len(json_files)} conversation files")

        for json_file in tqdm(json_files, desc="Loading conversations"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    self.conversations.append(data)
            except Exception as e:
                # Best-effort: a single corrupt file should not kill the run.
                print(f"Error loading {json_file}: {e}")

        return self.conversations

    def create_training_examples(self) -> List[Dict]:
        """Convert conversations into (client message -> counselor reply) pairs.

        Bug fix vs. the original implementation: the old step-2 loop took
        ``dialogue[i]`` (counselor) as the *output* and ``dialogue[i+1]``
        (client) as the *input*, i.e. the counselor "reply" chronologically
        preceded the client message it was paired with, inverting every
        training example. We now scan adjacent turns and emit a pair only
        when a client turn is directly followed by the counselor's answer.

        Returns:
            List of training-example dicts with keys ``instruction``,
            ``input``, ``output``, ``context``, ``topic``, ``quality_score``.
        """
        # Reset so repeated calls do not duplicate examples.
        self.processed_data = []

        for conv_data in tqdm(self.conversations, desc="Processing conversations"):
            dialogue = conv_data.get('dialogue', [])
            topic = conv_data.get('topic', {})
            review = conv_data.get('review_by_client_jp', {})

            for i in range(len(dialogue) - 1):
                client_msg = dialogue[i]
                counselor_msg = dialogue[i + 1]

                # Only a client turn answered directly by a counselor turn
                # forms a valid training pair.
                if client_msg['role'] == 'client' and counselor_msg['role'] == 'counselor':
                    # Context is everything *before* the client's message.
                    context = self._build_context(dialogue[:i])

                    training_example = {
                        'instruction': "あなたは共感的で専門的な心理カウンセラーです。クライアントの悩みに寄り添い、適切なサポートを提供してください。",
                        'input': f"クライアント: {client_msg['utterance']}",
                        'output': counselor_msg['utterance'],
                        'context': context,
                        'topic': topic.get('main_jp', ''),
                        'quality_score': self._calculate_quality_score(review)
                    }

                    self.processed_data.append(training_example)

        return self.processed_data

    def _build_context(self, dialogue_history: List[Dict], max_turns: int = 5) -> str:
        """Render the last ``max_turns`` exchanges (2 messages per turn) as
        newline-separated "role: utterance" lines.

        Args:
            dialogue_history: Messages preceding the current pair.
            max_turns: Maximum number of exchanges to keep.

        Returns:
            The formatted context string (empty if no history).
        """
        context_parts = []
        start_idx = max(0, len(dialogue_history) - max_turns * 2)

        for msg in dialogue_history[start_idx:]:
            role = "カウンセラー" if msg['role'] == 'counselor' else "クライアント"
            context_parts.append(f"{role}: {msg['utterance']}")

        return "\n".join(context_parts)

    def _calculate_quality_score(self, review: Dict) -> float:
        """Normalize the client review score ('点数') to [0, 1].

        Returns 0.5 (neutral) when no review or no score is present.
        NOTE(review): assumes the raw score is out of 100 — confirm
        against the dataset documentation.
        """
        if not review or review.get('点数') is None:
            return 0.5  # Default middle score

        return review.get('点数', 50) / 100.0

    def prepare_for_finetuning(self, test_size: float = 0.1, val_size: float = 0.1):
        """Filter high-quality examples and build train/val/test splits.

        Args:
            test_size: Fraction held out for the test split.
            val_size: Fraction of the *remaining* train portion held out
                for validation (so the effective val fraction is
                ``(1 - test_size) * val_size``).

        Returns:
            Tuple of (train, val, test), each a list of ``{'text': prompt}``
            dicts formatted for instruction fine-tuning.

        Raises:
            ValueError: If no example exceeds the quality threshold
                (e.g. ``create_training_examples()`` was not run), which
                would otherwise surface as a cryptic sklearn error.
        """
        # Keep only examples the client rated above 0.6 (normalized).
        high_quality = [ex for ex in self.processed_data if ex['quality_score'] > 0.6]
        print(f"Selected {len(high_quality)} high-quality examples")

        if not high_quality:
            raise ValueError(
                "No examples with quality_score > 0.6; "
                "run create_training_examples() first or relax the threshold."
            )

        # Fixed random_state for reproducible splits.
        train_data, test_data = train_test_split(high_quality, test_size=test_size, random_state=42)
        train_data, val_data = train_test_split(train_data, test_size=val_size, random_state=42)

        def format_example(ex):
            # Alpaca-style prompt; runtime strings kept exactly as before.
            prompt = f"""### 指示:
{ex['instruction']}

### コンテキスト:
{ex['context']}

### 入力:
{ex['input']}

### 応答:
{ex['output']}"""
            return {'text': prompt}

        train_formatted = [format_example(ex) for ex in train_data]
        val_formatted = [format_example(ex) for ex in val_data]
        test_formatted = [format_example(ex) for ex in test_data]

        return train_formatted, val_formatted, test_formatted

# --- Data-preparation driver ---------------------------------------------
# Run the full pipeline: load raw conversations, build training examples,
# then split them for fine-tuning.
processor = KokoroChatProcessor('KokoroChat/data')
processor.load_all_conversations()
processor.create_training_examples()
train_data, val_data, test_data = processor.prepare_for_finetuning()

# Persist the three splits for the downstream training script.
import pickle

splits = {
    'train': train_data,
    'val': val_data,
    'test': test_data,
}
with open('processed_data.pkl', 'wb') as f:
    pickle.dump(splits, f)

# Report split sizes.
print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")
print(f"Test examples: {len(test_data)}")