Techiiot committed on
Commit 27c46c6 · verified · 1 Parent(s): 76d2998

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ benchmark_results.png filter=lfs diff=lfs merge=lfs -text
37
+ score_analysis_threshold_60.png filter=lfs diff=lfs merge=lfs -text
38
+ score_distribution.png filter=lfs diff=lfs merge=lfs -text
benchmark.py ADDED
@@ -0,0 +1,1201 @@
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ from peft import PeftModel
4
+ import json
5
+ from typing import Dict, List, Tuple
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ from sklearn.metrics import accuracy_score, f1_score
9
+ import evaluate
10
+ from datasets import load_dataset
11
+ import pandas as pd
12
+ import matplotlib.pyplot as plt
13
+ import seaborn as sns
14
+
15
+ class CounselorBenchmark:
16
+ def __init__(self, base_model_path: str, finetuned_model_path: str):
17
+ """
18
+ Initialize benchmark suite for counselor models
19
+ """
20
+ self.base_model_path = base_model_path
21
+ self.finetuned_model_path = finetuned_model_path
22
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+
24
+ # Load evaluation metrics
25
+ self.bleu = evaluate.load("sacrebleu")
26
+ self.rouge = evaluate.load("rouge")
27
+ self.bertscore = evaluate.load("bertscore")
28
+
29
+ def load_models(self):
30
+ """Load both base and fine-tuned models for comparison"""
31
+
32
+ # Load base model
33
+ print("Loading base model...")
34
+ self.base_tokenizer = AutoTokenizer.from_pretrained(self.base_model_path)
35
+ self.base_model = AutoModelForCausalLM.from_pretrained(
36
+ self.base_model_path,
37
+ torch_dtype=torch.bfloat16,
38
+ device_map="auto"
39
+ )
40
+
41
+ # Load fine-tuned model
42
+ print("Loading fine-tuned model...")
43
+ self.ft_tokenizer = AutoTokenizer.from_pretrained(self.finetuned_model_path)
44
+ self.ft_model = AutoModelForCausalLM.from_pretrained(
45
+ self.finetuned_model_path,
46
+ torch_dtype=torch.bfloat16,
47
+ device_map="auto"
48
+ )
49
+
50
+ def generate_response(self, model, tokenizer, prompt: str, max_length: int = 256):
51
+ """Generate response from model"""
52
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
53
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
54
+
55
+ with torch.no_grad():
56
+ outputs = model.generate(
57
+ **inputs,
58
+ max_new_tokens=max_length,
59
+ temperature=0.7,
60
+ do_sample=True,
61
+ top_p=0.9,
62
+ repetition_penalty=1.1
63
+ )
64
+
65
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
66
+ # Extract only the generated part
67
+ response = response[len(prompt):].strip()
68
+ return response
69
+
70
+ def evaluate_empathy_score(self, response: str) -> float:
71
+ """
72
+ Evaluate empathy in counselor response
73
+ Custom metric based on Japanese counseling keywords
74
+ """
75
+ empathy_keywords = [
76
+ 'わかります', '理解', '共感', '気持ち', '感じ',
77
+ 'つらい', '大変', 'お察し', '心配', '支援'
78
+ ]
79
+
80
+ score = sum(1 for keyword in empathy_keywords if keyword in response)
81
+ return min(score / len(empathy_keywords), 1.0)
82
+
83
+ def evaluate_response_quality(self, response: str) -> Dict[str, float]:
84
+ """
85
+ Comprehensive response quality evaluation
86
+ """
87
+ metrics = {}
88
+
89
+ # Length appropriateness (not too short, not too long)
90
+ response_length = len(response)
91
+ if 50 <= response_length <= 300:
92
+ metrics['length_score'] = 1.0
93
+ elif response_length < 50:
94
+ metrics['length_score'] = response_length / 50
95
+ else:
96
+ metrics['length_score'] = max(0, 1 - (response_length - 300) / 500)
97
+
98
+ # Question engagement (does counselor ask clarifying questions?)
99
+ metrics['question_score'] = 1.0 if '?' in response or 'か?' in response else 0.0
100
+
101
+ # Supportive language
102
+ support_phrases = ['大丈夫', '一緒に', '支援', 'サポート', '助け']
103
+ metrics['support_score'] = sum(1 for phrase in support_phrases if phrase in response) / len(support_phrases)
104
+
105
+ # Empathy score
106
+ metrics['empathy_score'] = self.evaluate_empathy_score(response)
107
+
108
+ return metrics
109
+
110
+ def benchmark_on_test_set(self, test_data_path: str, num_samples: int = 100):
111
+ """
112
+ Run comprehensive benchmark on test set
113
+ """
114
+ # Load test data
115
+ test_dataset = load_dataset('json', data_files=test_data_path, split='train')
116
+ test_samples = test_dataset.select(range(min(num_samples, len(test_dataset))))
117
+
118
+ results = {
119
+ 'base_model': {'responses': [], 'metrics': []},
120
+ 'finetuned_model': {'responses': [], 'metrics': []}
121
+ }
122
+
123
+ print(f"Evaluating on {len(test_samples)} test samples...")
124
+
125
+ for sample in tqdm(test_samples):
126
+ prompt = sample['text'].split('### Response:')[0] + '### Response:'
127
+ reference = sample['text'].split('### Response:')[1].strip() if '### Response:' in sample['text'] else ""
128
+
129
+ # Generate responses
130
+ base_response = self.generate_response(self.base_model, self.base_tokenizer, prompt)
131
+ ft_response = self.generate_response(self.ft_model, self.ft_tokenizer, prompt)
132
+
133
+ # Store responses
134
+ results['base_model']['responses'].append(base_response)
135
+ results['finetuned_model']['responses'].append(ft_response)
136
+
137
+ # Evaluate quality
138
+ base_metrics = self.evaluate_response_quality(base_response)
139
+ ft_metrics = self.evaluate_response_quality(ft_response)
140
+
141
+ results['base_model']['metrics'].append(base_metrics)
142
+ results['finetuned_model']['metrics'].append(ft_metrics)
143
+
144
+ return results
145
+
146
+ def calculate_aggregate_metrics(self, results: Dict) -> Dict:
147
+ """Calculate aggregate metrics for comparison"""
148
+ aggregate = {}
149
+
150
+ for model_name in ['base_model', 'finetuned_model']:
151
+ model_metrics = results[model_name]['metrics']
152
+
153
+ aggregate[model_name] = {}
154
+
155
+ # Calculate average for each metric
156
+ metric_names = model_metrics[0].keys() if model_metrics else []
157
+
158
+ for metric in metric_names:
159
+ values = [m[metric] for m in model_metrics]
160
+ aggregate[model_name][metric] = {
161
+ 'mean': np.mean(values),
162
+ 'std': np.std(values),
163
+ 'min': np.min(values),
164
+ 'max': np.max(values)
165
+ }
166
+
167
+ return aggregate
168
+
169
+ def generate_comparison_report(self, results: Dict, aggregate: Dict):
170
+ """Generate detailed comparison report"""
171
+
172
+ report = []
173
+ report.append("=" * 80)
174
+ report.append("COUNSELOR MODEL BENCHMARK REPORT")
175
+ report.append("=" * 80)
176
+ report.append("")
177
+
178
+ # Overall performance comparison
179
+ report.append("PERFORMANCE COMPARISON:")
180
+ report.append("-" * 40)
181
+
182
+ for metric in aggregate['base_model'].keys():
183
+ base_score = aggregate['base_model'][metric]['mean']
184
+ ft_score = aggregate['finetuned_model'][metric]['mean']
185
+ improvement = ((ft_score - base_score) / base_score * 100) if base_score > 0 else 0
186
+
187
+ report.append(f"\n{metric.upper()}:")
188
+ report.append(f" Base Model: {base_score:.3f} (±{aggregate['base_model'][metric]['std']:.3f})")
189
+ report.append(f" Fine-tuned Model: {ft_score:.3f} (±{aggregate['finetuned_model'][metric]['std']:.3f})")
190
+ report.append(f" Improvement: {improvement:+.1f}%")
191
+
192
+ # Calculate overall score
193
+ base_overall = np.mean([aggregate['base_model'][m]['mean'] for m in aggregate['base_model']])
194
+ ft_overall = np.mean([aggregate['finetuned_model'][m]['mean'] for m in aggregate['finetuned_model']])
195
+ overall_improvement = ((ft_overall - base_overall) / base_overall * 100) if base_overall > 0 else 0
196
+
197
+ report.append("\n" + "=" * 40)
198
+ report.append("OVERALL PERFORMANCE:")
199
+ report.append(f" Base Model: {base_overall:.3f}")
200
+ report.append(f" Fine-tuned Model: {ft_overall:.3f}")
201
+ report.append(f" Overall Improvement: {overall_improvement:+.1f}%")
202
+ report.append("=" * 40)
203
+
204
+ return "\n".join(report)
205
+
206
+ def visualize_results(self, aggregate: Dict):
207
+ """Create visualization of benchmark results"""
208
+
209
+ # Prepare data for plotting
210
+ metrics = list(aggregate['base_model'].keys())
211
+ base_scores = [aggregate['base_model'][m]['mean'] for m in metrics]
212
+ ft_scores = [aggregate['finetuned_model'][m]['mean'] for m in metrics]
213
+
214
+ # Create comparison plot
215
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
216
+
217
+ # Bar plot comparison
218
+ x = np.arange(len(metrics))
219
+ width = 0.35
220
+
221
+ ax1.bar(x - width/2, base_scores, width, label='Base Model', color='lightblue')
222
+ ax1.bar(x + width/2, ft_scores, width, label='Fine-tuned Model', color='darkblue')
223
+ ax1.set_xlabel('Metrics')
224
+ ax1.set_ylabel('Score')
225
+ ax1.set_title('Model Performance Comparison')
226
+ ax1.set_xticks(x)
227
+ ax1.set_xticklabels(metrics, rotation=45, ha='right')
228
+ ax1.legend()
229
+ ax1.grid(True, alpha=0.3)
230
+
231
+ # Improvement percentage plot
232
+ improvements = [((ft - base) / base * 100) if base > 0 else 0
233
+ for base, ft in zip(base_scores, ft_scores)]
234
+
235
+ colors = ['green' if imp > 0 else 'red' for imp in improvements]
236
+ ax2.bar(metrics, improvements, color=colors, alpha=0.7)
237
+ ax2.set_xlabel('Metrics')
238
+ ax2.set_ylabel('Improvement (%)')
239
+ ax2.set_title('Fine-tuning Improvement over Base Model')
240
+ ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
241
+ ax2.set_xticklabels(metrics, rotation=45, ha='right')
242
+ ax2.grid(True, alpha=0.3)
243
+
244
+ plt.tight_layout()
245
+ plt.savefig('benchmark_results.png', dpi=300, bbox_inches='tight')
246
+ plt.show()
247
+
248
+ print("Visualization saved as 'benchmark_results.png'")
249
+
250
+ # Run benchmarking
251
+ if __name__ == "__main__":
252
+ # Initialize benchmark
253
+ benchmark = CounselorBenchmark(
254
+ base_model_path="./models/LFM2-2.6B",
255
+ finetuned_model_path="./merged_counselor_mode_2b"
256
+ )
257
+
258
+ # Load models
259
+ benchmark.load_models()
260
+
261
+ # Run benchmark
262
+ print("Running benchmark evaluation...")
263
+ results = benchmark.benchmark_on_test_set("./processed_data_score80/test.jsonl", num_samples=100)
264
+
265
+ # Calculate aggregate metrics
266
+ aggregate = benchmark.calculate_aggregate_metrics(results)
267
+
268
+ # Generate report
269
+ report = benchmark.generate_comparison_report(results, aggregate)
270
+ print(report)
271
+
272
+ # Save report
273
+ with open("benchmark_report_2b.txt", "w") as f:
274
+ f.write(report)
275
+
276
+ # Visualize results
277
+ benchmark.visualize_results(aggregate)
278
+
279
+ print("\nBenchmarking completed! Check 'benchmark_report.txt' for detailed results.")
280
+
281
+
282
+ ####################
283
+
284
+ # import torch
285
+ # from transformers import AutoModelForCausalLM, AutoTokenizer
286
+ # from peft import PeftModel, PeftConfig
287
+ # import numpy as np
288
+ # from typing import List, Dict, Tuple, Optional
289
+ # import json
290
+ # from tqdm import tqdm
291
+ # import os
292
+ # import gc
293
+ # import warnings
294
+ # from datetime import datetime
295
+ # import pandas as pd
296
+ # import matplotlib.pyplot as plt
297
+ # import seaborn as sns
298
+ # from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
299
+ # from rouge_score import rouge_scorer
300
+ # import nltk
301
+ # from collections import defaultdict
302
+
303
+ # # Download required NLTK data
304
+ # try:
305
+ # nltk.download('punkt', quiet=True)
306
+ # except:
307
+ # pass
308
+
309
+ # warnings.filterwarnings('ignore')
310
+
311
+ # class AdvancedCounselorBenchmark:
312
+ # def __init__(self,
313
+ # base_model_name: str = "LiquidAI/LFM2-1.2B",
314
+ # finetuned_model_path: str = "./counselor_model/best_model",
315
+ # merged_model_path: str = "./merged_counselor_model",
316
+ # test_data_path: str = "./processed_data_score70/test.jsonl",
317
+ # device: str = None):
318
+ # """
319
+ # Initialize advanced benchmark suite with BLEU and ROUGE metrics
320
+
321
+ # Args:
322
+ # base_model_name: Name/path of base model
323
+ # finetuned_model_path: Path to fine-tuned LoRA adapter
324
+ # merged_model_path: Path to save/load merged model
325
+ # test_data_path: Path to test dataset with reference responses
326
+ # device: Device to run on (cuda/cpu)
327
+ # """
328
+ # self.base_model_name = base_model_name
329
+ # self.finetuned_model_path = finetuned_model_path
330
+ # self.merged_model_path = merged_model_path
331
+ # self.test_data_path = test_data_path
332
+ # self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
333
+
334
+ # print(f"🔧 Initializing Advanced Benchmark Suite")
335
+ # print(f" Device: {self.device}")
336
+ # if self.device == "cuda":
337
+ # print(f" GPU: {torch.cuda.get_device_name(0)}")
338
+ # print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
339
+
340
+ # # Initialize ROUGE scorer
341
+ # self.rouge_scorer = rouge_scorer.RougeScorer(
342
+ # ['rouge1', 'rouge2', 'rougeL'],
343
+ # use_stemmer=False, # Set to False for Japanese
344
+ # lang='japanese'
345
+ # )
346
+
347
+ # # Smoothing function for BLEU scores
348
+ # self.smoothing = SmoothingFunction().method1
349
+
350
+ # self.results = {}
351
+
352
+ # def load_test_data(self) -> List[Dict]:
353
+ # """Load test dataset with reference responses"""
354
+ # print(f"\n📚 Loading test data from {self.test_data_path}")
355
+
356
+ # test_data = []
357
+ # if os.path.exists(self.test_data_path):
358
+ # with open(self.test_data_path, 'r', encoding='utf-8') as f:
359
+ # for line in f:
360
+ # data = json.loads(line)
361
+ # test_data.append(data)
362
+ # print(f" Loaded {len(test_data)} test examples")
363
+ # else:
364
+ # print(f"⚠️ Test data not found. Creating synthetic test data...")
365
+ # test_data = self.create_synthetic_test_data()
366
+
367
+ # return test_data
368
+
369
+ # def create_synthetic_test_data(self) -> List[Dict]:
370
+ # """Create synthetic test data if real data is not available"""
371
+ # synthetic_data = [
372
+ # {
373
+ # "text": "### Input:\n最近ストレスを感じています。\n\n### Response:\nストレスを感じているのですね。それは大変つらいことだと思います。どのような状況でストレスを感じることが��いですか?お話を聞かせていただければ、一緒に対処法を考えることができます。",
374
+ # "input": "最近ストレスを感じています。",
375
+ # "reference": "ストレスを感じているのですね。それは大変つらいことだと思います。どのような状況でストレスを感じることが多いですか?お話を聞かせていただければ、一緒に対処法を考えることができます。"
376
+ # },
377
+ # {
378
+ # "text": "### Input:\n仕事がうまくいかなくて悩んでいます。\n\n### Response:\n仕事でお悩みなのですね。うまくいかないと感じると、本当に辛いですよね。具体的にどのような点で困難を感じていらっしゃいますか?一緒に整理してみましょう。",
379
+ # "input": "仕事がうまくいかなくて悩んでいます。",
380
+ # "reference": "仕事でお悩みなのですね。うまくいかないと感じると、本当に辛いですよね。具体的にどのような点で困難を感じていらっしゃいますか?一緒に整理してみましょう。"
381
+ # },
382
+ # {
383
+ # "text": "### Input:\n人間関係で困っています。\n\n### Response:\n人間関係の悩みは本当に心が疲れますよね。お気持ちお察しします。どのような関係性でお困りでしょうか?職場、家族、友人関係など、もう少し詳しくお聞かせいただけますか?",
384
+ # "input": "人間関係で困っています。",
385
+ # "reference": "人間関係の悩みは本当に心が疲れますよね。お気持ちお察しします。どのような関係性でお困りでしょうか?職場、家族、友人関係など、もう少し詳しくお聞かせいただけますか?"
386
+ # },
387
+ # {
388
+ # "text": "### Input:\n将来が不安です。\n\n### Response:\n将来への不安を抱えていらっしゃるのですね。先が見えない不安は、とても重く感じられることと思います。特にどのような点について不安を感じていらっしゃいますか?",
389
+ # "input": "将来が不安です。",
390
+ # "reference": "将来への不安を抱えていらっしゃるのですね。先が見えない不安は、とても重く感じられることと思います。特にどのような点について不安を感じていらっしゃいますか?"
391
+ # },
392
+ # {
393
+ # "text": "### Input:\n自信が持てません。\n\n### Response:\n自信が持てないというお気持ち、よくわかります。多くの方が同じような悩みを抱えています。どのような場面で特に自信が持てないと感じますか?あなたの強みも一緒に見つけていきましょう。",
394
+ # "input": "自信が持てません。",
395
+ # "reference": "自信が持てないというお気持ち、よくわかります。多くの方が同じような悩みを抱えています。どのような場面で特に自信が持てないと感じますか?あなたの強みも一緒に見つけていきましょう。"
396
+ # }
397
+ # ]
398
+ # return synthetic_data
399
+
400
+ # def merge_and_save_model(self, force_merge: bool = False):
401
+ # """Merge LoRA weights with base model and save"""
402
+ # if os.path.exists(self.merged_model_path) and not force_merge:
403
+ # print(f"✅ Merged model already exists at {self.merged_model_path}")
404
+ # return
405
+
406
+ # print("\n🔄 Merging LoRA adapter with base model...")
407
+
408
+ # try:
409
+ # # Load base model
410
+ # print(" Loading base model...")
411
+ # base_model = AutoModelForCausalLM.from_pretrained(
412
+ # self.base_model_name,
413
+ # torch_dtype=torch.float16,
414
+ # device_map="auto" if self.device == "cuda" else None,
415
+ # trust_remote_code=True,
416
+ # low_cpu_mem_usage=True
417
+ # )
418
+
419
+ # # Check if adapter exists
420
+ # adapter_config_path = os.path.join(self.finetuned_model_path, "adapter_config.json")
421
+ # if not os.path.exists(adapter_config_path):
422
+ # print(f"⚠️ No LoRA adapter found at {self.finetuned_model_path}")
423
+ # model = base_model
424
+ # else:
425
+ # # Load LoRA adapter
426
+ # print(" Loading LoRA adapter...")
427
+ # model = PeftModel.from_pretrained(
428
+ # base_model,
429
+ # self.finetuned_model_path,
430
+ # torch_dtype=torch.float16
431
+ # )
432
+
433
+ # # Merge weights
434
+ # print(" Merging weights...")
435
+ # model = model.merge_and_unload()
436
+
437
+ # # Save merged model
438
+ # print(f" Saving merged model to {self.merged_model_path}...")
439
+ # model.save_pretrained(self.merged_model_path)
440
+
441
+ # # Save tokenizer
442
+ # tokenizer = AutoTokenizer.from_pretrained(
443
+ # self.finetuned_model_path
444
+ # if os.path.exists(os.path.join(self.finetuned_model_path, "tokenizer_config.json"))
445
+ # else self.base_model_name
446
+ # )
447
+ # tokenizer.save_pretrained(self.merged_model_path)
448
+
449
+ # print("✅ Model merged and saved successfully!")
450
+
451
+ # # Clean up memory
452
+ # del base_model, model
453
+ # gc.collect()
454
+ # torch.cuda.empty_cache()
455
+
456
+ # except Exception as e:
457
+ # print(f"❌ Error during merging: {e}")
458
+ # raise
459
+
460
+ # def load_models(self):
461
+ # """Load base and fine-tuned models for comparison"""
462
+ # print("\n📚 Loading models for benchmarking...")
463
+
464
+ # # Load tokenizer
465
+ # self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
466
+ # if self.tokenizer.pad_token is None:
467
+ # self.tokenizer.pad_token = self.tokenizer.eos_token
468
+
469
+ # # Load base model
470
+ # print(" Loading base model...")
471
+ # self.base_model = AutoModelForCausalLM.from_pretrained(
472
+ # self.base_model_name,
473
+ # torch_dtype=torch.float16,
474
+ # device_map="auto" if self.device == "cuda" else None,
475
+ # trust_remote_code=True,
476
+ # low_cpu_mem_usage=True
477
+ # )
478
+ # self.base_model.eval()
479
+
480
+ # # Load merged fine-tuned model
481
+ # if os.path.exists(self.merged_model_path):
482
+ # print(" Loading merged fine-tuned model...")
483
+ # self.finetuned_model = AutoModelForCausalLM.from_pretrained(
484
+ # self.merged_model_path,
485
+ # torch_dtype=torch.float16,
486
+ # device_map="auto" if self.device == "cuda" else None,
487
+ # trust_remote_code=True,
488
+ # low_cpu_mem_usage=True
489
+ # )
490
+ # else:
491
+ # print(" Loading fine-tuned model (attempting PEFT)...")
492
+ # try:
493
+ # base_for_peft = AutoModelForCausalLM.from_pretrained(
494
+ # self.base_model_name,
495
+ # torch_dtype=torch.float16,
496
+ # device_map="auto" if self.device == "cuda" else None,
497
+ # trust_remote_code=True,
498
+ # low_cpu_mem_usage=True
499
+ # )
500
+ # self.finetuned_model = PeftModel.from_pretrained(
501
+ # base_for_peft,
502
+ # self.finetuned_model_path,
503
+ # torch_dtype=torch.float16
504
+ # )
505
+ # except:
506
+ # self.finetuned_model = AutoModelForCausalLM.from_pretrained(
507
+ # self.finetuned_model_path,
508
+ # torch_dtype=torch.float16,
509
+ # device_map="auto" if self.device == "cuda" else None,
510
+ # trust_remote_code=True,
511
+ # low_cpu_mem_usage=True
512
+ # )
513
+
514
+ # self.finetuned_model.eval()
515
+ # print("✅ Models loaded successfully!")
516
+
517
+ # def generate_response(self, model, prompt: str, max_length: int = 150) -> str:
518
+ # """Generate response from model"""
519
+ # inputs = self.tokenizer(
520
+ # prompt,
521
+ # return_tensors="pt",
522
+ # truncation=True,
523
+ # max_length=512
524
+ # )
525
+
526
+ # if self.device == "cuda":
527
+ # inputs = {k: v.cuda() for k, v in inputs.items()}
528
+
529
+ # with torch.no_grad():
530
+ # outputs = model.generate(
531
+ # **inputs,
532
+ # max_new_tokens=max_length,
533
+ # temperature=0.7,
534
+ # do_sample=True,
535
+ # top_p=0.9,
536
+ # pad_token_id=self.tokenizer.pad_token_id,
537
+ # eos_token_id=self.tokenizer.eos_token_id
538
+ # )
539
+
540
+ # response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
541
+ # # Extract only the generated response
542
+ # if "### Response:" in response:
543
+ # response = response.split("### Response:")[-1].strip()
544
+ # elif "Response:" in response:
545
+ # response = response.split("Response:")[-1].strip()
546
+ # else:
547
+ # # Remove the input prompt from response
548
+ # response = response[len(prompt):].strip()
549
+
550
+ # return response
551
+
552
+ # def tokenize_japanese(self, text: str) -> List[str]:
553
+ # """Tokenize Japanese text for BLEU calculation"""
554
+ # # Simple character-based tokenization for Japanese
555
+ # # In production, use MeCab or similar for better tokenization
556
+ # import re
557
+
558
+ # # Remove special characters and split
559
+ # text = re.sub(r'[。、!?\n]', ' ', text)
560
+ # tokens = text.strip().split()
561
+
562
+ # # Character-level tokenization as fallback
563
+ # if not tokens:
564
+ # tokens = list(text.strip())
565
+
566
+ # return tokens
567
+
568
+ # def calculate_bleu_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
569
+ # """Calculate BLEU-1, BLEU-2, BLEU-3, BLEU-4 scores"""
570
+ # # Tokenize texts
571
+ # ref_tokens = self.tokenize_japanese(reference)
572
+ # hyp_tokens = self.tokenize_japanese(hypothesis)
573
+
574
+ # # Calculate BLEU scores with different n-grams
575
+ # scores = {}
576
+
577
+ # # BLEU-1 (unigram)
578
+ # scores['BLEU-1'] = sentence_bleu(
579
+ # [ref_tokens], hyp_tokens,
580
+ # weights=(1.0, 0, 0, 0),
581
+ # smoothing_function=self.smoothing
582
+ # )
583
+
584
+ # # BLEU-2 (bigram)
585
+ # scores['BLEU-2'] = sentence_bleu(
586
+ # [ref_tokens], hyp_tokens,
587
+ # weights=(0.5, 0.5, 0, 0),
588
+ # smoothing_function=self.smoothing
589
+ # )
590
+
591
+ # # BLEU-3 (trigram)
592
+ # scores['BLEU-3'] = sentence_bleu(
593
+ # [ref_tokens], hyp_tokens,
594
+ # weights=(0.33, 0.33, 0.34, 0),
595
+ # smoothing_function=self.smoothing
596
+ # )
597
+
598
+ # # BLEU-4 (4-gram)
599
+ # scores['BLEU-4'] = sentence_bleu(
600
+ # [ref_tokens], hyp_tokens,
601
+ # weights=(0.25, 0.25, 0.25, 0.25),
602
+ # smoothing_function=self.smoothing
603
+ # )
604
+
605
+ # return scores
606
+
607
+ # def calculate_rouge_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
608
+ # """Calculate ROUGE-1, ROUGE-2, ROUGE-L scores"""
609
+ # scores = self.rouge_scorer.score(reference, hypothesis)
610
+
611
+ # return {
612
+ # 'ROUGE-1': scores['rouge1'].fmeasure,
613
+ # 'ROUGE-2': scores['rouge2'].fmeasure,
614
+ # 'ROUGE-L': scores['rougeL'].fmeasure
615
+ # }
616
+
617
+ # def run_bleu_rouge_benchmark(self, num_samples: int = None):
618
+ # """Run comprehensive BLEU and ROUGE benchmark"""
619
+ # print("\n" + "="*70)
620
+ # print("🏃 RUNNING BLEU & ROUGE BENCHMARK")
621
+ # print("="*70)
622
+
623
+ # # Load test data
624
+ # test_data = self.load_test_data()
625
+
626
+ # if num_samples:
627
+ # test_data = test_data[:num_samples]
628
+ # print(f" Using {num_samples} samples for benchmarking")
629
+
630
+ # # Initialize score collectors
631
+ # base_scores = defaultdict(list)
632
+ # finetuned_scores = defaultdict(list)
633
+
634
+ # # Metrics to calculate
635
+ # metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4',
636
+ # 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
637
+
638
+ # print(f"\n📊 Evaluating {len(test_data)} test examples...")
639
+ # print("-" * 70)
640
+
641
+ # detailed_results = []
642
+
643
+ # for i, example in enumerate(tqdm(test_data, desc="Evaluating")):
644
+ # # Extract input and reference
645
+ # if 'input' in example:
646
+ # input_text = example['input']
647
+ # else:
648
+ # # Try to extract from text field
649
+ # if "### Input:" in example['text']:
650
+ # input_text = example['text'].split("### Input:")[1].split("### Response:")[0].strip()
651
+ # else:
652
+ # input_text = example['text'].split("\n")[0].strip()
653
+
654
+ # if 'reference' in example:
655
+ # reference = example['reference']
656
+ # else:
657
+ # # Try to extract from text field
658
+ # if "### Response:" in example['text']:
659
+ # reference = example['text'].split("### Response:")[1].strip()
660
+ # else:
661
+ # parts = example['text'].split("\n")
662
+ # reference = parts[1] if len(parts) > 1 else parts[0]
663
+
664
+ # # Format input for models
665
+ # formatted_input = f"### Instruction:\nあなたは思いやりのある心理カウンセラーです。\n\n### Input:\n{input_text}\n\n### Response:\n"
666
+
667
+ # # Generate responses
668
+ # base_response = self.generate_response(self.base_model, formatted_input)
669
+ # finetuned_response = self.generate_response(self.finetuned_model, formatted_input)
670
+
671
+ # # Calculate BLEU scores
672
+ # base_bleu = self.calculate_bleu_scores(reference, base_response)
673
+ # finetuned_bleu = self.calculate_bleu_scores(reference, finetuned_response)
674
+
675
+ # # Calculate ROUGE scores
676
+ # base_rouge = self.calculate_rouge_scores(reference, base_response)
677
+ # finetuned_rouge = self.calculate_rouge_scores(reference, finetuned_response)
678
+
679
+ # # Combine scores
680
+ # base_all_scores = {**base_bleu, **base_rouge}
681
+ # finetuned_all_scores = {**finetuned_bleu, **finetuned_rouge}
682
+
683
+ # # Collect scores
684
+ # for metric in metrics:
685
+ # base_scores[metric].append(base_all_scores[metric])
686
+ # finetuned_scores[metric].append(finetuned_all_scores[metric])
687
+
688
+ # # Store detailed results
689
+ # detailed_results.append({
690
+ # 'input': input_text,
691
+ # 'reference': reference,
692
+ # 'base_response': base_response,
693
+ # 'finetuned_response': finetuned_response,
694
+ # 'base_scores': base_all_scores,
695
+ # 'finetuned_scores': finetuned_all_scores
696
+ # })
697
+
698
+ # # Print sample results
699
+ # if i < 3: # Show first 3 examples
700
+ # print(f"\n📝 Example {i+1}:")
701
+ # print(f" Input: {input_text[:50]}...")
702
+ # print(f" Reference: {reference[:50]}...")
703
+ # print(f" Base response: {base_response[:50]}...")
704
+ # print(f" Fine-tuned response: {finetuned_response[:50]}...")
705
+ # print(f" Base BLEU-4: {base_bleu['BLEU-4']:.3f}")
706
+ # print(f" Fine-tuned BLEU-4: {finetuned_bleu['BLEU-4']:.3f}")
707
+
708
+ # # Calculate aggregate statistics
709
+ # print("\n" + "="*70)
710
+ # print("📈 BENCHMARK RESULTS")
711
+ # print("="*70)
712
+
713
+ # self.results = {
714
+ # 'detailed_results': detailed_results,
715
+ # 'aggregate_scores': {},
716
+ # 'improvements': {}
717
+ # }
718
+
719
+ # # Print and store results
720
+ # print("\n" + "-"*70)
721
+ # print(f"{'Metric':<12} {'Base Model':<20} {'Fine-tuned Model':<20} {'Improvement':<15}")
722
+ # print("-"*70)
723
+
724
+ # for metric in metrics:
725
+ # base_mean = np.mean(base_scores[metric])
726
+ # base_std = np.std(base_scores[metric])
727
+ # finetuned_mean = np.mean(finetuned_scores[metric])
728
+ # finetuned_std = np.std(finetuned_scores[metric])
729
+
730
+ # # Calculate improvement
731
+ # if base_mean > 0:
732
+ # improvement = ((finetuned_mean - base_mean) / base_mean) * 100
733
+ # else:
734
+ # improvement = 0
735
+
736
+ # # Store results
737
+ # self.results['aggregate_scores'][metric] = {
738
+ # 'base_mean': base_mean,
739
+ # 'base_std': base_std,
740
+ # 'finetuned_mean': finetuned_mean,
741
+ # 'finetuned_std': finetuned_std
742
+ # }
743
+ # self.results['improvements'][metric] = improvement
744
+
745
+ # # Print results
746
+ # base_str = f"{base_mean:.3f} (±{base_std:.3f})"
747
+ # finetuned_str = f"{finetuned_mean:.3f} (±{finetuned_std:.3f})"
748
+ # imp_str = f"{improvement:+.1f}%"
749
+
750
+ # # Color code improvement
751
+ # if improvement > 0:
752
+ # imp_str = f"✅ {imp_str}"
753
+ # elif improvement < 0:
754
+ # imp_str = f"⚠️ {imp_str}"
755
+ # else:
756
+ # imp_str = f"➖ {imp_str}"
757
+
758
+ # print(f"{metric:<12} {base_str:<20} {finetuned_str:<20} {imp_str:<15}")
759
+
760
+ # # Calculate overall scores
761
+ # print("\n" + "="*70)
762
+ # print("🎯 OVERALL PERFORMANCE")
763
+ # print("="*70)
764
+
765
+ # # Average BLEU score
766
+ # bleu_metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
767
+ # base_bleu_avg = np.mean([np.mean(base_scores[m]) for m in bleu_metrics])
768
+ # finetuned_bleu_avg = np.mean([np.mean(finetuned_scores[m]) for m in bleu_metrics])
769
+ # bleu_improvement = ((finetuned_bleu_avg - base_bleu_avg) / base_bleu_avg) * 100 if base_bleu_avg > 0 else 0
770
+
771
+ # # Average ROUGE score
772
+ # rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
773
+ # base_rouge_avg = np.mean([np.mean(base_scores[m]) for m in rouge_metrics])
774
+ # finetuned_rouge_avg = np.mean([np.mean(finetuned_scores[m]) for m in rouge_metrics])
775
+ # rouge_improvement = ((finetuned_rouge_avg - base_rouge_avg) / base_rouge_avg) * 100 if base_rouge_avg > 0 else 0
776
+
777
+ # # Overall average
778
+ # base_overall = np.mean([np.mean(base_scores[m]) for m in metrics])
779
+ # finetuned_overall = np.mean([np.mean(finetuned_scores[m]) for m in metrics])
780
+ # overall_improvement = ((finetuned_overall - base_overall) / base_overall) * 100 if base_overall > 0 else 0
781
+
782
+ # self.results['summary'] = {
783
+ # 'bleu_average': {
784
+ # 'base': base_bleu_avg,
785
+ # 'finetuned': finetuned_bleu_avg,
786
+ # 'improvement': bleu_improvement
787
+ # },
788
+ # 'rouge_average': {
789
+ # 'base': base_rouge_avg,
790
+ # 'finetuned': finetuned_rouge_avg,
791
+ # 'improvement': rouge_improvement
792
+ # },
793
+ # 'overall': {
794
+ # 'base': base_overall,
795
+ # 'finetuned': finetuned_overall,
796
+ # 'improvement': overall_improvement
797
+ # }
798
+ # }
799
+
800
+ # print(f"\n📊 Average BLEU Score:")
801
+ # print(f" Base Model: {base_bleu_avg:.3f}")
802
+ # print(f" Fine-tuned Model: {finetuned_bleu_avg:.3f}")
803
+ # print(f" Improvement: {bleu_improvement:+.1f}%")
804
+
805
+ # print(f"\n📊 Average ROUGE Score:")
806
+ # print(f" Base Model: {base_rouge_avg:.3f}")
807
+ # print(f" Fine-tuned Model: {finetuned_rouge_avg:.3f}")
808
+ # print(f" Improvement: {rouge_improvement:+.1f}%")
809
+
810
+ # print(f"\n🎯 Overall Average:")
811
+ # print(f" Base Model: {base_overall:.3f}")
812
+ # print(f" Fine-tuned Model: {finetuned_overall:.3f}")
813
+ # print(f" Improvement: {overall_improvement:+.1f}%")
814
+
815
+ # print("="*70)
816
+
817
+ # return self.results
818
+
819
+ # def visualize_results(self, save_path: str = "bleu_rouge_benchmark.png"):
820
+ # """Create comprehensive visualization of BLEU and ROUGE results"""
821
+ # if 'aggregate_scores' not in self.results:
822
+ # print("❌ No results to visualize. Run benchmark first.")
823
+ # return
824
+
825
+ # print("\n📊 Creating visualizations...")
826
+
827
+ # fig, axes = plt.subplots(2, 3, figsize=(18, 12))
828
+
829
+ # # Color scheme
830
+ # base_color = '#3498db'
831
+ # finetuned_color = '#e74c3c'
832
+ # improvement_positive = '#27ae60'
833
+ # improvement_negative = '#c0392b'
834
+
835
+ # # 1. BLEU Scores Comparison
836
+ # bleu_metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
837
+ # bleu_base = [self.results['aggregate_scores'][m]['base_mean'] for m in bleu_metrics]
838
+ # bleu_finetuned = [self.results['aggregate_scores'][m]['finetuned_mean'] for m in bleu_metrics]
839
+
840
+ # x = np.arange(len(bleu_metrics))
841
+ # width = 0.35
842
+
843
+ # axes[0, 0].bar(x - width/2, bleu_base, width, label='Base Model',
844
+ # color=base_color, alpha=0.8)
845
+ # axes[0, 0].bar(x + width/2, bleu_finetuned, width, label='Fine-tuned Model',
846
+ # color=finetuned_color, alpha=0.8)
847
+ # axes[0, 0].set_xlabel('BLEU Metrics')
848
+ # axes[0, 0].set_ylabel('Score')
849
+ # axes[0, 0].set_title('BLEU Score Comparison')
850
+ # axes[0, 0].set_xticks(x)
851
+ # axes[0, 0].set_xticklabels(bleu_metrics)
852
+ # axes[0, 0].legend()
853
+ # axes[0, 0].grid(True, alpha=0.3)
854
+ # axes[0, 0].set_ylim([0, max(max(bleu_base), max(bleu_finetuned)) * 1.2])
855
+
856
+ # # 2. ROUGE Scores Comparison
857
+ # rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
858
+ # rouge_base = [self.results['aggregate_scores'][m]['base_mean'] for m in rouge_metrics]
859
+ # rouge_finetuned = [self.results['aggregate_scores'][m]['finetuned_mean'] for m in rouge_metrics]
860
+
861
+ # x = np.arange(len(rouge_metrics))
862
+
863
+ # axes[0, 1].bar(x - width/2, rouge_base, width, label='Base Model',
864
+ # color=base_color, alpha=0.8)
865
+ # axes[0, 1].bar(x + width/2, rouge_finetuned, width, label='Fine-tuned Model',
866
+ # color=finetuned_color, alpha=0.8)
867
+ # axes[0, 1].set_xlabel('ROUGE Metrics')
868
+ # axes[0, 1].set_ylabel('Score')
869
+ # axes[0, 1].set_title('ROUGE Score Comparison')
870
+ # axes[0, 1].set_xticks(x)
871
+ # axes[0, 1].set_xticklabels(rouge_metrics)
872
+ # axes[0, 1].legend()
873
+ # axes[0, 1].grid(True, alpha=0.3)
874
+ # axes[0, 1].set_ylim([0, max(max(rouge_base), max(rouge_finetuned)) * 1.2])
875
+
876
+ # # 3. Improvement Percentages
877
+ # all_metrics = bleu_metrics + rouge_metrics
878
+ # improvements = [self.results['improvements'][m] for m in all_metrics]
879
+ # colors = [improvement_positive if imp > 0 else improvement_negative for imp in improvements]
880
+
881
+ # axes[0, 2].barh(range(len(all_metrics)), improvements, color=colors, alpha=0.7)
882
+ # axes[0, 2].set_yticks(range(len(all_metrics)))
883
+ # axes[0, 2].set_yticklabels(all_metrics)
884
+ # axes[0, 2].set_xlabel('Improvement (%)')
885
+ # axes[0, 2].set_title('Performance Improvement by Metric')
886
+ # axes[0, 2].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
887
+ # axes[0, 2].grid(True, alpha=0.3, axis='x')
888
+
889
+ # # 4. Line plot showing progression
890
+ # axes[1, 0].plot(bleu_metrics, bleu_base, 'o-', label='Base Model',
891
+ # color=base_color, linewidth=2, markersize=8)
892
+ # axes[1, 0].plot(bleu_metrics, bleu_finetuned, 's-', label='Fine-tuned Model',
893
+ # color=finetuned_color, linewidth=2, markersize=8)
894
+ # axes[1, 0].set_xlabel('BLEU N-gram')
895
+ # axes[1, 0].set_ylabel('Score')
896
+ # axes[1, 0].set_title('BLEU Score Progression')
897
+ # axes[1, 0].legend()
898
+ # axes[1, 0].grid(True, alpha=0.3)
899
+
900
+ # # 5. Summary Statistics
901
+ # ax5 = axes[1, 1]
902
+ # ax5.axis('off')
903
+
904
+ # summary_text = f"""
905
+ # BENCHMARK SUMMARY
906
+ # {'='*30}
907
+
908
+ # BLEU Average:
909
+ # Base: {self.results['summary']['bleu_average']['base']:.3f}
910
+ # Fine-tuned: {self.results['summary']['bleu_average']['finetuned']:.3f}
911
+ # Improvement: {self.results['summary']['bleu_average']['improvement']:+.1f}%
912
+
913
+ # ROUGE Average:
914
+ # Base: {self.results['summary']['rouge_average']['base']:.3f}
915
+ # Fine-tuned: {self.results['summary']['rouge_average']['finetuned']:.3f}
916
+ # Improvement: {self.results['summary']['rouge_average']['improvement']:+.1f}%
917
+
918
+ # Overall Performance:
919
+ # Base: {self.results['summary']['overall']['base']:.3f}
920
+ # Fine-tuned: {self.results['summary']['overall']['finetuned']:.3f}
921
+ # Improvement: {self.results['summary']['overall']['improvement']:+.1f}%
922
+
923
+ # Best Improvements:
924
+ # """
925
+
926
+ # # Find best improvements
927
+ # sorted_metrics = sorted(all_metrics,
928
+ # key=lambda m: self.results['improvements'][m],
929
+ # reverse=True)
930
+
931
+ # for m in sorted_metrics[:2]:
932
+ # summary_text += f" • {m}: {self.results['improvements'][m]:+.1f}%\n"
933
+
934
+ # if any(self.results['improvements'][m] < 0 for m in all_metrics):
935
+ # summary_text += f"\nNeeds Attention:\n"
936
+ # for m in sorted_metrics[-2:]:
937
+ # if self.results['improvements'][m] < 0:
938
+ # summary_text += f" • {m}: {self.results['improvements'][m]:+.1f}%\n"
939
+
940
+ # ax5.text(0.1, 0.9, summary_text, transform=ax5.transAxes,
941
+ # fontsize=10, verticalalignment='top', fontfamily='monospace')
942
+
943
+ # # 6. Heatmap of all scores
944
+ # metrics_for_heatmap = all_metrics
945
+ # models = ['Base', 'Fine-tuned']
946
+
947
+ # heatmap_data = []
948
+ # for metric in metrics_for_heatmap:
949
+ # heatmap_data.append([
950
+ # self.results['aggregate_scores'][metric]['base_mean'],
951
+ # self.results['aggregate_scores'][metric]['finetuned_mean']
952
+ # ])
953
+
954
+ # im = axes[1, 2].imshow(heatmap_data, cmap='YlOrRd', aspect='auto')
955
+ # axes[1, 2].set_xticks(np.arange(len(models)))
956
+ # axes[1, 2].set_yticks(np.arange(len(metrics_for_heatmap)))
957
+ # axes[1, 2].set_xticklabels(models)
958
+ # axes[1, 2].set_yticklabels(metrics_for_heatmap)
959
+ # axes[1, 2].set_title('Score Heatmap')
960
+
961
+ # # Add text annotations
962
+ # for i in range(len(metrics_for_heatmap)):
963
+ # for j in range(len(models)):
964
+ # text = axes[1, 2].text(j, i, f'{heatmap_data[i][j]:.3f}',
965
+ # ha="center", va="center", color="black", fontsize=8)
966
+
967
+ # plt.colorbar(im, ax=axes[1, 2])
968
+
969
+ # plt.suptitle('BLEU & ROUGE Benchmark Results', fontsize=16, fontweight='bold')
970
+ # plt.tight_layout()
971
+ # plt.savefig(save_path, dpi=300, bbox_inches='tight')
972
+ # print(f"✅ Visualization saved to {save_path}")
973
+
974
+ # plt.show()
975
+
976
+ # def save_results(self, output_path: str = "bleu_rouge_results.json"):
977
+ # """Save benchmark results to JSON"""
978
+ # # Convert numpy types to Python native types for JSON serialization
979
+ # def convert_to_native(obj):
980
+ # if isinstance(obj, np.floating):
981
+ # return float(obj)
982
+ # elif isinstance(obj, np.integer):
983
+ # return int(obj)
984
+ # elif isinstance(obj, np.ndarray):
985
+ # return obj.tolist()
986
+ # elif isinstance(obj, dict):
987
+ # return {k: convert_to_native(v) for k, v in obj.items()}
988
+ # elif isinstance(obj, list):
989
+ # return [convert_to_native(item) for item in obj]
990
+ # return obj
991
+
992
+ # results_native = convert_to_native(self.results)
993
+
994
+ # with open(output_path, 'w', encoding='utf-8') as f:
995
+ # json.dump(results_native, f, ensure_ascii=False, indent=2)
996
+ # print(f"✅ Results saved to {output_path}")
997
+
998
+ # def generate_detailed_report(self, output_path: str = "bleu_rouge_report.md"):
999
+ # """Generate detailed markdown report"""
1000
+ # if not self.results:
1001
+ # print("❌ No results to report. Run benchmark first.")
1002
+ # return
1003
+
1004
+ # report = f"""# BLEU & ROUGE Benchmark Report
1005
+ # Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
1006
+
1007
+ # ## Executive Summary
1008
+
1009
+ # Comprehensive evaluation of the fine-tuned counseling model using BLEU and ROUGE metrics.
1010
+
1011
+ # ### Overall Performance
1012
+ # - **Base Model Score**: {self.results['summary']['overall']['base']:.3f}
1013
+ # - **Fine-tuned Model Score**: {self.results['summary']['overall']['finetuned']:.3f}
1014
+ # - **Overall Improvement**: {self.results['summary']['overall']['improvement']:+.1f}%
1015
+
1016
+ # ## Detailed Metrics
1017
+
1018
+ # ### BLEU Scores
1019
+ # | Metric | Base Model | Fine-tuned Model | Improvement |
1020
+ # |--------|------------|------------------|-------------|
1021
+ # """
1022
+
1023
+ # for metric in ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']:
1024
+ # scores = self.results['aggregate_scores'][metric]
1025
+ # report += f"| {metric} | {scores['base_mean']:.3f} (±{scores['base_std']:.3f}) | "
1026
+ # report += f"{scores['finetuned_mean']:.3f} (±{scores['finetuned_std']:.3f}) | "
1027
+ # report += f"{self.results['improvements'][metric]:+.1f}% |\n"
1028
+
1029
+ # report += f"""
1030
+
1031
+ # **BLEU Average**: {self.results['summary']['bleu_average']['improvement']:+.1f}% improvement
1032
+
1033
+ # ### ROUGE Scores
1034
+ # | Metric | Base Model | Fine-tuned Model | Improvement |
1035
+ # |--------|------------|------------------|-------------|
1036
+ # """
1037
+
1038
+ # for metric in ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']:
1039
+ # scores = self.results['aggregate_scores'][metric]
1040
+ # report += f"| {metric} | {scores['base_mean']:.3f} (±{scores['base_std']:.3f}) | "
1041
+ # report += f"{scores['finetuned_mean']:.3f} (±{scores['finetuned_std']:.3f}) | "
1042
+ # report += f"{self.results['improvements'][metric]:+.1f}% |\n"
1043
+
1044
+ # report += f"""
1045
+
1046
+ # **ROUGE Average**: {self.results['summary']['rouge_average']['improvement']:+.1f}% improvement
1047
+
1048
+ # ## Sample Outputs
1049
+
1050
+ # """
1051
+
1052
+ # # Add sample outputs
1053
+ # for i, result in enumerate(self.results['detailed_results'][:3]):
1054
+ # report += f"""### Example {i+1}
1055
+
1056
+ # **Input**: {result['input']}
1057
+
1058
+ # **Reference**: {result['reference'][:200]}...
1059
+
1060
+ # **Base Model Response**: {result['base_response'][:200]}...
1061
+
1062
+ # **Fine-tuned Model Response**: {result['finetuned_response'][:200]}...
1063
+
1064
+ # **Scores**:
1065
+ # - Base BLEU-4: {result['base_scores']['BLEU-4']:.3f}, ROUGE-L: {result['base_scores']['ROUGE-L']:.3f}
1066
+ # - Fine-tuned BLEU-4: {result['finetuned_scores']['BLEU-4']:.3f}, ROUGE-L: {result['finetuned_scores']['ROUGE-L']:.3f}
1067
+
1068
+ # ---
1069
+
1070
+ # """
1071
+
1072
+ # report += """## Analysis & Recommendations
1073
+
1074
+ # """
1075
+
1076
+ # overall_imp = self.results['summary']['overall']['improvement']
1077
+
1078
+ # if overall_imp < -10:
1079
+ # report += """### ⚠️ Significant Performance Degradation
1080
+
1081
+ # The fine-tuned model shows significant degradation in BLEU/ROUGE scores. This indicates:
1082
+
1083
+ # 1. **Catastrophic Forgetting**: The model has lost its language generation capabilities
1084
+ # 2. **Overfitting**: The model memorized training data instead of learning patterns
1085
+ # 3. **Format Mismatch**: Training and inference formats may differ
1086
+
1087
+ # **Immediate Actions Required**:
1088
+ # - ✅ Ensure proper model merging (LoRA weights with base model)
1089
+ # - ✅ Reduce learning rate (try 1e-5 or 2e-5)
1090
+ # - ✅ Use smaller LoRA rank (r=4 or r=8)
1091
+ # - ✅ Mix general conversation data with counseling data (80/20 ratio)
1092
+ # - ✅ Implement regularization (weight decay=0.1, dropout=0.1)
1093
+ # - ✅ Use early stopping with patience=3
1094
+ # """
1095
+ # elif overall_imp < 0:
1096
+ # report += """### ⚠️ Minor Performance Degradation
1097
+
1098
+ # The model shows slight degradation. Common causes:
1099
+
1100
+ # 1. **Aggressive Fine-tuning**: Parameters changed too much
1101
+ # 2. **Limited Training Data**: Not enough diverse examples
1102
+ # 3. **Domain Shift**: Counseling domain too different from base training
1103
+
1104
+ # **Recommended Actions**:
1105
+ # - ✅ Fine-tune for fewer epochs (1-2 instead of 3)
1106
+ # - ✅ Use gradient accumulation for larger effective batch size
1107
+ # - ✅ Implement knowledge distillation from base model
1108
+ # - ✅ Add more diverse training examples
1109
+ # """
1110
+ # elif overall_imp < 10:
1111
+ # report += """### 📊 Modest Improvement
1112
+
1113
+ # The model shows small but positive improvements.
1114
+
1115
+ # **To Further Improve**:
1116
+ # - ✅ Increase training data quality and quantity
1117
+ # - ✅ Experiment with different generation parameters
1118
+ # - ✅ Fine-tune on domain-specific pre-training
1119
+ # - ✅ Use ensemble methods with base model
1120
+ # """
1121
+ # else:
1122
+ # report += """### ✅ Significant Improvement
1123
+
1124
+ # Excellent results! The fine-tuned model shows substantial improvements.
1125
+
1126
+ # **Next Steps**:
1127
+ # - ✅ Deploy for A/B testing with users
1128
+ # - ✅ Monitor performance on edge cases
1129
+ # - ✅ Consider model compression for deployment
1130
+ # - ✅ Collect user feedback for iterative improvement
1131
+ # """
1132
+
1133
+ # with open(output_path, 'w', encoding='utf-8') as f:
1134
+ # f.write(report)
1135
+
1136
+ # print(f"✅ Detailed report saved to {output_path}")
1137
+
1138
+ # # Main execution
1139
+ # if __name__ == "__main__":
1140
+ # import argparse
1141
+
1142
+ # parser = argparse.ArgumentParser(description='Advanced BLEU & ROUGE Benchmark')
1143
+ # parser.add_argument('--base_model', type=str, default='LiquidAI/LFM2-2.6B',
1144
+ # help='Base model name')
1145
+ # parser.add_argument('--finetuned_path', type=str, default='./counselor_model/best_model',
1146
+ # help='Path to fine-tuned model')
1147
+ # parser.add_argument('--merged_path', type=str, default='./merged_counselor_mode_2b',
1148
+ # help='Path to save/load merged model')
1149
+ # parser.add_argument('--test_data', type=str, default='./processed_data_score80/test.jsonl',
1150
+ # help='Path to test data')
1151
+ # parser.add_argument('--num_samples', type=int, default=None,
1152
+ # help='Number of samples to evaluate (None for all)')
1153
+ # parser.add_argument('--force_merge', action='store_true',
1154
+ # help='Force re-merge even if merged model exists')
1155
+ # parser.add_argument('--skip_merge', action='store_true',
1156
+ # help='Skip merging step')
1157
+ # parser.add_argument('--output_dir', type=str, default='./benchmark_results',
1158
+ # help='Directory to save results')
1159
+
1160
+ # args = parser.parse_args()
1161
+
1162
+ # # Create output directory
1163
+ # os.makedirs(args.output_dir, exist_ok=True)
1164
+
1165
+ # try:
1166
+ # # Initialize benchmark
1167
+ # print("🚀 Initializing Advanced BLEU & ROUGE Benchmark")
1168
+ # benchmark = AdvancedCounselorBenchmark(
1169
+ # base_model_name=args.base_model,
1170
+ # finetuned_model_path=args.finetuned_path,
1171
+ # merged_model_path=args.merged_path,
1172
+ # test_data_path=args.test_data
1173
+ # )
1174
+
1175
+ # # Merge models if needed
1176
+ # if not args.skip_merge:
1177
+ # benchmark.merge_and_save_model(force_merge=args.force_merge)
1178
+
1179
+ # # Load models
1180
+ # benchmark.load_models()
1181
+
1182
+ # # Run BLEU & ROUGE benchmark
1183
+ # results = benchmark.run_bleu_rouge_benchmark(num_samples=args.num_samples)
1184
+
1185
+ # # Save results
1186
+ # benchmark.save_results(os.path.join(args.output_dir, "bleu_rouge_results_2b.json"))
1187
+
1188
+ # # Generate visualizations
1189
+ # benchmark.visualize_results(os.path.join(args.output_dir, "bleu_rouge_visualization_2b.png"))
1190
+
1191
+ # # Generate detailed report
1192
+ # benchmark.generate_detailed_report(os.path.join(args.output_dir, "bleu_rouge_report_2b.md"))
1193
+
1194
+ # print("\n✅ BLEU & ROUGE Benchmarking completed successfully!")
1195
+ # print(f"📁 Results saved to {args.output_dir}/")
1196
+
1197
+ # except Exception as e:
1198
+ # print(f"\n❌ Error during benchmarking: {e}")
1199
+ # import traceback
1200
+ # traceback.print_exc()
1201
+
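For reference, benchmark_on_test_set() in benchmark.py above assumes each line of test.jsonl carries a single "text" field in which the prompt and the reference reply are separated by the literal "### Response:" marker. A minimal sketch of a record in that shape, and of the split the script performs (illustrative example only; the wording and file name are assumptions, not part of the committed files):

import json

# Illustrative record in the format the benchmark expects (assumed example).
record = {
    "text": (
        "### Input:\n最近ストレスを感じています。\n\n"
        "### Response:\nストレスを感じているのですね。どのような状況で感じることが多いですか?"
    )
}
with open("test_example.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")

# benchmark_on_test_set() then recovers prompt and reference exactly like this:
prompt = record["text"].split("### Response:")[0] + "### Response:"
reference = record["text"].split("### Response:")[1].strip()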
benchmark_report.txt ADDED
@@ -0,0 +1,33 @@
1
+ ================================================================================
2
+ COUNSELOR MODEL BENCHMARK REPORT
3
+ ================================================================================
4
+
5
+ PERFORMANCE COMPARISON:
6
+ ----------------------------------------
7
+
8
+ LENGTH_SCORE:
9
+ Base Model: 0.809 (±0.153)
10
+ Fine-tuned Model: 0.840 (±0.174)
11
+ Improvement: +3.8%
12
+
13
+ QUESTION_SCORE:
14
+ Base Model: 0.660 (±0.474)
15
+ Fine-tuned Model: 0.850 (±0.357)
16
+ Improvement: +28.8%
17
+
18
+ SUPPORT_SCORE:
19
+ Base Model: 0.248 (±0.184)
20
+ Fine-tuned Model: 0.088 (±0.124)
21
+ Improvement: -64.5%
22
+
23
+ EMPATHY_SCORE:
24
+ Base Model: 0.262 (±0.086)
25
+ Fine-tuned Model: 0.152 (±0.114)
26
+ Improvement: -42.0%
27
+
28
+ ========================================
29
+ OVERALL PERFORMANCE:
30
+ Base Model: 0.495
31
+ Fine-tuned Model: 0.483
32
+ Overall Improvement: -2.5%
33
+ ========================================
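The OVERALL PERFORMANCE figures above are the unweighted mean of the four per-metric means computed by generate_comparison_report() in benchmark.py. A quick sketch reproducing them from the rounded values printed in this report (the report itself uses the unrounded means):

import numpy as np

base_means = [0.809, 0.660, 0.248, 0.262]  # length, question, support, empathy (base model)
ft_means = [0.840, 0.850, 0.088, 0.152]    # same metrics, fine-tuned model

base_overall = np.mean(base_means)                              # 0.49475 -> reported as 0.495
ft_overall = np.mean(ft_means)                                  # 0.48250 -> reported as 0.483
improvement = (ft_overall - base_overall) / base_overall * 100  # roughly -2.5%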
benchmark_report_2b-Copy1.txt ADDED
@@ -0,0 +1,23 @@
1
+ ================================================================================
2
+ COUNSELOR MODEL BENCHMARK REPORT
3
+ ================================================================================
4
+
5
+ PERFORMANCE COMPARISON:
6
+ ----------------------------------------
7
+
8
+ LENGTH_SCORE:
9
+ Base Model: 0.876 (±0.138)
10
+ Fine-tuned Model: 0.956 (±0.135)
11
+ Improvement: +9.2%
12
+
13
+ QUESTION_SCORE:
14
+ Base Model: 0.670 (±0.470)
15
+ Fine-tuned Model: 0.900 (±0.300)
16
+ Improvement: +34.3%
17
+
18
+ ========================================
19
+ OVERALL PERFORMANCE:
20
+ Base Model: 0.773
21
+ Fine-tuned Model: 0.928
22
+ Overall Improvement: +20.1%
23
+ ========================================
benchmark_report_2b.txt ADDED
@@ -0,0 +1,23 @@
1
+ ================================================================================
2
+ COUNSELOR MODEL BENCHMARK REPORT
3
+ ================================================================================
4
+
5
+ PERFORMANCE COMPARISON:
6
+ ----------------------------------------
7
+
8
+ LENGTH_SCORE:
9
+ Base Model: 0.876 (±0.138)
10
+ Fine-tuned Model: 0.956 (±0.135)
11
+ Improvement: +9.2%
12
+
13
+ QUESTION_SCORE:
14
+ Base Model: 0.670 (±0.470)
15
+ Fine-tuned Model: 0.900 (±0.300)
16
+ Improvement: +34.3%
17
+
18
+ ========================================
19
+ OVERALL PERFORMANCE:
20
+ Base Model: 0.773
21
+ Fine-tuned Model: 0.928
22
+ Overall Improvement: +20.1%
23
+ ========================================
benchmark_report_v2.txt ADDED
@@ -0,0 +1,23 @@
1
+ ================================================================================
2
+ COUNSELOR MODEL BENCHMARK REPORT
3
+ ================================================================================
4
+
5
+ PERFORMANCE COMPARISON:
6
+ ----------------------------------------
7
+
8
+ LENGTH_SCORE:
9
+ Base Model: 0.785 (±0.146)
10
+ Fine-tuned Model: 0.822 (±0.189)
11
+ Improvement: +4.8%
12
+
13
+ QUESTION_SCORE:
14
+ Base Model: 0.680 (±0.466)
15
+ Fine-tuned Model: 0.870 (±0.336)
16
+ Improvement: +27.9%
17
+
18
+ ========================================
19
+ OVERALL PERFORMANCE:
20
+ Base Model: 0.732
21
+ Fine-tuned Model: 0.846
22
+ Overall Improvement: +15.5%
23
+ ========================================
benchmark_report_wo_merging.txt ADDED
@@ -0,0 +1,33 @@
1
+ ================================================================================
2
+ COUNSELOR MODEL BENCHMARK REPORT
3
+ ================================================================================
4
+
5
+ PERFORMANCE COMPARISON:
6
+ ----------------------------------------
7
+
8
+ LENGTH_SCORE:
9
+ Base Model: 0.807 (±0.154)
10
+ Fine-tuned Model: 0.808 (±0.202)
11
+ Improvement: +0.1%
12
+
13
+ QUESTION_SCORE:
14
+ Base Model: 0.670 (±0.470)
15
+ Fine-tuned Model: 0.910 (±0.286)
16
+ Improvement: +35.8%
17
+
18
+ SUPPORT_SCORE:
19
+ Base Model: 0.236 (±0.186)
20
+ Fine-tuned Model: 0.082 (±0.120)
21
+ Improvement: -65.3%
22
+
23
+ EMPATHY_SCORE:
24
+ Base Model: 0.267 (±0.099)
25
+ Fine-tuned Model: 0.141 (±0.100)
26
+ Improvement: -47.2%
27
+
28
+ ========================================
29
+ OVERALL PERFORMANCE:
30
+ Base Model: 0.495
31
+ Fine-tuned Model: 0.485
32
+ Overall Improvement: -2.0%
33
+ ========================================
benchmark_results.png ADDED

Git LFS Details

  • SHA256: d7df2f6780b3503d0d5eabe5374989e62691bbdd5528a1aa61cf69d9181550d6
  • Pointer size: 131 Bytes
  • Size of remote file: 155 kB
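Note that the image itself is not stored in the Git history: the commit adds only a small Git LFS pointer (hence the 131-byte pointer size), which is why .gitattributes above routes benchmark_results.png through the LFS filter. Such a pointer file contains roughly the following (the exact byte count is not shown on this page; 155 kB is the rounded remote size):

version https://git-lfs.github.com/spec/v1
oid sha256:d7df2f6780b3503d0d5eabe5374989e62691bbdd5528a1aa61cf69d9181550d6
size <exact size in bytes, about 155000>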
benchmark_v1.py ADDED
@@ -0,0 +1,803 @@
1
+ """
2
+ Comprehensive Japanese Counseling Model Benchmark Script
3
+ Based on KokoroChat paper evaluation methodology
4
+ """
5
+
6
+ import torch
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer
8
+ import numpy as np
9
+ from typing import List, Dict, Tuple, Optional, Any
10
+ import json
11
+ from tqdm import tqdm
12
+ import os
13
+ import gc
14
+ import warnings
15
+ from datetime import datetime
16
+ import pandas as pd
17
+ import matplotlib.pyplot as plt
18
+ import seaborn as sns
19
+ from collections import defaultdict
20
+ import MeCab
21
+ from rouge_score import rouge_scorer
22
+ from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
23
+ import sacrebleu
24
+ from bert_score import score as bert_score
25
+ import re
26
+ import statistics
27
+
28
+ warnings.filterwarnings('ignore')
29
+
30
+ # Set style for better visualizations
31
+ plt.style.use('seaborn-v0_8-darkgrid')
32
+ sns.set_palette("husl")
33
+
34
+ class JapaneseCounselingBenchmark:
35
+ """
36
+ Comprehensive benchmark suite for Japanese counseling models
37
+ Following KokoroChat paper evaluation methodology
38
+ """
39
+
40
+ def __init__(self,
41
+ base_model_name: str = "LiquidAI/LFM2-1.2B",
42
+ finetuned_model_path: str = "./merged_counselor_model",
43
+ test_data_path: str = "./processed_data_score70/test.jsonl",
44
+ device: str = None):
45
+ """
46
+ Initialize Japanese counseling benchmark
47
+
48
+ Args:
49
+ base_model_name: Name/path of base model
50
+ finetuned_model_path: Path to fine-tuned merged model
51
+ test_data_path: Path to test dataset
52
+ device: Device to run on (cuda/cpu)
53
+ """
54
+ self.base_model_name = base_model_name
55
+ self.finetuned_model_path = finetuned_model_path
56
+ self.test_data_path = test_data_path
57
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
58
+
59
+ print("="*80)
60
+ print("🎌 Japanese Counseling Model Benchmark Suite")
61
+ print("="*80)
62
+ print(f"📍 Device: {self.device}")
63
+ if self.device == "cuda":
64
+ print(f" GPU: {torch.cuda.get_device_name(0)}")
65
+ print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
66
+
67
+ # Initialize MeCab for Japanese tokenization
68
+ try:
69
+ self.mecab = MeCab.Tagger("-Owakati") # Wakati-gaki mode for word segmentation
70
+ print("✅ MeCab initialized for Japanese tokenization")
71
+ except:
72
+ print("⚠️ MeCab not available. Install with: apt-get install mecab libmecab-dev mecab-ipadic-utf8")
73
+ print(" and: pip install mecab-python3")
74
+ print(" Using fallback character-level tokenization")
75
+ self.mecab = None
76
+
77
+ # Initialize ROUGE scorer (without lang parameter)
78
+ self.rouge_scorer = rouge_scorer.RougeScorer(
79
+ ['rouge1', 'rouge2', 'rougeL'],
80
+ use_stemmer=False # Don't use stemming for Japanese
81
+ )
82
+
83
+ # Smoothing function for BLEU
84
+ self.smoothing = SmoothingFunction().method1
85
+
86
+ # Results storage
87
+ self.results = {}
88
+ self.detailed_results = []
89
+
90
+ def tokenize_japanese(self, text: str) -> List[str]:
91
+ """
92
+ Tokenize Japanese text using MeCab or fallback method
93
+
94
+ Args:
95
+ text: Japanese text to tokenize
96
+
97
+ Returns:
98
+ List of tokens
99
+ """
100
+ if self.mecab:
101
+ try:
102
+ # Use MeCab for proper Japanese tokenization
103
+ tokens = self.mecab.parse(text).strip().split()
104
+ return tokens if tokens else list(text)
105
+ except:
106
+ # Fallback if MeCab fails
107
+ pass
108
+
109
+ # Fallback to character-level tokenization
110
+ # Remove punctuation and split
111
+ text = re.sub(r'[。、!?\n\s]', ' ', text)
112
+ # Split by spaces and then into characters
113
+ words = text.split()
114
+ if words:
115
+ # Try to keep some word boundaries
116
+ tokens = []
117
+ for word in words:
118
+ if len(word) <= 4: # Keep short words together
119
+ tokens.append(word)
120
+ else: # Split longer words into characters
121
+ tokens.extend(list(word))
122
+ return tokens
123
+ else:
124
+ # Pure character-level tokenization
125
+ return list(text.replace(' ', ''))
126
+
127
+ def load_test_data(self, max_samples: Optional[int] = None) -> List[Dict]:
128
+ """
129
+ Load test dataset
130
+
131
+ Args:
132
+ max_samples: Maximum number of samples to load
133
+
134
+ Returns:
135
+ List of test examples
136
+ """
137
+ print(f"\n📚 Loading test data from {self.test_data_path}")
138
+
139
+ test_data = []
140
+
141
+ if not os.path.exists(self.test_data_path):
142
+ print(f"❌ Test data not found at {self.test_data_path}")
143
+ print(" Creating synthetic test data for demonstration...")
144
+ return self.create_synthetic_test_data()
145
+
146
+ with open(self.test_data_path, 'r', encoding='utf-8') as f:
147
+ for i, line in enumerate(f):
148
+ if max_samples and i >= max_samples:
149
+ break
150
+ try:
151
+ data = json.loads(line)
152
+
153
+ # Parse the text field to extract input and response
154
+ text = data.get('text', '')
155
+
156
+ # Extract input and reference response
157
+ if "### Input:" in text and "### Response:" in text:
158
+ parts = text.split("### Input:")
159
+ if len(parts) > 1:
160
+ input_part = parts[1].split("### Response:")[0].strip()
161
+ response_part = text.split("### Response:")[1].strip()
162
+
163
+ test_data.append({
164
+ 'input': input_part,
165
+ 'reference': response_part,
166
+ 'score': data.get('score', 0),
167
+ 'topic': data.get('topic', 'Unknown')
168
+ })
169
+ except Exception as e:
170
+ print(f"⚠️ Error parsing line {i}: {e}")
171
+ continue
172
+
173
+ if not test_data:
174
+ print("⚠️ No valid test data found. Creating synthetic data...")
175
+ return self.create_synthetic_test_data()
176
+
177
+ print(f"✅ Loaded {len(test_data)} test examples")
178
+ return test_data
179
+
180
+ def create_synthetic_test_data(self) -> List[Dict]:
181
+ """Create synthetic test data for demonstration"""
182
+ synthetic_data = [
183
+ {
184
+ 'input': '最近ストレスを感じています。',
185
+ 'reference': 'ストレスを感じているのですね。それは大変つらいことだと思います。どのような状況でストレスを感じることが多いですか?',
186
+ 'score': 75,
187
+ 'topic': 'ストレス'
188
+ },
189
+ {
190
+ 'input': '仕事がうまくいかなくて悩んでいます。',
191
+ 'reference': '仕事でお悩みなのですね。うまくいかないと感じると、本当に辛いですよね。具体的にどのような点で困難を感じていらっしゃいますか?',
192
+ 'score': 78,
193
+ 'topic': '仕事'
194
+ },
195
+ {
196
+ 'input': '人間関係で困っています。',
197
+ 'reference': '人間関係の悩みは本当に心が疲れますよね。お気持ちお察しします。どのような関係性でお困りでしょうか?',
198
+ 'score': 80,
199
+ 'topic': '人間関係'
200
+ },
201
+ {
202
+ 'input': '将来が不安です。',
203
+ 'reference': '将来への不安を抱えていらっしゃるのですね。先が見えない不安は、とても重く感じられることと思います。',
204
+ 'score': 72,
205
+ 'topic': '不安'
206
+ },
207
+ {
208
+ 'input': '自信が持てません。',
209
+ 'reference': '自信が持てないというお気持ち、よくわかります。多くの方が同じような悩みを抱えています。',
210
+ 'score': 76,
211
+ 'topic': '自信'
212
+ }
213
+ ]
214
+ return synthetic_data
215
+
216
+ def load_models(self):
217
+ """Load base and fine-tuned models"""
218
+ print("\n🤖 Loading models for benchmarking...")
219
+
220
+ # Load tokenizer
221
+ print(" Loading tokenizer...")
222
+ try:
223
+ self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
224
+ except:
225
+ print(" Using GPT2 tokenizer as fallback...")
226
+ self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
227
+
228
+ if self.tokenizer.pad_token is None:
229
+ self.tokenizer.pad_token = self.tokenizer.eos_token
230
+
231
+ # Load base model
232
+ print(" Loading base model...")
233
+ try:
234
+ self.base_model = AutoModelForCausalLM.from_pretrained(
235
+ self.base_model_name,
236
+ torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
237
+ device_map="auto" if self.device == "cuda" else None,
238
+ trust_remote_code=True,
239
+ low_cpu_mem_usage=True
240
+ )
241
+ except Exception as e:
242
+ print(f" ⚠️ Could not load base model {self.base_model_name}: {e}")
243
+ print(" Using GPT2 as fallback base model...")
244
+ self.base_model = AutoModelForCausalLM.from_pretrained(
245
+ "gpt2",
246
+ torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
247
+ device_map="auto" if self.device == "cuda" else None
248
+ )
249
+ self.base_model.eval()
250
+
251
+ # Load fine-tuned model
252
+ print(f" Loading fine-tuned model from {self.finetuned_model_path}...")
253
+
254
+ # Check if model exists
255
+ if not os.path.exists(self.finetuned_model_path):
256
+ print(f" ⚠️ Fine-tuned model not found at {self.finetuned_model_path}")
257
+ print(" Using base model for both comparisons (for demonstration)")
258
+ self.finetuned_model = self.base_model
259
+ else:
260
+ try:
261
+ self.finetuned_model = AutoModelForCausalLM.from_pretrained(
262
+ self.finetuned_model_path,
263
+ torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
264
+ device_map="auto" if self.device == "cuda" else None,
265
+ trust_remote_code=True,
266
+ low_cpu_mem_usage=True,
267
+ local_files_only=True
268
+ )
269
+ self.finetuned_model.eval()
270
+ except Exception as e:
271
+ print(f" ⚠️ Error loading fine-tuned model: {e}")
272
+ print(" Using base model for comparison")
273
+ self.finetuned_model = self.base_model
274
+
275
+ print("✅ Models loaded successfully!")
276
+
277
+ def generate_response(self, model, prompt: str, max_length: int = 150) -> str:
278
+ """
279
+ Generate response from model
280
+
281
+ Args:
282
+ model: Model to use for generation
283
+ prompt: Input prompt
284
+ max_length: Maximum length of generated response
285
+
286
+ Returns:
287
+ Generated response text
288
+ """
289
+ # Format prompt for counseling
290
+ formatted_prompt = f"""### Instruction:
291
+ あなたは思いやりのある心理カウンセラーです。
292
+ クライアントの感情を理解し、共感的で支援的な応答を提供してください。
293
+
294
+ ### Input:
295
+ {prompt}
296
+
297
+ ### Response:
298
+ """
299
+
300
+ # Tokenize input
301
+ inputs = self.tokenizer(
302
+ formatted_prompt,
303
+ return_tensors="pt",
304
+ truncation=True,
305
+ max_length=512
306
+ )
307
+
308
+ if self.device == "cuda":
309
+ inputs = {k: v.cuda() for k, v in inputs.items()}
310
+
311
+ # Generate response
312
+ try:
313
+ with torch.no_grad():
314
+ outputs = model.generate(
315
+ **inputs,
316
+ max_new_tokens=max_length,
317
+ temperature=0.7,
318
+ do_sample=True,
319
+ top_p=0.9,
320
+ repetition_penalty=1.1,
321
+ pad_token_id=self.tokenizer.pad_token_id,
322
+ eos_token_id=self.tokenizer.eos_token_id
323
+ )
324
+
325
+ # Decode response
326
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
327
+
328
+ # Extract only the generated response
329
+ if "### Response:" in full_response:
330
+ response = full_response.split("### Response:")[-1].strip()
331
+ else:
332
+ response = full_response[len(formatted_prompt):].strip()
333
+ except Exception as e:
334
+ print(f" ⚠️ Generation error: {e}")
335
+ response = "申し訳ございません。応答を生成できませんでした。"
336
+
337
+ return response
338
+
339
+ def calculate_bleu_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
340
+ """
341
+ Calculate BLEU scores using Japanese tokenization
342
+
343
+ Args:
344
+ reference: Reference text
345
+ hypothesis: Generated text
346
+
347
+ Returns:
348
+ Dictionary of BLEU scores
349
+ """
350
+ # Tokenize using MeCab or fallback
351
+ ref_tokens = self.tokenize_japanese(reference)
352
+ hyp_tokens = self.tokenize_japanese(hypothesis)
353
+
354
+ # Ensure we have tokens
355
+ if not ref_tokens:
356
+ ref_tokens = ['empty']
357
+ if not hyp_tokens:
358
+ hyp_tokens = ['empty']
359
+
360
+ # Calculate BLEU scores
361
+ scores = {}
362
+
363
+ try:
364
+ # BLEU-1 through BLEU-4
365
+ for n in range(1, 5):
366
+ weights = tuple([1/n] * n + [0] * (4-n))
367
+ score = sentence_bleu(
368
+ [ref_tokens],
369
+ hyp_tokens,
370
+ weights=weights,
371
+ smoothing_function=self.smoothing
372
+ )
373
+ scores[f'BLEU-{n}'] = score
374
+ except Exception as e:
375
+ print(f" ⚠️ BLEU calculation error: {e}")
376
+ for n in range(1, 5):
377
+ scores[f'BLEU-{n}'] = 0.0
378
+
379
+ return scores
380
+
381
+ def calculate_rouge_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
382
+ """
383
+ Calculate ROUGE scores for Japanese text
384
+
385
+ Args:
386
+ reference: Reference text
387
+ hypothesis: Generated text
388
+
389
+ Returns:
390
+ Dictionary of ROUGE scores
391
+ """
392
+ try:
393
+ # For Japanese, we need to add spaces between tokens for ROUGE scorer
394
+ if self.mecab:
395
+ ref_tokenized = ' '.join(self.tokenize_japanese(reference))
396
+ hyp_tokenized = ' '.join(self.tokenize_japanese(hypothesis))
397
+ else:
398
+ # Character-level with spaces
399
+ ref_tokenized = ' '.join(list(reference))
400
+ hyp_tokenized = ' '.join(list(hypothesis))
401
+
402
+ # Calculate ROUGE scores
403
+ scores = self.rouge_scorer.score(ref_tokenized, hyp_tokenized)
404
+
405
+ return {
406
+ 'ROUGE-1': scores['rouge1'].fmeasure,
407
+ 'ROUGE-2': scores['rouge2'].fmeasure,
408
+ 'ROUGE-L': scores['rougeL'].fmeasure
409
+ }
410
+ except Exception as e:
411
+ print(f" ⚠️ ROUGE calculation error: {e}")
412
+ return {
413
+ 'ROUGE-1': 0.0,
414
+ 'ROUGE-2': 0.0,
415
+ 'ROUGE-L': 0.0
416
+ }
417
+
418
+ def calculate_bert_score(self, references: List[str], hypotheses: List[str]) -> Dict[str, float]:
419
+ """
420
+ Calculate BERTScore for semantic similarity
421
+
422
+ Args:
423
+ references: List of reference texts
424
+ hypotheses: List of generated texts
425
+
426
+ Returns:
427
+ Dictionary with BERTScore metrics
428
+ """
429
+ try:
430
+ # Calculate BERTScore
431
+ P, R, F1 = bert_score(
432
+ hypotheses,
433
+ references,
434
+ lang='ja',
435
+ verbose=False,
436
+ device=self.device
437
+ )
438
+
439
+ return {
440
+ 'BERTScore_P': float(P.mean()),
441
+ 'BERTScore_R': float(R.mean()),
442
+ 'BERTScore_F1': float(F1.mean())
443
+ }
444
+ except Exception as e:
445
+ print(f" ⚠️ BERTScore calculation failed: {e}")
446
+ print(" Install with: pip install bert-score")
447
+ return {
448
+ 'BERTScore_P': 0.0,
449
+ 'BERTScore_R': 0.0,
450
+ 'BERTScore_F1': 0.0
451
+ }
452
+
453
+ def evaluate_counseling_quality(self, response: str) -> Dict[str, float]:
454
+ """
455
+ Evaluate counseling-specific qualities
456
+ Based on KokoroChat paper evaluation criteria
457
+
458
+ Args:
459
+ response: Generated counseling response
460
+
461
+ Returns:
462
+ Dictionary of counseling quality scores
463
+ """
464
+ scores = {}
465
+
466
+ # 1. Empathy Score (共感度)
467
+ empathy_keywords = [
468
+ 'わかります', '理解', '共感', 'お気持ち', 'つらい',
469
+ '大変', 'お察し', 'そうですね', 'なるほど', '感じ'
470
+ ]
471
+ empathy_score = sum(1 for keyword in empathy_keywords if keyword in response)
472
+ scores['empathy'] = min(empathy_score / 5.0, 1.0) # Normalize to 0-1
473
+
474
+ # 2. Support Score (支援度)
475
+ support_keywords = [
476
+ 'サポート', '支援', '助け', '一緒に', '協力',
477
+ '応援', 'お手伝い', '力になり', '相談', '話を聞'
478
+ ]
479
+ support_score = sum(1 for keyword in support_keywords if keyword in response)
480
+ scores['support'] = min(support_score / 5.0, 1.0)
481
+
482
+ # 3. Active Listening (傾聴)
483
+ listening_indicators = ['?', 'でしょうか', 'ですか', 'いかがですか', 'どのような']
484
+ scores['active_listening'] = 1.0 if any(ind in response for ind in listening_indicators) else 0.3
485
+
486
+ # 4. Positivity (前向きさ)
487
+ positive_keywords = ['大丈夫', '良い', '素晴らしい', '頑張', '希望', '改善', '解決']
488
+ positive_score = sum(1 for keyword in positive_keywords if keyword in response)
489
+ scores['positivity'] = min(positive_score / 3.0, 1.0)
490
+
491
+ # 5. Response Appropriateness (応答の適切さ)
492
+ response_length = len(response)
493
+ if 30 <= response_length <= 200:
494
+ scores['appropriateness'] = 1.0
495
+ elif 20 <= response_length < 30 or 200 < response_length <= 300:
496
+ scores['appropriateness'] = 0.7
497
+ else:
498
+ scores['appropriateness'] = 0.4
499
+
500
+ return scores
501
+
502
+ def run_comprehensive_benchmark(self, num_samples: Optional[int] = None):
503
+ """
504
+ Run comprehensive benchmark evaluation
505
+
506
+ Args:
507
+ num_samples: Number of samples to evaluate (None for all)
508
+ """
509
+ print("\n" + "="*80)
510
+ print("🚀 Running Comprehensive Benchmark")
511
+ print("="*80)
512
+
513
+ # Load test data
514
+ test_data = self.load_test_data(max_samples=num_samples)
515
+
516
+ if not test_data:
517
+ raise ValueError("No test data available!")
518
+
519
+ # Initialize metric collectors
520
+ base_metrics = defaultdict(list)
521
+ finetuned_metrics = defaultdict(list)
522
+
523
+ # Collect all responses for BERTScore
524
+ all_references = []
525
+ all_base_responses = []
526
+ all_finetuned_responses = []
527
+
528
+ print(f"\n📊 Evaluating {len(test_data)} test examples...")
529
+ print("-"*80)
530
+
531
+ # Process each test example
532
+ for i, example in enumerate(tqdm(test_data, desc="Evaluating")):
533
+ input_text = example['input']
534
+ reference = example['reference']
535
+
536
+ # Generate responses
537
+ base_response = self.generate_response(self.base_model, input_text)
538
+ finetuned_response = self.generate_response(self.finetuned_model, input_text)
539
+
540
+ # Collect for BERTScore
541
+ all_references.append(reference)
542
+ all_base_responses.append(base_response)
543
+ all_finetuned_responses.append(finetuned_response)
544
+
545
+ # Calculate BLEU scores
546
+ base_bleu = self.calculate_bleu_scores(reference, base_response)
547
+ finetuned_bleu = self.calculate_bleu_scores(reference, finetuned_response)
548
+
549
+ for key, value in base_bleu.items():
550
+ base_metrics[key].append(value)
551
+ for key, value in finetuned_bleu.items():
552
+ finetuned_metrics[key].append(value)
553
+
554
+ # Calculate ROUGE scores
555
+ base_rouge = self.calculate_rouge_scores(reference, base_response)
556
+ finetuned_rouge = self.calculate_rouge_scores(reference, finetuned_response)
557
+
558
+ for key, value in base_rouge.items():
559
+ base_metrics[key].append(value)
560
+ for key, value in finetuned_rouge.items():
561
+ finetuned_metrics[key].append(value)
562
+
563
+ # Evaluate counseling quality
564
+ base_quality = self.evaluate_counseling_quality(base_response)
565
+ finetuned_quality = self.evaluate_counseling_quality(finetuned_response)
566
+
567
+ for key, value in base_quality.items():
568
+ base_metrics[f'quality_{key}'].append(value)
569
+ for key, value in finetuned_quality.items():
570
+ finetuned_metrics[f'quality_{key}'].append(value)
571
+
572
+ # Store detailed results
573
+ self.detailed_results.append({
574
+ 'input': input_text,
575
+ 'reference': reference,
576
+ 'base_response': base_response,
577
+ 'finetuned_response': finetuned_response,
578
+ 'base_metrics': {**base_bleu, **base_rouge, **base_quality},
579
+ 'finetuned_metrics': {**finetuned_bleu, **finetuned_rouge, **finetuned_quality}
580
+ })
581
+
582
+ # Show sample outputs
583
+ if i < 3:
584
+ print(f"\n📝 Example {i+1}:")
585
+ print(f"Input: {input_text[:100]}...")
586
+ print(f"Base BLEU-4: {base_bleu['BLEU-4']:.3f}, Fine-tuned BLEU-4: {finetuned_bleu['BLEU-4']:.3f}")
587
+
588
+ # Calculate BERTScore for all examples
589
+ if len(all_references) > 0:
590
+ print("\n🧮 Calculating BERTScore...")
591
+ base_bert = self.calculate_bert_score(all_references, all_base_responses)
592
+ finetuned_bert = self.calculate_bert_score(all_references, all_finetuned_responses)
593
+
594
+ for key, value in base_bert.items():
595
+ base_metrics[key] = [value] * len(test_data)
596
+ for key, value in finetuned_bert.items():
597
+ finetuned_metrics[key] = [value] * len(test_data)
598
+
599
+ # Calculate aggregate statistics
600
+ self.results = self.calculate_aggregate_statistics(base_metrics, finetuned_metrics)
601
+
602
+ # Print results
603
+ self.print_results()
604
+
605
+ return self.results
606
+
607
+ def calculate_aggregate_statistics(self, base_metrics: Dict, finetuned_metrics: Dict) -> Dict:
608
+ """
609
+ Calculate aggregate statistics from collected metrics
610
+
611
+ Args:
612
+ base_metrics: Base model metrics
613
+ finetuned_metrics: Fine-tuned model metrics
614
+
615
+ Returns:
616
+ Dictionary of aggregate results
617
+ """
618
+ results = {
619
+ 'metrics': {},
620
+ 'improvements': {},
621
+ 'summary': {}
622
+ }
623
+
624
+ # Calculate statistics for each metric
625
+ all_metric_names = set(base_metrics.keys()) | set(finetuned_metrics.keys())
626
+
627
+ for metric in all_metric_names:
628
+ base_values = base_metrics.get(metric, [0])
629
+ finetuned_values = finetuned_metrics.get(metric, [0])
630
+
631
+ results['metrics'][metric] = {
632
+ 'base': {
633
+ 'mean': float(np.mean(base_values)),
634
+ 'std': float(np.std(base_values)),
635
+ 'min': float(np.min(base_values)),
636
+ 'max': float(np.max(base_values))
637
+ },
638
+ 'finetuned': {
639
+ 'mean': float(np.mean(finetuned_values)),
640
+ 'std': float(np.std(finetuned_values)),
641
+ 'min': float(np.min(finetuned_values)),
642
+ 'max': float(np.max(finetuned_values))
643
+ }
644
+ }
645
+
646
+ # Calculate improvement
647
+ base_mean = np.mean(base_values)
648
+ finetuned_mean = np.mean(finetuned_values)
649
+ if base_mean > 0:
650
+ improvement = ((finetuned_mean - base_mean) / base_mean) * 100
651
+ else:
652
+ improvement = 0
653
+
654
+ results['improvements'][metric] = improvement
655
+
656
+ # Calculate summary statistics
657
+ bleu_metrics = [m for m in results['metrics'] if 'BLEU' in m]
658
+ rouge_metrics = [m for m in results['metrics'] if 'ROUGE' in m]
659
+ quality_metrics = [m for m in results['metrics'] if 'quality' in m]
660
+
661
+ # Average improvements
662
+ results['summary'] = {
663
+ 'bleu_avg_improvement': np.mean([results['improvements'][m] for m in bleu_metrics]) if bleu_metrics else 0,
664
+ 'rouge_avg_improvement': np.mean([results['improvements'][m] for m in rouge_metrics]) if rouge_metrics else 0,
665
+ 'quality_avg_improvement': np.mean([results['improvements'][m] for m in quality_metrics]) if quality_metrics else 0,
666
+ 'overall_improvement': np.mean(list(results['improvements'].values())) if results['improvements'] else 0
667
+ }
668
+
669
+ return results
670
+
671
+ def print_results(self):
672
+ """Print formatted benchmark results"""
673
+ print("\n" + "="*80)
674
+ print("📊 BENCHMARK RESULTS")
675
+ print("="*80)
676
+
677
+ # Group metrics by category
678
+ bleu_metrics = sorted([m for m in self.results['metrics'] if 'BLEU' in m])
679
+ rouge_metrics = sorted([m for m in self.results['metrics'] if 'ROUGE' in m])
680
+ bert_metrics = sorted([m for m in self.results['metrics'] if 'BERT' in m])
681
+ quality_metrics = sorted([m for m in self.results['metrics'] if 'quality' in m])
682
+
683
+ # Print BLEU scores
684
+ if bleu_metrics:
685
+ print("\n📘 BLEU Scores:")
686
+ print("-"*60)
687
+ print(f"{'Metric':<15} {'Base Model':<20} {'Fine-tuned':<20} {'Improvement':<15}")
688
+ print("-"*60)
689
+ for metric in bleu_metrics:
690
+ base = self.results['metrics'][metric]['base']['mean']
691
+ finetuned = self.results['metrics'][metric]['finetuned']['mean']
692
+ improvement = self.results['improvements'][metric]
693
+ print(f"{metric:<15} {base:.4f}±{self.results['metrics'][metric]['base']['std']:.3f} "
694
+ f"{finetuned:.4f}±{self.results['metrics'][metric]['finetuned']['std']:.3f} "
695
+ f"{improvement:+.1f}%")
696
+
697
+ # Print ROUGE scores
698
+ if rouge_metrics:
699
+ print("\n📕 ROUGE Scores:")
700
+ print("-"*60)
701
+ for metric in rouge_metrics:
702
+ base = self.results['metrics'][metric]['base']['mean']
703
+ finetuned = self.results['metrics'][metric]['finetuned']['mean']
704
+ improvement = self.results['improvements'][metric]
705
+ print(f"{metric:<15} {base:.4f}±{self.results['metrics'][metric]['base']['std']:.3f} "
706
+ f"{finetuned:.4f}±{self.results['metrics'][metric]['finetuned']['std']:.3f} "
707
+ f"{improvement:+.1f}%")
708
+
709
+ # Print BERTScore
710
+ if bert_metrics:
711
+ print("\n📗 BERTScore:")
712
+ print("-"*60)
713
+ for metric in bert_metrics:
714
+ base = self.results['metrics'][metric]['base']['mean']
715
+ finetuned = self.results['metrics'][metric]['finetuned']['mean']
716
+ improvement = self.results['improvements'][metric]
717
+ print(f"{metric:<15} {base:.4f} {finetuned:.4f} {improvement:+.1f}%")
718
+
719
+ # Print Counseling Quality scores
720
+ if quality_metrics:
721
+ print("\n💬 Counseling Quality Metrics:")
722
+ print("-"*60)
723
+ for metric in quality_metrics:
724
+ base = self.results['metrics'][metric]['base']['mean']
725
+ finetuned = self.results['metrics'][metric]['finetuned']['mean']
726
+ improvement = self.results['improvements'][metric]
727
+ metric_name = metric.replace('quality_', '').capitalize()
728
+ print(f"{metric_name:<15} {base:.4f}±{self.results['metrics'][metric]['base']['std']:.3f} "
729
+ f"{finetuned:.4f}±{self.results['metrics'][metric]['finetuned']['std']:.3f} "
730
+ f"{improvement:+.1f}%")
731
+
732
+ # Print summary
733
+ print("\n" + "="*80)
734
+ print("📈 SUMMARY")
735
+ print("="*80)
736
+ print(f"Average BLEU Improvement: {self.results['summary']['bleu_avg_improvement']:+.1f}%")
737
+ print(f"Average ROUGE Improvement: {self.results['summary']['rouge_avg_improvement']:+.1f}%")
738
+ print(f"Average Quality Improvement: {self.results['summary']['quality_avg_improvement']:+.1f}%")
739
+ print(f"Overall Improvement: {self.results['summary']['overall_improvement']:+.1f}%")
740
+ print("="*80)
741
+
742
+ def save_results(self, output_dir: str = "./benchmark_results"):
743
+ """Save all benchmark results"""
744
+ os.makedirs(output_dir, exist_ok=True)
745
+
746
+ # Save detailed results
747
+ with open(os.path.join(output_dir, "detailed_results.json"), 'w', encoding='utf-8') as f:
748
+ json.dump(self.detailed_results, f, ensure_ascii=False, indent=2, default=str)
749
+
750
+ # Save aggregate results
751
+ with open(os.path.join(output_dir, "aggregate_results.json"), 'w', encoding='utf-8') as f:
752
+ json.dump(self.results, f, ensure_ascii=False, indent=2, default=str)
753
+
754
+ print(f"✅ Results saved to {output_dir}/")
755
+
756
+
757
+ def main():
758
+ """Main execution function"""
759
+ import argparse
760
+
761
+ parser = argparse.ArgumentParser(description='Japanese Counseling Model Benchmark')
762
+ parser.add_argument('--base_model', type=str, default='LiquidAI/LFM2-1.2B',
763
+ help='Base model name or path')
764
+ parser.add_argument('--finetuned_model', type=str, default='./merged_counselor_model',
765
+ help='Path to fine-tuned merged model')
766
+ parser.add_argument('--test_data', type=str, default='./processed_data_score70/test.jsonl',
767
+ help='Path to test data')
768
+ parser.add_argument('--num_samples', type=int, default=None,
769
+ help='Number of samples to evaluate (None for all)')
770
+ parser.add_argument('--output_dir', type=str, default='./benchmark_results',
771
+ help='Directory to save results')
772
+
773
+ args = parser.parse_args()
774
+
775
+ try:
776
+ # Initialize benchmark
777
+ print("🎌 Initializing Japanese Counseling Benchmark Suite")
778
+ benchmark = JapaneseCounselingBenchmark(
779
+ base_model_name=args.base_model,
780
+ finetuned_model_path=args.finetuned_model,
781
+ test_data_path=args.test_data
782
+ )
783
+
784
+ # Load models
785
+ benchmark.load_models()
786
+
787
+ # Run benchmark
788
+ results = benchmark.run_comprehensive_benchmark(num_samples=args.num_samples)
789
+
790
+ # Save results
791
+ benchmark.save_results(args.output_dir)
792
+
793
+ print("\n✅ Benchmark completed successfully!")
794
+ print(f"📁 Results saved to {args.output_dir}/")
795
+
796
+ except Exception as e:
797
+ print(f"\n❌ Error during benchmarking: {e}")
798
+ import traceback
799
+ traceback.print_exc()
800
+
801
+
802
+ if __name__ == "__main__":
803
+ main()
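For reference, benchmark_v1.py can also be driven programmatically rather than through its argparse CLI. A minimal sketch follows; the module import name and the sample count are assumptions for illustration, while the class and method names are the ones defined above:

# Programmatic usage sketch for the benchmark class defined above (paths are the script defaults).
from benchmark_v1 import JapaneseCounselingBenchmark  # assumes the file is importable under this name

benchmark = JapaneseCounselingBenchmark(
    base_model_name="LiquidAI/LFM2-1.2B",
    finetuned_model_path="./merged_counselor_model",
    test_data_path="./processed_data_score70/test.jsonl",
)
benchmark.load_models()
results = benchmark.run_comprehensive_benchmark(num_samples=50)  # None evaluates the full test set
benchmark.save_results("./benchmark_results")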
benchmarking_v2.py ADDED
@@ -0,0 +1,782 @@
1
+ """
2
+ Fixed Optimized Japanese Counseling Model Benchmark with proper DataParallel handling
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.parallel import DataParallel
8
+ from torch.utils.data import Dataset, DataLoader
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+ import numpy as np
11
+ from typing import List, Dict, Tuple, Optional, Any
12
+ import json
13
+ from tqdm import tqdm
14
+ import os
15
+ import gc
16
+ import warnings
17
+ from datetime import datetime
18
+ import pandas as pd
19
+ from collections import defaultdict
20
+ import MeCab
21
+ from rouge_score import rouge_scorer
22
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
23
+ import re
24
+ import wandb
25
+ from concurrent.futures import ThreadPoolExecutor
26
+ import time
27
+
28
+ # Suppress warnings
29
+ warnings.filterwarnings('ignore')
30
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
31
+
32
+ # Suppress Pydantic warnings
33
+ import logging
34
+ logging.getLogger('pydantic').setLevel(logging.ERROR)
35
+
36
+ class TestDataset(Dataset):
37
+ """Custom dataset for efficient batch processing"""
38
+
39
+ def __init__(self, data: List[Dict]):
40
+ self.data = data
41
+
42
+ def __len__(self):
43
+ return len(self.data)
44
+
45
+ def __getitem__(self, idx):
46
+ return self.data[idx]
47
+
48
+ def custom_collate_fn(batch):
49
+ """Custom collate function to handle dictionary data properly"""
50
+ return batch
51
+
52
+ class OptimizedJapaneseBenchmark:
53
+ """
54
+ Highly optimized benchmark suite with multi-GPU support and WandB logging
55
+ """
56
+
57
+ def __init__(self,
58
+ base_model_name: str = "LiquidAI/LFM2-1.2B",
59
+ finetuned_model_path: str = "./merged_counselor_model",
60
+ test_data_path: str = "./processed_data_score80/test.jsonl",
61
+ batch_size: int = 16, # Reduced for stability
62
+ num_workers: int = 0,
63
+ use_wandb: bool = True):
64
+ """
65
+ Initialize optimized benchmark with multi-GPU support
66
+ """
67
+ self.base_model_name = base_model_name
68
+ self.finetuned_model_path = finetuned_model_path
69
+ self.test_data_path = test_data_path
70
+ self.batch_size = batch_size
71
+ self.num_workers = num_workers
72
+
73
+ # Setup devices
74
+ self.setup_devices()
75
+
76
+ # Initialize WandB
77
+ if use_wandb:
78
+ self.init_wandb()
79
+ else:
80
+ self.wandb_enabled = False
81
+
82
+ # Initialize tokenizers and scorers
83
+ self.setup_tokenizers_and_scorers()
84
+
85
+ # Results storage
86
+ self.results = {}
87
+ self.detailed_results = []
88
+
89
+ def setup_devices(self):
90
+ """Setup multi-GPU configuration"""
91
+ if torch.cuda.is_available():
92
+ self.num_gpus = torch.cuda.device_count()
93
+ print(f"🚀 Found {self.num_gpus} GPUs")
94
+
95
+ self.device_ids = list(range(self.num_gpus))
96
+ self.device = torch.device("cuda:0")
97
+
98
+ for i in range(self.num_gpus):
99
+ print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
100
+ print(f" Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
101
+ else:
102
+ self.num_gpus = 0
103
+ self.device = torch.device("cpu")
104
+ print("⚠️ No GPU found, using CPU")
105
+
106
+ def init_wandb(self):
107
+ """Initialize WandB for experiment tracking"""
108
+ try:
109
+ run_name = f"benchmark-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
110
+
111
+ wandb.init(
112
+ project="japanese-counseling-benchmark",
113
+ name=run_name,
114
+ config={
115
+ "base_model": self.base_model_name,
116
+ "finetuned_model": self.finetuned_model_path,
117
+ "batch_size": self.batch_size,
118
+ "num_gpus": self.num_gpus,
119
+ "timestamp": datetime.now().isoformat()
120
+ },
121
+ tags=["benchmark", "japanese", "counseling", "multi-gpu"]
122
+ )
123
+
124
+ self.wandb_enabled = True
125
+ print(f"✅ WandB initialized: {wandb.run.name}")
126
+ print(f"📊 View at: {wandb.run.get_url()}")
127
+ except Exception as e:
128
+ print(f"⚠️ WandB initialization failed: {e}")
129
+ self.wandb_enabled = False
130
+
131
+ def setup_tokenizers_and_scorers(self):
132
+ """Setup tokenizers and scoring functions"""
133
+ # Initialize MeCab for Japanese tokenization
134
+ try:
135
+ self.mecab = MeCab.Tagger("-Owakati")
136
+ print("✅ MeCab initialized")
137
+ except:
138
+ print("⚠️ MeCab not available, using character tokenization")
139
+ self.mecab = None
140
+
141
+ # Initialize ROUGE scorer
142
+ self.rouge_scorer = rouge_scorer.RougeScorer(
143
+ ['rouge1', 'rouge2', 'rougeL'],
144
+ use_stemmer=False
145
+ )
146
+
147
+ # BLEU smoothing
148
+ self.smoothing = SmoothingFunction().method1
149
+
150
+ def load_test_data_fast(self, max_samples: Optional[int] = None) -> List[Dict]:
151
+ """Fast loading of test data"""
152
+ print(f"\n📚 Loading test data from {self.test_data_path}")
153
+
154
+ test_data = []
155
+
156
+ if not os.path.exists(self.test_data_path):
157
+ print("⚠️ Test data not found, using synthetic data")
158
+ return self.create_synthetic_test_data()
159
+
160
+ try:
161
+ with open(self.test_data_path, 'r', encoding='utf-8') as f:
162
+ lines = f.readlines()
163
+
164
+ if max_samples:
165
+ lines = lines[:max_samples]
166
+
167
+ for line in tqdm(lines, desc="Loading data"):
168
+ try:
169
+ data = json.loads(line)
170
+ text = data.get('text', '')
171
+
172
+ if "### Input:" in text and "### Response:" in text:
173
+ input_part = text.split("### Input:")[1].split("### Response:")[0].strip()
174
+ response_part = text.split("### Response:")[1].strip()
175
+
176
+ test_data.append({
177
+ 'input': input_part,
178
+ 'reference': response_part,
179
+ 'score': data.get('score', 0),
180
+ 'topic': data.get('topic', 'Unknown')
181
+ })
182
+ except:
183
+ continue
184
+
185
+ except Exception as e:
186
+ print(f"Error loading data: {e}")
187
+ return self.create_synthetic_test_data()
188
+
189
+ if not test_data:
190
+ print("⚠️ No valid data found, using synthetic data")
191
+ return self.create_synthetic_test_data()
192
+
193
+ print(f"✅ Loaded {len(test_data)} test examples")
194
+
195
+ if self.wandb_enabled:
196
+ wandb.log({"test_data_size": len(test_data)})
197
+
198
+ return test_data
199
+
200
+ def create_synthetic_test_data(self) -> List[Dict]:
201
+ """Create synthetic test data"""
202
+ return [
203
+ {
204
+ 'input': f'ストレスを感じています。',
205
+ 'reference': f'お気持ちわかります。どのような状況でストレスを感じていますか?',
206
+ 'score': 75,
207
+ 'topic': 'stress'
208
+ }
209
+ for i in range(10)
210
+ ]
211
+
212
+ def load_models_optimized(self):
213
+ """Load models with optimization for multi-GPU"""
214
+ print("\n🤖 Loading models with optimization...")
215
+
216
+ # Load tokenizer
217
+ print(" Loading tokenizer...")
218
+ try:
219
+ self.tokenizer = AutoTokenizer.from_pretrained(
220
+ self.base_model_name,
221
+ use_fast=True
222
+ )
223
+ except:
224
+ self.tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
225
+
226
+ if self.tokenizer.pad_token is None:
227
+ self.tokenizer.pad_token = self.tokenizer.eos_token
228
+
229
+ # Load base model
230
+ print(" Loading base model...")
231
+ try:
232
+ base_model = AutoModelForCausalLM.from_pretrained(
233
+ self.base_model_name,
234
+ torch_dtype=torch.float16,
235
+ trust_remote_code=True,
236
+ low_cpu_mem_usage=True
237
+ )
238
+ except Exception as e:
239
+ print(f" Error loading base model: {e}")
240
+ print(" Using GPT2 as fallback...")
241
+ base_model = AutoModelForCausalLM.from_pretrained(
242
+ "gpt2",
243
+ torch_dtype=torch.float16
244
+ )
245
+
246
+ # Load fine-tuned model
247
+ print(" Loading fine-tuned model...")
248
+ if os.path.exists(self.finetuned_model_path):
249
+ try:
250
+ finetuned_model = AutoModelForCausalLM.from_pretrained(
251
+ self.finetuned_model_path,
252
+ torch_dtype=torch.float16,
253
+ trust_remote_code=True,
254
+ low_cpu_mem_usage=True,
255
+ local_files_only=True
256
+ )
257
+ except Exception as e:
258
+ print(f" Error loading fine-tuned model: {e}")
259
+ finetuned_model = base_model
260
+ else:
261
+ print(" Fine-tuned model not found, using base model")
262
+ finetuned_model = base_model
263
+
264
+ # Move models to GPU
265
+ base_model = base_model.to(self.device)
266
+ finetuned_model = finetuned_model.to(self.device)
267
+
268
+ # Setup for multi-GPU if available
269
+ if self.num_gpus > 1:
270
+ print(f" Setting up DataParallel for {self.num_gpus} GPUs...")
271
+ self.base_model = DataParallel(base_model, device_ids=self.device_ids)
272
+ self.finetuned_model = DataParallel(finetuned_model, device_ids=self.device_ids)
273
+ else:
274
+ self.base_model = base_model
275
+ self.finetuned_model = finetuned_model
276
+
277
+ self.base_model.eval()
278
+ self.finetuned_model.eval()
279
+
280
+ print("✅ Models loaded and optimized!")
281
+
282
+ if self.wandb_enabled:
283
+ wandb.log({
284
+ "model_loaded": True,
285
+ "num_gpus_used": self.num_gpus
286
+ })
287
+
288
+ def generate_batch_responses(self, model, prompts: List[str], max_length: int = 150) -> List[str]:
289
+ """Generate responses in batch for efficiency"""
290
+ if len(prompts) == 0:
291
+ return []
292
+
293
+ formatted_prompts = [
294
+ f"""### Instruction:
295
+ あなたは思いやりのある心理カウンセラーです。
296
+
297
+ ### Input:
298
+ {prompt}
299
+
300
+ ### Response:
301
+ """ for prompt in prompts
302
+ ]
303
+
304
+ try:
305
+ # Tokenize all prompts at once
306
+ inputs = self.tokenizer(
307
+ formatted_prompts,
308
+ return_tensors="pt",
309
+ truncation=True,
310
+ max_length=512,
311
+ padding=True,
312
+ padding_side='left'
313
+ )
314
+
315
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
316
+
317
+ # Get the actual model from DataParallel if needed
318
+ actual_model = model.module if isinstance(model, DataParallel) else model
319
+
320
+ # Generate in batch
321
+ with torch.no_grad():
322
+ with torch.cuda.amp.autocast():
323
+ outputs = actual_model.generate(
324
+ **inputs,
325
+ max_new_tokens=max_length,
326
+ temperature=0.7,
327
+ do_sample=True,
328
+ top_p=0.9,
329
+ num_beams=1,
330
+ pad_token_id=self.tokenizer.pad_token_id,
331
+ eos_token_id=self.tokenizer.eos_token_id
332
+ )
333
+
334
+ # Decode all at once
335
+ responses = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
336
+
337
+ # Extract only generated parts
338
+ extracted_responses = []
339
+ for i, response in enumerate(responses):
340
+ if "### Response:" in response:
341
+ extracted = response.split("### Response:")[-1].strip()
342
+ else:
343
+ extracted = response[len(formatted_prompts[i]):].strip()
344
+ extracted_responses.append(extracted if extracted else "応答を生成できませんでした。")
345
+
346
+ return extracted_responses
347
+
348
+ except Exception as e:
349
+ print(f"Error in batch generation: {e}")
350
+ # Return default responses
351
+ return ["申し訳ございません。応答を生成できませんでした。"] * len(prompts)
352
+
353
+ def tokenize_japanese(self, text: str) -> List[str]:
354
+ """Tokenize Japanese text"""
355
+ if not text:
356
+ return ['empty']
357
+
358
+ if self.mecab:
359
+ try:
360
+ tokens = self.mecab.parse(text).strip().split()
361
+ return tokens if tokens else list(text)
362
+ except:
363
+ pass
364
+
365
+ # Fallback to character tokenization
366
+ return list(text.replace(' ', ''))
367
+
368
+ def calculate_metrics_batch(self, references: List[str], hypotheses: List[str]) -> Dict:
369
+ """Calculate all metrics in batch"""
370
+ metrics = defaultdict(list)
371
+
372
+ for ref, hyp in zip(references, hypotheses):
373
+ if not ref or not hyp:
374
+ # Add default scores for empty strings
375
+ for n in range(1, 5):
376
+ metrics[f'BLEU-{n}'].append(0.0)
377
+ metrics['ROUGE-1'].append(0.0)
378
+ metrics['ROUGE-2'].append(0.0)
379
+ metrics['ROUGE-L'].append(0.0)
380
+ continue
381
+
382
+ try:
383
+ # Tokenize
384
+ ref_tokens = self.tokenize_japanese(ref)
385
+ hyp_tokens = self.tokenize_japanese(hyp)
386
+
387
+ # BLEU scores
388
+ for n in range(1, 5):
389
+ weights = tuple([1/n] * n + [0] * (4-n))
390
+ try:
391
+ score = sentence_bleu(
392
+ [ref_tokens],
393
+ hyp_tokens,
394
+ weights=weights,
395
+ smoothing_function=self.smoothing
396
+ )
397
+ metrics[f'BLEU-{n}'].append(score)
398
+ except:
399
+ metrics[f'BLEU-{n}'].append(0.0)
400
+
401
+ # ROUGE scores
402
+ try:
403
+ ref_spaced = ' '.join(ref_tokens)
404
+ hyp_spaced = ' '.join(hyp_tokens)
405
+ rouge_scores = self.rouge_scorer.score(ref_spaced, hyp_spaced)
406
+ metrics['ROUGE-1'].append(rouge_scores['rouge1'].fmeasure)
407
+ metrics['ROUGE-2'].append(rouge_scores['rouge2'].fmeasure)
408
+ metrics['ROUGE-L'].append(rouge_scores['rougeL'].fmeasure)
409
+ except:
410
+ metrics['ROUGE-1'].append(0.0)
411
+ metrics['ROUGE-2'].append(0.0)
412
+ metrics['ROUGE-L'].append(0.0)
413
+
414
+ except Exception as e:
415
+ # Add zeros for failed calculations
416
+ for n in range(1, 5):
417
+ metrics[f'BLEU-{n}'].append(0.0)
418
+ metrics['ROUGE-1'].append(0.0)
419
+ metrics['ROUGE-2'].append(0.0)
420
+ metrics['ROUGE-L'].append(0.0)
421
+
422
+ return dict(metrics)
423
+
424
+ def run_fast_benchmark(self, num_samples: Optional[int] = None):
425
+ """Run optimized benchmark with batch processing"""
426
+ print("\n" + "="*80)
427
+ print("🚀 Running Fast Multi-GPU Benchmark")
428
+ print("="*80)
429
+
430
+ start_time = time.time()
431
+
432
+ # Load test data
433
+ test_data = self.load_test_data_fast(max_samples=num_samples)
434
+
435
+ if not test_data:
436
+ raise ValueError("No test data available!")
437
+
438
+ # Create DataLoader
439
+ dataset = TestDataset(test_data)
440
+ dataloader = DataLoader(
441
+ dataset,
442
+ batch_size=self.batch_size,
443
+ shuffle=False,
444
+ num_workers=0,
445
+ collate_fn=custom_collate_fn,
446
+ pin_memory=True if self.device.type == 'cuda' else False
447
+ )
448
+
449
+ # Initialize metric collectors
450
+ all_base_metrics = defaultdict(list)
451
+ all_finetuned_metrics = defaultdict(list)
452
+
453
+ print(f"\n📊 Evaluating {len(test_data)} examples in {len(dataloader)} batches...")
454
+ print(f" Batch size: {self.batch_size}")
455
+ print(f" Using {self.num_gpus} GPU(s)")
456
+
457
+ # Process batches
458
+ successful_batches = 0
459
+ for batch_idx, batch in enumerate(tqdm(dataloader, desc="Processing batches")):
460
+ try:
461
+ # Extract batch data
462
+ inputs = [item['input'] for item in batch]
463
+ references = [item['reference'] for item in batch]
464
+
465
+ # Generate responses in batch
466
+ base_responses = self.generate_batch_responses(self.base_model, inputs)
467
+ finetuned_responses = self.generate_batch_responses(self.finetuned_model, inputs)
468
+
469
+ # Calculate metrics in batch
470
+ base_metrics = self.calculate_metrics_batch(references, base_responses)
471
+ finetuned_metrics = self.calculate_metrics_batch(references, finetuned_responses)
472
+
473
+ # Aggregate metrics
474
+ for key, values in base_metrics.items():
475
+ all_base_metrics[key].extend(values)
476
+ for key, values in finetuned_metrics.items():
477
+ all_finetuned_metrics[key].extend(values)
478
+
479
+ successful_batches += 1
480
+
481
+ # Log progress to WandB
482
+ if self.wandb_enabled and batch_idx % 5 == 0:
483
+ progress = (batch_idx + 1) / len(dataloader) * 100
484
+
485
+ # Calculate current averages
486
+ current_bleu4_base = np.mean(all_base_metrics.get('BLEU-4', [0]))
487
+ current_bleu4_finetuned = np.mean(all_finetuned_metrics.get('BLEU-4', [0]))
488
+ current_rouge_l_base = np.mean(all_base_metrics.get('ROUGE-L', [0]))
489
+ current_rouge_l_finetuned = np.mean(all_finetuned_metrics.get('ROUGE-L', [0]))
490
+
491
+ wandb.log({
492
+ "progress": progress,
493
+ "batches_processed": batch_idx + 1,
494
+ "samples_processed": min((batch_idx + 1) * self.batch_size, len(test_data)),
495
+ "current_bleu4_base": current_bleu4_base,
496
+ "current_bleu4_finetuned": current_bleu4_finetuned,
497
+ "current_rouge_l_base": current_rouge_l_base,
498
+ "current_rouge_l_finetuned": current_rouge_l_finetuned
499
+ })
500
+
501
+ # Store examples for analysis
502
+ if batch_idx == 0 and len(inputs) > 0:
503
+ for i in range(min(3, len(inputs))):
504
+ self.detailed_results.append({
505
+ 'input': inputs[i],
506
+ 'reference': references[i],
507
+ 'base_response': base_responses[i] if i < len(base_responses) else "",
508
+ 'finetuned_response': finetuned_responses[i] if i < len(finetuned_responses) else ""
509
+ })
510
+
511
+ # Print sample
512
+ print(f"\n📝 Sample Example:")
513
+ print(f"Input: {inputs[0][:100]}...")
514
+ print(f"Reference: {references[0][:100]}...")
515
+ print(f"Base response: {base_responses[0][:100]}...")
516
+ print(f"Fine-tuned response: {finetuned_responses[0][:100]}...")
517
+
518
+ except Exception as e:
519
+ print(f"Error processing batch {batch_idx}: {e}")
520
+ continue
521
+
522
+ print(f"\n✅ Successfully processed {successful_batches}/{len(dataloader)} batches")
523
+
524
+ # Calculate final statistics
525
+ self.results = self.calculate_final_statistics(all_base_metrics, all_finetuned_metrics)
526
+
527
+ # Calculate processing time
528
+ total_time = time.time() - start_time
529
+ samples_per_second = len(test_data) / total_time if total_time > 0 else 0
530
+
531
+ print(f"\n⏱️ Benchmark completed in {total_time:.2f} seconds")
532
+ print(f" Processing speed: {samples_per_second:.2f} samples/second")
533
+
534
+ # Log final results to WandB
535
+ if self.wandb_enabled:
536
+ wandb.log({
537
+ "total_time_seconds": total_time,
538
+ "samples_per_second": samples_per_second,
539
+ "total_samples": len(test_data),
540
+ "successful_batches": successful_batches,
541
+ **{f"final_{k}": v for k, v in self.results['summary'].items()}
542
+ })
543
+
544
+ # Log detailed metrics
545
+ for metric_name, improvements in self.results['improvements'].items():
546
+ wandb.log({f"improvement_{metric_name}": improvements})
547
+
548
+ # Create visualization
549
+ if self.results['metrics']:
550
+ self.create_wandb_visualizations()
551
+
552
+ # Print results
553
+ self.print_results()
554
+
555
+ return self.results
556
+
557
+ def create_wandb_visualizations(self):
558
+ """Create WandB visualizations"""
559
+ if not self.wandb_enabled or not self.results.get('metrics'):
560
+ return
561
+
562
+ try:
563
+ # Create comparison table
564
+ data = []
565
+ for metric in self.results['metrics']:
566
+ data.append([
567
+ metric,
568
+ self.results['metrics'][metric]['base']['mean'],
569
+ self.results['metrics'][metric]['finetuned']['mean'],
570
+ self.results['improvements'][metric]
571
+ ])
572
+
573
+ table = wandb.Table(
574
+ columns=["Metric", "Base", "Fine-tuned", "Improvement (%)"],
575
+ data=data
576
+ )
577
+ wandb.log({"results_comparison": table})
578
+
579
+ # Log bar chart of improvements
580
+ wandb.log({
581
+ "improvements_chart": wandb.plot.bar(
582
+ wandb.Table(
583
+ data=[[m, self.results['improvements'][m]] for m in self.results['improvements']],
584
+ columns=["Metric", "Improvement (%)"]
585
+ ),
586
+ "Metric", "Improvement (%)",
587
+ title="Model Improvements"
588
+ )
589
+ })
590
+ except Exception as e:
591
+ print(f"Error creating visualizations: {e}")
592
+
593
+ def calculate_final_statistics(self, base_metrics: Dict, finetuned_metrics: Dict) -> Dict:
594
+ """Calculate final aggregate statistics"""
595
+ results = {
596
+ 'metrics': {},
597
+ 'improvements': {},
598
+ 'summary': {}
599
+ }
600
+
601
+ # Calculate statistics for each metric
602
+ all_metric_names = set(base_metrics.keys()) | set(finetuned_metrics.keys())
603
+
604
+ for metric in all_metric_names:
605
+ base_values = base_metrics.get(metric, [0])
606
+ finetuned_values = finetuned_metrics.get(metric, [0])
607
+
608
+ # Filter out any None values
609
+ base_values = [v for v in base_values if v is not None]
610
+ finetuned_values = [v for v in finetuned_values if v is not None]
611
+
612
+ if not base_values:
613
+ base_values = [0]
614
+ if not finetuned_values:
615
+ finetuned_values = [0]
616
+
617
+ results['metrics'][metric] = {
618
+ 'base': {
619
+ 'mean': float(np.mean(base_values)),
620
+ 'std': float(np.std(base_values)),
621
+ 'min': float(np.min(base_values)),
622
+ 'max': float(np.max(base_values))
623
+ },
624
+ 'finetuned': {
625
+ 'mean': float(np.mean(finetuned_values)),
626
+ 'std': float(np.std(finetuned_values)),
627
+ 'min': float(np.min(finetuned_values)),
628
+ 'max': float(np.max(finetuned_values))
629
+ }
630
+ }
631
+
632
+ # Calculate improvement
633
+ base_mean = np.mean(base_values)
634
+ finetuned_mean = np.mean(finetuned_values)
635
+ if base_mean > 0:
636
+ improvement = ((finetuned_mean - base_mean) / base_mean) * 100
637
+ else:
638
+ improvement = 0 if finetuned_mean == 0 else 100
639
+
640
+ results['improvements'][metric] = improvement
641
+
642
+ # Calculate summary statistics
643
+ bleu_metrics = [m for m in results['metrics'] if 'BLEU' in m]
644
+ rouge_metrics = [m for m in results['metrics'] if 'ROUGE' in m]
645
+
646
+ results['summary'] = {
647
+ 'bleu_avg_improvement': np.mean([results['improvements'][m] for m in bleu_metrics]) if bleu_metrics else 0,
648
+ 'rouge_avg_improvement': np.mean([results['improvements'][m] for m in rouge_metrics]) if rouge_metrics else 0,
649
+ 'overall_improvement': np.mean(list(results['improvements'].values())) if results['improvements'] else 0
650
+ }
651
+
652
+ return results
653
+
654
+ def print_results(self):
655
+ """Print formatted results"""
656
+ print("\n" + "="*80)
657
+ print("📊 BENCHMARK RESULTS")
658
+ print("="*80)
659
+
660
+ if not self.results or 'metrics' not in self.results:
661
+ print("No results to display")
662
+ return
663
+
664
+ # BLEU scores
665
+ print("\n📘 BLEU Scores:")
666
+ print("-"*60)
667
+ print(f"{'Metric':<15} {'Base':<15} {'Fine-tuned':<15} {'Improvement':<15}")
668
+ print("-"*60)
669
+
670
+ for metric in sorted([m for m in self.results['metrics'] if 'BLEU' in m]):
671
+ base = self.results['metrics'][metric]['base']['mean']
672
+ finetuned = self.results['metrics'][metric]['finetuned']['mean']
673
+ improvement = self.results['improvements'][metric]
674
+ print(f"{metric:<15} {base:.4f} {finetuned:.4f} {improvement:+.1f}%")
675
+
676
+ # ROUGE scores
677
+ print("\n📕 ROUGE Scores:")
678
+ print("-"*60)
679
+
680
+ for metric in sorted([m for m in self.results['metrics'] if 'ROUGE' in m]):
681
+ base = self.results['metrics'][metric]['base']['mean']
682
+ finetuned = self.results['metrics'][metric]['finetuned']['mean']
683
+ improvement = self.results['improvements'][metric]
684
+ print(f"{metric:<15} {base:.4f} {finetuned:.4f} {improvement:+.1f}%")
685
+
686
+ # Summary
687
+ print("\n" + "="*80)
688
+ print("📈 SUMMARY")
689
+ print("="*80)
690
+ print(f"BLEU Average Improvement: {self.results['summary']['bleu_avg_improvement']:+.1f}%")
691
+ print(f"ROUGE Average Improvement: {self.results['summary']['rouge_avg_improvement']:+.1f}%")
692
+ print(f"Overall Improvement: {self.results['summary']['overall_improvement']:+.1f}%")
693
+ print("="*80)
694
+
695
+ def save_results(self, output_dir: str = "./benchmark_results"):
696
+ """Save results"""
697
+ os.makedirs(output_dir, exist_ok=True)
698
+
699
+ # Save results
700
+ with open(os.path.join(output_dir, "results.json"), 'w', encoding='utf-8') as f:
701
+ json.dump(self.results, f, ensure_ascii=False, indent=2, default=str)
702
+
703
+ with open(os.path.join(output_dir, "examples.json"), 'w', encoding='utf-8') as f:
704
+ json.dump(self.detailed_results, f, ensure_ascii=False, indent=2)
705
+
706
+ # Save to WandB
707
+ if self.wandb_enabled:
708
+ try:
709
+ artifact = wandb.Artifact(
710
+ name=f"benchmark-results-{wandb.run.id}",
711
+ type="benchmark_results",
712
+ description="Japanese counseling model benchmark results"
713
+ )
714
+ artifact.add_dir(output_dir)
715
+ wandb.log_artifact(artifact)
716
+ except Exception as e:
717
+ print(f"Error saving to WandB: {e}")
718
+
719
+ print(f"✅ Results saved to {output_dir}/")
720
+
721
+ def cleanup(self):
722
+ """Clean up resources"""
723
+ if self.wandb_enabled:
724
+ wandb.finish()
725
+
726
+ if torch.cuda.is_available():
727
+ torch.cuda.empty_cache()
728
+
729
+ gc.collect()
730
+
731
+
732
+ def main():
733
+ """Main execution"""
734
+ import argparse
735
+
736
+ parser = argparse.ArgumentParser(description='Optimized Japanese Counseling Benchmark')
737
+ parser.add_argument('--base_model', type=str, default='LiquidAI/LFM2-1.2B')
738
+ parser.add_argument('--finetuned_model', type=str, default='./merged_counselor_model')
739
+ parser.add_argument('--test_data', type=str, default='./processed_data_score80/test.jsonl')
740
+ parser.add_argument('--batch_size', type=int, default=16, help='Batch size for processing')
741
+ parser.add_argument('--num_samples', type=int, default=None, help='Number of samples to evaluate')
742
+ parser.add_argument('--output_dir', type=str, default='./benchmark_results_fast')
743
+ parser.add_argument('--no_wandb', action='store_true', help='Disable WandB logging')
744
+
745
+ args = parser.parse_args()
746
+
747
+ try:
748
+ # Initialize benchmark
749
+ print("🚀 Initializing Optimized Benchmark Suite")
750
+ benchmark = OptimizedJapaneseBenchmark(
751
+ base_model_name=args.base_model,
752
+ finetuned_model_path=args.finetuned_model,
753
+ test_data_path=args.test_data,
754
+ batch_size=args.batch_size,
755
+ use_wandb=not args.no_wandb
756
+ )
757
+
758
+ # Load models
759
+ benchmark.load_models_optimized()
760
+
761
+ # Run benchmark
762
+ results = benchmark.run_fast_benchmark(num_samples=args.num_samples)
763
+
764
+ # Save results
765
+ benchmark.save_results(args.output_dir)
766
+
767
+ # Cleanup
768
+ benchmark.cleanup()
769
+
770
+ print("\n✅ Benchmark completed successfully!")
771
+
772
+ except Exception as e:
773
+ print(f"\n❌ Error: {e}")
774
+ import traceback
775
+ traceback.print_exc()
776
+
777
+ if 'benchmark' in locals():
778
+ benchmark.cleanup()
779
+
780
+
781
+ if __name__ == "__main__":
782
+ main()
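As in benchmark_v1.py, the n-gram weights for BLEU-1 through BLEU-4 are built inline with tuple([1/n] * n + [0] * (4-n)). A small standalone illustration of what that expression expands to:

# Illustration of the BLEU weight tuples used in calculate_metrics_batch above.
for n in range(1, 5):
    weights = tuple([1 / n] * n + [0] * (4 - n))
    print(f"BLEU-{n} weights: {weights}")
# Expected output:
#   BLEU-1 weights: (1.0, 0, 0, 0)
#   BLEU-2 weights: (0.5, 0.5, 0, 0)
#   BLEU-3 weights: (0.333..., 0.333..., 0.333..., 0)   (1/3 each for the first three)
#   BLEU-4 weights: (0.25, 0.25, 0.25, 0.25)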
chat.py ADDED
@@ -0,0 +1,339 @@
1
+ """
2
+ Interactive Chat Interface for Testing Fine-tuned Japanese Counseling Model
3
+ """
4
+
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ import os
8
+ import warnings
9
+ from datetime import datetime
10
+ import json
11
+
12
+ warnings.filterwarnings('ignore')
13
+
14
+ class CounselorChatInterface:
15
+ def __init__(self, model_path: str = "./merged_counselor_model"):
16
+ """
17
+ Initialize the chat interface with the fine-tuned model
18
+
19
+ Args:
20
+ model_path: Path to the fine-tuned model
21
+ """
22
+ self.model_path = model_path
23
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
+
25
+ print("="*80)
26
+ print("🎌 Japanese Counseling Model Chat Interface")
27
+ print("="*80)
28
+ print(f"📍 Device: {self.device}")
29
+
30
+ if self.device.type == "cuda":
31
+ print(f" GPU: {torch.cuda.get_device_name(0)}")
32
+ print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
33
+
34
+ self.load_model()
35
+ self.conversation_history = []
36
+
37
+ def load_model(self):
38
+ """Load the fine-tuned model and tokenizer"""
39
+ print(f"\n🤖 Loading model from {self.model_path}...")
40
+
41
+ try:
42
+ # Load tokenizer
43
+ self.tokenizer = AutoTokenizer.from_pretrained(
44
+ self.model_path,
45
+ local_files_only=True
46
+ )
47
+
48
+ # Set padding token if not set
49
+ if self.tokenizer.pad_token is None:
50
+ self.tokenizer.pad_token = self.tokenizer.eos_token
51
+
52
+ # Load model
53
+ self.model = AutoModelForCausalLM.from_pretrained(
54
+ self.model_path,
55
+ torch_dtype=torch.float16 if self.device.type == "cuda" else torch.float32,
56
+ device_map="auto" if self.device.type == "cuda" else None,
57
+ local_files_only=True,
58
+ trust_remote_code=True
59
+ )
60
+
61
+ self.model.eval()
62
+ print("✅ Model loaded successfully!")
63
+
64
+ except Exception as e:
65
+ print(f"❌ Error loading model: {e}")
66
+ print("Trying alternative loading method...")
67
+
68
+ # Try loading with base tokenizer
69
+ try:
70
+ self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
71
+ if self.tokenizer.pad_token is None:
72
+ self.tokenizer.pad_token = self.tokenizer.eos_token
73
+
74
+ self.model = AutoModelForCausalLM.from_pretrained(
75
+ self.model_path,
76
+ torch_dtype=torch.float16 if self.device.type == "cuda" else torch.float32,
77
+ local_files_only=True
78
+ )
79
+ self.model = self.model.to(self.device)
80
+ self.model.eval()
81
+ print("✅ Model loaded with fallback tokenizer!")
82
+ except Exception as e2:
83
+ print(f"❌ Failed to load model: {e2}")
84
+ raise
85
+
86
+ def generate_response(self, user_input: str,
87
+ temperature: float = 0.7,
88
+ max_length: int = 200,
89
+ use_context: bool = True) -> str:
90
+ """
91
+ Generate a counseling response
92
+
93
+ Args:
94
+ user_input: User's message
95
+ temperature: Generation temperature (0.1-1.0)
96
+ max_length: Maximum response length
97
+ use_context: Whether to use conversation history
98
+
99
+ Returns:
100
+ Generated response
101
+ """
102
+ # Format the prompt
103
+ if use_context and len(self.conversation_history) > 0:
104
+ # Include recent context
105
+ context = "\n".join(self.conversation_history[-4:]) # Last 2 exchanges
106
+ prompt = f"""### Instruction:
107
+ あなたは思いやりのある心理カウンセラーです。
108
+ クライアントの感情を理解し、共感的で支援的な応答を提供してください。
109
+
110
+ ### Context:
111
+ {context}
112
+
113
+ ### Input:
114
+ {user_input}
115
+
116
+ ### Response:
117
+ """
118
+ else:
119
+ prompt = f"""### Instruction:
120
+ あなたは思いやりのある心理カウンセラーです。
121
+ クライアントの感情を理解し、共感的で支援的な応答を提供してください。
122
+
123
+ ### Input:
124
+ {user_input}
125
+
126
+ ### Response:
127
+ """
128
+
129
+ # Tokenize
130
+ inputs = self.tokenizer(
131
+ prompt,
132
+ return_tensors="pt",
133
+ truncation=True,
134
+ max_length=512
135
+ )
136
+
137
+ if self.device.type == "cuda":
138
+ inputs = {k: v.cuda() for k, v in inputs.items()}
139
+
140
+ # Generate
141
+ try:
142
+ with torch.no_grad():
143
+ with torch.autocast(self.device.type):  # mixed-precision context on CUDA or CPU
144
+ outputs = self.model.generate(
145
+ **inputs,
146
+ max_new_tokens=max_length,
147
+ temperature=temperature,
148
+ do_sample=True,
149
+ top_p=0.9,
150
+ top_k=50,
151
+ repetition_penalty=1.1,
152
+ pad_token_id=self.tokenizer.pad_token_id,
153
+ eos_token_id=self.tokenizer.eos_token_id
154
+ )
155
+
156
+ # Decode
157
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
158
+
159
+ # Extract only the response part
160
+ if "### Response:" in full_response:
161
+ response = full_response.split("### Response:")[-1].strip()
162
+ else:
163
+ response = full_response[len(prompt):].strip()
164
+
165
+ return response
166
+
167
+ except Exception as e:
168
+ print(f"Error generating response: {e}")
169
+ return "申し訳ございません。応答の生成中にエラーが発生しました。"
170
+
171
+ def chat(self):
172
+ """Start interactive chat session"""
173
+ print("\n" + "="*80)
174
+ print("💬 チャットを開始します (Chat session started)")
175
+ print("="*80)
176
+ print("Commands:")
177
+ print(" /quit or /exit - 終了 (Exit)")
178
+ print(" /clear - 会話履歴をクリア (Clear conversation history)")
179
+ print(" /save - 会話を保存 (Save conversation)")
180
+ print(" /temp <value> - 温度パラメータを設定 (Set temperature, e.g., /temp 0.8)")
181
+ print(" /context on/off - コンテキスト使用の切り替え (Toggle context usage)")
182
+ print("-"*80)
183
+
184
+ temperature = 0.1
185
+ use_context = True
186
+
187
+ while True:
188
+ try:
189
+ # Get user input
190
+ user_input = input("\n👤 You: ").strip()
191
+
192
+ # Check for commands
193
+ if user_input.lower() in ['/quit', '/exit', '/q']:
194
+ print("\n👋 さようなら!(Goodbye!)")
195
+ break
196
+
197
+ elif user_input.lower() == '/clear':
198
+ self.conversation_history = []
199
+ print("✅ 会話履歴をクリアしました (Conversation history cleared)")
200
+ continue
201
+
202
+ elif user_input.lower() == '/save':
203
+ self.save_conversation()
204
+ continue
205
+
206
+ elif user_input.lower().startswith('/temp'):
207
+ try:
208
+ temperature = float(user_input.split()[1])
209
+ temperature = max(0.1, min(1.0, temperature))  # clamp to the supported 0.1-1.0 range
210
+ print(f"✅ Temperature set to {temperature}")
211
+ except (IndexError, ValueError):
212
+ print("❌ Invalid temperature. Use: /temp 0.7")
213
+ continue
214
+
215
+ elif user_input.lower().startswith('/context'):
216
+ try:
217
+ setting = user_input.split()[1].lower()
218
+ use_context = setting == 'on'
219
+ print(f"✅ Context {'enabled' if use_context else 'disabled'}")
220
+ except (IndexError, ValueError):
221
+ print("❌ Use: /context on or /context off")
222
+ continue
223
+
224
+ elif user_input.startswith('/'):
225
+ print("❌ Unknown command")
226
+ continue
227
+
228
+ # Generate response
229
+ print("\n🤖 Counselor: ", end="", flush=True)
230
+ response = self.generate_response(
231
+ user_input,
232
+ temperature=temperature,
233
+ use_context=use_context
234
+ )
235
+ print(response)
236
+
237
+ # Add to history
238
+ self.conversation_history.append(f"Client: {user_input}")
239
+ self.conversation_history.append(f"Counselor: {response}")
240
+
241
+ except KeyboardInterrupt:
242
+ print("\n\n👋 さようなら!(Goodbye!)")
243
+ break
244
+ except Exception as e:
245
+ print(f"\n❌ Error: {e}")
246
+ continue
247
+
248
+ def save_conversation(self):
249
+ """Save the conversation to a file"""
250
+ if not self.conversation_history:
251
+ print("❌ No conversation to save")
252
+ return
253
+
254
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
255
+ filename = f"conversation_{timestamp}.json"
256
+
257
+ conversation_data = {
258
+ "timestamp": timestamp,
259
+ "model_path": self.model_path,
260
+ "conversation": self.conversation_history
261
+ }
262
+
263
+ with open(filename, 'w', encoding='utf-8') as f:
264
+ json.dump(conversation_data, f, ensure_ascii=False, indent=2)
265
+
266
+ print(f"✅ Conversation saved to {filename}")
267
+
268
+ def test_responses(self):
269
+ """Test the model with predefined inputs"""
270
+ print("\n" + "="*80)
271
+ print("🧪 Testing Model Responses")
272
+ print("="*80)
273
+
274
+ test_inputs = [
275
+ "こんにちは。最近ストレスを感じています。",
276
+ "仕事がうまくいかなくて悩んでいます。",
277
+ "人間関係で困っています。どうすればいいでしょうか。",
278
+ "将来が不安で眠れません。",
279
+ "自分に自信が持てません。",
280
+ "家族との関係で悩んでいます。",
281
+ "毎日が辛いです。",
282
+ "誰にも相談できません。"
283
+ ]
284
+
285
+ print("\nTesting with different temperature settings:\n")
286
+
287
+ for temp in [0.1, 0.7]:
288
+ print(f"\n🌡️ Temperature: {temp}")
289
+ print("-"*60)
290
+
291
+ for i, test_input in enumerate(test_inputs[:3], 1):
292
+ print(f"\n{i}. Input: {test_input}")
293
+ response = self.generate_response(test_input, temperature=temp, use_context=False)
294
+ print(f" Response: {response[:200]}...")
295
+ print()
296
+
297
+ print("="*80)
298
+
299
+
300
+ def main():
301
+ """Main function"""
302
+ import argparse
303
+
304
+ parser = argparse.ArgumentParser(description='Chat with fine-tuned counseling model')
305
+ parser.add_argument('--model_path', type=str, default='./merged_counselor_mode_2b',
306
+ help='Path to the fine-tuned model')
307
+ parser.add_argument('--test_only', action='store_true',
308
+ help='Only run test responses without chat')
309
+
310
+ args = parser.parse_args()
311
+
312
+ # Check if model exists
313
+ if not os.path.exists(args.model_path):
314
+ print(f"❌ Model not found at {args.model_path}")
315
+ print("\nAvailable models:")
316
+ for item in os.listdir('.'):
317
+ if 'model' in item.lower() and os.path.isdir(item):
318
+ print(f" - {item}")
319
+ return
320
+
321
+ try:
322
+ # Initialize chat interface
323
+ chat = CounselorChatInterface(model_path=args.model_path)
324
+
325
+ if args.test_only:
326
+ # Run tests only
327
+ chat.test_responses()
328
+ else:
329
+ # Start interactive chat
330
+ chat.chat()
331
+
332
+ except Exception as e:
333
+ print(f"❌ Error: {e}")
334
+ import traceback
335
+ traceback.print_exc()
336
+
337
+
338
+ if __name__ == "__main__":
339
+ main()
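A minimal sketch of exercising CounselorChatInterface programmatically, outside the interactive loop, assuming chat.py is importable and a merged model exists at the placeholder path; generate_response is called exactly as defined above:

    # Sketch only: the model path is a placeholder.
    from chat import CounselorChatInterface

    chat = CounselorChatInterface(model_path="./merged_counselor_model")
    reply = chat.generate_response(
        "最近ストレスを感じています。",  # "I have been feeling stressed lately."
        temperature=0.7,
        use_context=False,
    )
    print(reply)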
data_preprocessor.py ADDED
@@ -0,0 +1,524 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ import pandas as pd
5
+ from typing import List, Dict, Tuple, Optional
6
+ import random
7
+ from tqdm import tqdm
8
+ import re
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+
12
+ class KokoroChatPreprocessor:
13
+ def __init__(self, data_path: str, max_length: int = 2048, min_score: int = 60):
14
+ """
15
+ Initialize the preprocessor for KokoroChat dataset
16
+
17
+ Args:
18
+ data_path: Path to KokoroChat repository
19
+ max_length: Maximum sequence length for model input
20
+ min_score: Minimum score threshold for filtering conversations (default: 60)
21
+ """
22
+ self.data_path = Path(data_path)
23
+ self.max_length = max_length
24
+ self.min_score = min_score
25
+ self.conversations = []
26
+ self.score_distribution = [] # Track score distribution
27
+ self.system_prompt = """あなたは思いやりのある心理カウンセラーです。
28
+ クライアントの感情を理解し、共感的で支援的な応答を提供してください。
29
+ プライバシーを尊重し、判断を下さず、希望と実用的な洞察を提供することに焦点を当ててください。"""
30
+
31
+ def load_json_files(self) -> List[Dict]:
32
+ """Load all JSON files from the dataset"""
33
+ json_files = []
34
+ # Changed from "data" to "kokorochat_dialogues"
35
+ data_dir = self.data_path / "kokorochat_dialogues"
36
+
37
+ # Check if data directory exists, if not try root directory
38
+ if not data_dir.exists():
39
+ data_dir = self.data_path
40
+ print(f"Using root directory: {data_dir}")
41
+ else:
42
+ print(f"Using data directory: {data_dir}")
43
+
44
+ for root, dirs, files in os.walk(data_dir):
45
+ for file in tqdm(files, desc="Loading JSON files"):
46
+ if file.endswith('.json'):
47
+ file_path = os.path.join(root, file)
48
+ try:
49
+ with open(file_path, 'r', encoding='utf-8') as f:
50
+ data = json.load(f)
51
+ json_files.append(data)
52
+ except Exception as e:
53
+ print(f"Error loading {file_path}: {e}")
54
+
55
+ return json_files
56
+
57
+ def analyze_score_distribution(self, json_files: List[Dict]) -> Dict:
58
+ """
59
+ Analyze the distribution of scores in the dataset
60
+
61
+ Returns:
62
+ Dictionary with score statistics
63
+ """
64
+ scores = []
65
+ for data in json_files:
66
+ if 'review_by_client_jp' in data:
67
+ score = data['review_by_client_jp'].get('点数', 0)
68
+ if score > 0: # Only count valid scores
69
+ scores.append(score)
70
+ self.score_distribution.append(score)
71
+
72
+ if scores:
73
+ stats = {
74
+ 'total_conversations': len(json_files),
75
+ 'conversations_with_scores': len(scores),
76
+ 'mean_score': float(np.mean(scores)),
77
+ 'median_score': float(np.median(scores)),
78
+ 'std_score': float(np.std(scores)),
79
+ 'min_score': float(np.min(scores)),
80
+ 'max_score': float(np.max(scores)),
81
+ 'percentiles': {
82
+ '25th': float(np.percentile(scores, 25)),
83
+ '50th': float(np.percentile(scores, 50)),
84
+ '75th': float(np.percentile(scores, 75)),
85
+ '90th': float(np.percentile(scores, 90))
86
+ },
87
+ 'score_ranges': {
88
+ '0-30': int(sum(1 for s in scores if 0 <= s < 30)),
89
+ '30-50': int(sum(1 for s in scores if 30 <= s < 50)),
90
+ '50-60': int(sum(1 for s in scores if 50 <= s < 60)),
91
+ '60-70': int(sum(1 for s in scores if 60 <= s < 70)),
92
+ '70-80': int(sum(1 for s in scores if 70 <= s < 80)),
93
+ '80-90': int(sum(1 for s in scores if 80 <= s < 90)),
94
+ '90-100': int(sum(1 for s in scores if 90 <= s <= 100)),
95
+ }
96
+ }
97
+
98
+ # Calculate how many conversations would be kept at different thresholds
99
+ threshold_analysis = {}
100
+ for threshold in [30, 40, 50, 60, 65, 70, 75, 80]:
101
+ kept = sum(1 for s in scores if s >= threshold)
102
+ threshold_analysis[f'threshold_{threshold}'] = {
103
+ 'conversations_kept': kept,
104
+ 'percentage_kept': round((kept / len(scores)) * 100, 2)
105
+ }
106
+ stats['threshold_analysis'] = threshold_analysis
107
+
108
+ return stats
109
+ else:
110
+ return {'error': 'No valid scores found in dataset'}
111
+
112
+ def plot_score_distribution(self, save_path: str = "score_distribution.png"):
113
+ """
114
+ Plot the distribution of scores
115
+ """
116
+ if not self.score_distribution:
117
+ print("No scores to plot. Run analyze_score_distribution first.")
118
+ return
119
+
120
+ fig, axes = plt.subplots(2, 2, figsize=(15, 10))
121
+
122
+ # Histogram
123
+ axes[0, 0].hist(self.score_distribution, bins=20, edgecolor='black', alpha=0.7)
124
+ axes[0, 0].axvline(self.min_score, color='red', linestyle='--',
125
+ label=f'Current threshold: {self.min_score}')
126
+ axes[0, 0].set_xlabel('Score')
127
+ axes[0, 0].set_ylabel('Frequency')
128
+ axes[0, 0].set_title('Score Distribution')
129
+ axes[0, 0].legend()
130
+ axes[0, 0].grid(True, alpha=0.3)
131
+
132
+ # Box plot
133
+ axes[0, 1].boxplot(self.score_distribution, vert=True)
134
+ axes[0, 1].set_ylabel('Score')
135
+ axes[0, 1].set_title('Score Box Plot')
136
+ axes[0, 1].grid(True, alpha=0.3)
137
+
138
+ # Cumulative distribution
139
+ sorted_scores = np.sort(self.score_distribution)
140
+ cumulative = np.arange(1, len(sorted_scores) + 1) / len(sorted_scores)
141
+ axes[1, 0].plot(sorted_scores, cumulative)
142
+ axes[1, 0].axvline(self.min_score, color='red', linestyle='--',
143
+ label=f'Current threshold: {self.min_score}')
144
+ axes[1, 0].set_xlabel('Score')
145
+ axes[1, 0].set_ylabel('Cumulative Probability')
146
+ axes[1, 0].set_title('Cumulative Distribution')
147
+ axes[1, 0].legend()
148
+ axes[1, 0].grid(True, alpha=0.3)
149
+
150
+ # Threshold impact analysis
151
+ thresholds = range(30, 90, 5)
152
+ kept_percentages = []
153
+ for t in thresholds:
154
+ kept = sum(1 for s in self.score_distribution if s >= t)
155
+ kept_percentages.append((kept / len(self.score_distribution)) * 100)
156
+
157
+ axes[1, 1].plot(thresholds, kept_percentages, marker='o')
158
+ axes[1, 1].axvline(self.min_score, color='red', linestyle='--',
159
+ label=f'Current threshold: {self.min_score}')
160
+ axes[1, 1].set_xlabel('Score Threshold')
161
+ axes[1, 1].set_ylabel('% of Conversations Kept')
162
+ axes[1, 1].set_title('Impact of Score Threshold')
163
+ axes[1, 1].legend()
164
+ axes[1, 1].grid(True, alpha=0.3)
165
+
166
+ plt.tight_layout()
167
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
168
+ plt.show()
169
+ print(f"Score distribution plot saved to {save_path}")
170
+
171
+ def extract_high_quality_conversations(self, data: Dict) -> List[Dict]:
172
+ """
173
+ Extract conversations with high counselor ratings based on min_score
174
+ Focus on conversations where counselor performed well
175
+ """
176
+ conversations = []
177
+
178
+ # Check if review exists and has good score
179
+ if 'review_by_client_jp' in data:
180
+ review = data['review_by_client_jp']
181
+ score = review.get('点数', 0)
182
+
183
+ # Use configurable min_score threshold
184
+ if score >= self.min_score:
185
+ dialogue = data.get('dialogue', [])
186
+
187
+ # Create conversation pairs
188
+ conversation_text = ""
189
+ for turn in dialogue:
190
+ role = turn['role']
191
+ utterance = turn['utterance']
192
+
193
+ if role == 'counselor':
194
+ conversation_text += f"カウンセラー: {utterance}\n"
195
+ else:
196
+ conversation_text += f"クライアント: {utterance}\n"
197
+
198
+ # Extract detailed metrics for potential weighted training
199
+ conversations.append({
200
+ 'text': conversation_text,
201
+ 'score': score, # Store the score here
202
+ 'topic': data.get('topic', {}).get('main_jp', 'Unknown'),
203
+ 'review_metrics': {
204
+ 'empathy': review.get('聴いてもらえた、わかってもらえたと感じた', 0),
205
+ 'respect': review.get('尊重されたと感じた', 0),
206
+ 'insights': review.get('新しい気づきや体験があった', 0),
207
+ 'hope': review.get('希望や期待を感じられた', 0),
208
+ 'concerns_addressed': review.get('取り組みたかったことを扱えた', 0),
209
+ 'collaboration': review.get('一緒に考えながら取り組めた', 0),
210
+ 'rhythm': review.get('やりとりのリズムがあっていた', 0),
211
+ 'comfort': review.get('居心地のよいやりとりだった', 0),
212
+ 'overall_appropriate': review.get('全体として適切でよかった', 0),
213
+ 'valuable': review.get('今回の相談は価値があった', 0),
214
+ 'smooth_start': review.get('相談開始の円滑さ', 0),
215
+ 'good_ending': review.get('相談終了のタイミング(不必要に聴きすぎていないか)、円滑さ', 0),
216
+ 'acceptance_empathy': review.get('受容·共感', 0),
217
+ 'affirmation': review.get('肯定·承認', 0),
218
+ 'effective_questions': review.get('的確な質問による会話の促進', 0),
219
+ 'summarization': review.get('要約', 0),
220
+ 'problem_clarification': review.get('問題の明確化', 0),
221
+ 'goal_clarification': review.get('この相談での目標の明確化', 0),
222
+ 'actionable_suggestions': review.get('次の行動につながる提案', 0),
223
+ 'encouragement': review.get('勇気づけ·希望の喚起', 0)
224
+ }
225
+ })
226
+
227
+ return conversations
228
+
229
+ def create_training_examples(self, conversations: List[Dict],
230
+ use_weighted_sampling: bool = False) -> List[Dict]:
231
+ """
232
+ Create training examples in instruction-following format
233
+
234
+ Args:
235
+ conversations: List of conversation dictionaries
236
+ use_weighted_sampling: If True, create more examples from higher-scored conversations
237
+ """
238
+ training_examples = []
239
+
240
+ for conv in tqdm(conversations, desc="Creating training examples"):
241
+ dialogue_lines = conv['text'].split('\n')
242
+ score = conv['score'] # Get score from the conversation dict
243
+
244
+ # Calculate sampling weight based on score if enabled
245
+ if use_weighted_sampling:
246
+ # Higher scores get more weight (normalized to 1-3 range)
247
+ weight = max(1, int((score - self.min_score) / 20) + 1)
248
+ else:
249
+ weight = 1
250
+
251
+ # Create multiple training examples from each conversation
252
+ for _ in range(weight): # Repeat based on weight
253
+ for i in range(0, len(dialogue_lines) - 1, 2):
254
+ if i + 1 < len(dialogue_lines):
255
+ client_line = dialogue_lines[i]
256
+ counselor_line = dialogue_lines[i + 1]
257
+
258
+ # Check if lines contain the expected prefixes
259
+ if 'クライアント:' in client_line and 'カウンセラー:' in counselor_line:
260
+ client_msg = client_line.replace('クライアント: ', '').replace('クライアント:', '').strip()
261
+ counselor_msg = counselor_line.replace('カウンセラー: ', '').replace('カウンセラー:', '').strip()
262
+
263
+ # Skip empty messages
264
+ if not client_msg or not counselor_msg:
265
+ continue
266
+
267
+ # Format for instruction tuning
268
+ example = {
269
+ 'instruction': self.system_prompt,
270
+ 'input': client_msg,
271
+ 'output': counselor_msg,
272
+ 'score': score, # Use the score from conversation
273
+ 'topic': conv['topic'],
274
+ 'metrics': conv['review_metrics'] # Include detailed metrics
275
+ }
276
+
277
+ training_examples.append(example)
278
+
279
+ return training_examples
280
+
281
+ def prepare_dataset(self, test_size: float = 0.1, val_size: float = 0.1,
282
+ use_weighted_sampling: bool = False,
283
+ analyze_scores: bool = True):
284
+ """
285
+ Prepare train, validation, and test datasets
286
+
287
+ Args:
288
+ test_size: Proportion of data for testing
289
+ val_size: Proportion of data for validation
290
+ use_weighted_sampling: If True, oversample high-quality conversations
291
+ analyze_scores: If True, print score distribution analysis
292
+ """
293
+ print("Loading KokoroChat dataset...")
294
+ json_files = self.load_json_files()
295
+ print(f"Loaded {len(json_files)} conversation files")
296
+
297
+ # Analyze score distribution if requested
298
+ if analyze_scores:
299
+ print("\n" + "="*60)
300
+ print("SCORE DISTRIBUTION ANALYSIS")
301
+ print("="*60)
302
+ stats = self.analyze_score_distribution(json_files)
303
+
304
+ if 'error' not in stats:
305
+ print(f"Total conversations: {stats['total_conversations']}")
306
+ print(f"Conversations with scores: {stats['conversations_with_scores']}")
307
+ print(f"\nScore Statistics:")
308
+ print(f" Mean: {stats['mean_score']:.2f}")
309
+ print(f" Median: {stats['median_score']:.2f}")
310
+ print(f" Std Dev: {stats['std_score']:.2f}")
311
+ print(f" Range: {stats['min_score']:.0f} - {stats['max_score']:.0f}")
312
+
313
+ print(f"\nScore Distribution:")
314
+ for range_name, count in stats['score_ranges'].items():
315
+ percentage = (count / stats['conversations_with_scores']) * 100
316
+ print(f" {range_name}: {count} ({percentage:.1f}%)")
317
+
318
+ print(f"\nThreshold Impact Analysis:")
319
+ for threshold_name, data in stats['threshold_analysis'].items():
320
+ threshold = threshold_name.split('_')[1]
321
+ print(f" Threshold >= {threshold}: {data['conversations_kept']} conversations ({data['percentage_kept']:.1f}%)")
322
+
323
+ print(f"\nCurrent threshold ({self.min_score}) will keep: ", end="")
324
+ kept = sum(1 for s in self.score_distribution if s >= self.min_score)
325
+ print(f"{kept} conversations ({(kept/len(self.score_distribution))*100:.1f}%)")
326
+ print("="*60 + "\n")
327
+
328
+ # Plot distribution
329
+ self.plot_score_distribution()
330
+
331
+ all_conversations = []
332
+ filtered_count = 0
333
+ total_count = 0
334
+
335
+ for data in json_files:
336
+ if 'review_by_client_jp' in data:
337
+ total_count += 1
338
+ score = data['review_by_client_jp'].get('点数', 0)
339
+ if score < self.min_score:
340
+ filtered_count += 1
341
+
342
+ conversations = self.extract_high_quality_conversations(data)
343
+ all_conversations.extend(conversations)
344
+
345
+ print(f"Filtered out {filtered_count} conversations with score < {self.min_score}")
346
+ print(f"Extracted {len(all_conversations)} high-quality conversations (score >= {self.min_score})")
347
+
348
+ # Create training examples
349
+ training_examples = self.create_training_examples(
350
+ all_conversations,
351
+ use_weighted_sampling=use_weighted_sampling
352
+ )
353
+ print(f"Created {len(training_examples)} training examples")
354
+
355
+ if use_weighted_sampling:
356
+ print("Note: Used weighted sampling - higher scored conversations appear more frequently")
357
+
358
+ # Shuffle and split
359
+ random.shuffle(training_examples)
360
+
361
+ total_size = len(training_examples)
362
+ test_split = int(total_size * test_size)
363
+ val_split = int(total_size * val_size)
364
+
365
+ test_data = training_examples[:test_split]
366
+ val_data = training_examples[test_split:test_split + val_split]
367
+ train_data = training_examples[test_split + val_split:]
368
+
369
+ print(f"\nDataset splits:")
370
+ print(f" Train: {len(train_data)} examples")
371
+ print(f" Validation: {len(val_data)} examples")
372
+ print(f" Test: {len(test_data)} examples")
373
+
374
+ return {
375
+ 'train': train_data,
376
+ 'validation': val_data,
377
+ 'test': test_data
378
+ }
379
+
380
+ def format_for_lfm(self, example: Dict) -> str:
381
+ """
382
+ Format example for LFM model training
383
+ """
384
+ formatted = f"""### Instruction:
385
+ {example['instruction']}
386
+
387
+ ### Input:
388
+ {example['input']}
389
+
390
+ ### Response:
391
+ {example['output']}"""
392
+ return formatted
393
+
394
+ def save_datasets(self, datasets: Dict, output_dir: str):
395
+ """Save processed datasets with proper type conversion for JSON serialization"""
396
+ output_path = Path(output_dir)
397
+ output_path.mkdir(parents=True, exist_ok=True)
398
+
399
+ # Helper function to convert numpy types to Python native types
400
+ def convert_to_native(obj):
401
+ if isinstance(obj, np.integer):
402
+ return int(obj)
403
+ elif isinstance(obj, np.floating):
404
+ return float(obj)
405
+ elif isinstance(obj, np.ndarray):
406
+ return obj.tolist()
407
+ else:
408
+ return obj
409
+
410
+ # Save dataset statistics
411
+ stats = {
412
+ 'min_score_threshold': int(self.min_score),
413
+ 'dataset_sizes': {
414
+ 'train': len(datasets['train']),
415
+ 'validation': len(datasets['validation']),
416
+ 'test': len(datasets['test'])
417
+ },
418
+ 'score_distribution': {}
419
+ }
420
+
421
+ for split_name, data in datasets.items():
422
+ # Calculate score distribution for this split
423
+ scores = [ex['score'] for ex in data]
424
+ if scores:
425
+ stats['score_distribution'][split_name] = {
426
+ 'mean': float(np.mean(scores)),
427
+ 'median': float(np.median(scores)),
428
+ 'min': float(np.min(scores)),
429
+ 'max': float(np.max(scores)),
430
+ 'std': float(np.std(scores))
431
+ }
432
+
433
+ # Save as JSONL for easier streaming
434
+ file_path = output_path / f"{split_name}.jsonl"
435
+ with open(file_path, 'w', encoding='utf-8') as f:
436
+ for example in data:
437
+ formatted_text = self.format_for_lfm(example)
438
+ # Convert all numpy types to native Python types
439
+ json_obj = {
440
+ 'text': formatted_text,
441
+ 'score': convert_to_native(example['score']),
442
+ 'topic': example['topic']
443
+ }
444
+ json_line = json.dumps(json_obj, ensure_ascii=False)
445
+ f.write(json_line + '\n')
446
+
447
+ print(f"Saved {split_name} dataset with {len(data)} examples to {file_path}")
448
+
449
+ # Save statistics
450
+ stats_path = output_path / "dataset_stats.json"
451
+ with open(stats_path, 'w', encoding='utf-8') as f:
452
+ json.dump(stats, f, ensure_ascii=False, indent=2)
453
+ print(f"Saved dataset statistics to {stats_path}")
454
+
455
+ # Print summary statistics
456
+ print("\n" + "="*60)
457
+ print("DATASET SUMMARY")
458
+ print("="*60)
459
+ print(f"Minimum score threshold: {stats['min_score_threshold']}")
460
+ print("\nDataset sizes:")
461
+ for split, size in stats['dataset_sizes'].items():
462
+ print(f" {split}: {size} examples")
463
+
464
+ print("\nScore distributions by split:")
465
+ for split, dist in stats['score_distribution'].items():
466
+ print(f" {split}:")
467
+ print(f" Mean: {dist['mean']:.2f}")
468
+ print(f" Std: {dist['std']:.2f}")
469
+ print(f" Range: {dist['min']:.0f} - {dist['max']:.0f}")
470
+ print("="*60)
471
+
472
+ # Run preprocessing with different score thresholds
473
+ if __name__ == "__main__":
474
+ import argparse
475
+
476
+ parser = argparse.ArgumentParser(description='Preprocess KokoroChat dataset')
477
+ parser.add_argument('--data_path', type=str, default='./KokoroChat',
478
+ help='Path to KokoroChat repository')
479
+ parser.add_argument('--min_score', type=int, default=70,
480
+ help='Minimum score threshold for filtering (default: 70)')
481
+ parser.add_argument('--output_dir', type=str, default='./processed_data',
482
+ help='Output directory for processed data')
483
+ parser.add_argument('--weighted_sampling', action='store_true',
484
+ help='Use weighted sampling based on scores')
485
+ parser.add_argument('--test_size', type=float, default=0.1,
486
+ help='Test set size (default: 0.1)')
487
+ parser.add_argument('--val_size', type=float, default=0.1,
488
+ help='Validation set size (default: 0.1)')
489
+ parser.add_argument('--analyze_only', action='store_true',
490
+ help='Only analyze score distribution without processing')
491
+
492
+ args = parser.parse_args()
493
+
494
+ # Initialize preprocessor with configurable min_score
495
+ preprocessor = KokoroChatPreprocessor(
496
+ data_path=args.data_path,
497
+ min_score=args.min_score
498
+ )
499
+
500
+ if args.analyze_only:
501
+ # Just analyze the score distribution
502
+ print("Running score distribution analysis only...")
503
+ json_files = preprocessor.load_json_files()
504
+ stats = preprocessor.analyze_score_distribution(json_files)
505
+ preprocessor.plot_score_distribution(f"score_analysis_threshold_{args.min_score}.png")
506
+ else:
507
+ # Full preprocessing
508
+ print(f"Processing with minimum score threshold: {args.min_score}")
509
+ datasets = preprocessor.prepare_dataset(
510
+ test_size=args.test_size,
511
+ val_size=args.val_size,
512
+ use_weighted_sampling=args.weighted_sampling,
513
+ analyze_scores=True
514
+ )
515
+
516
+ # Save with threshold in directory name
517
+ output_dir = f"{args.output_dir}_score{args.min_score}"
518
+ preprocessor.save_datasets(datasets, output_dir)
519
+
520
+ print(f"\nProcessing complete! Data saved to {output_dir}")
521
+ print("\nNext steps:")
522
+ print("1. Run fine-tuning: python finetune_lfm.py")
523
+ print("2. Run benchmarking: python benchmark_model.py")
524
+ print("3. Optimize for mobile: python optimize_for_mobile.py")
finalmerged_model.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73fdb12fa8819f3d5160ec5414e55e827d08d1d69874a4168035b7f0c9fb02a4
3
+ size 1806737356
finetune_lfm.py ADDED
@@ -0,0 +1,1311 @@
1
+ # import torch
2
+ # from transformers import (
3
+ # AutoModelForCausalLM,
4
+ # AutoTokenizer,
5
+ # TrainingArguments,
6
+ # Trainer,
7
+ # DataCollatorForLanguageModeling,
8
+ # BitsAndBytesConfig
9
+ # )
10
+ # from peft import (
11
+ # LoraConfig,
12
+ # get_peft_model,
13
+ # prepare_model_for_kbit_training,
14
+ # TaskType
15
+ # )
16
+ # from datasets import load_dataset, Dataset
17
+ # import os
18
+ # from typing import Dict, List, Optional
19
+ # import numpy as np
20
+ # from tqdm import tqdm
21
+ # import json
22
+ # import gc
23
+ # import warnings
24
+ # warnings.filterwarnings('ignore')
25
+
26
+ # class LFMCounselorFineTuner:
27
+ # def __init__(self, model_name: str = "LiquidAI/LFM2-2.6B", use_4bit: bool = True):
28
+ # """
29
+ # Initialize the fine-tuner for LFM models
30
+
31
+ # Args:
32
+ # model_name: Name of the base model
33
+ # use_4bit: Whether to use 4-bit quantization for memory efficiency
34
+ # """
35
+ # self.model_name = model_name
36
+ # self.use_4bit = use_4bit
37
+ # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
38
+
39
+ # print(f"Using device: {self.device}")
40
+ # if torch.cuda.is_available():
41
+ # print(f"GPU: {torch.cuda.get_device_name(0)}")
42
+ # print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
43
+
44
+ # # Disable wandb for simplicity
45
+ # os.environ["WANDB_DISABLED"] = "true"
46
+
47
+ # def setup_model_and_tokenizer(self):
48
+ # """Setup model with quantization and LoRA"""
49
+
50
+ # print("Loading tokenizer...")
51
+ # # Tokenizer setup
52
+ # try:
53
+ # self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
54
+ # except:
55
+ # # Fallback to a known working tokenizer if model-specific one fails
56
+ # print("Using fallback tokenizer...")
57
+ # self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
58
+
59
+ # # Add padding token if it doesn't exist
60
+ # if self.tokenizer.pad_token is None:
61
+ # self.tokenizer.pad_token = self.tokenizer.eos_token
62
+ # if self.tokenizer.eos_token is None:
63
+ # self.tokenizer.eos_token = "</s>"
64
+ # self.tokenizer.pad_token = "</s>"
65
+
66
+ # self.tokenizer.padding_side = "right"
67
+
68
+ # # Quantization config for memory efficiency
69
+ # if self.use_4bit:
70
+ # print("Setting up 4-bit quantization...")
71
+ # bnb_config = BitsAndBytesConfig(
72
+ # load_in_4bit=True,
73
+ # bnb_4bit_quant_type="nf4",
74
+ # bnb_4bit_compute_dtype=torch.float16, # Use float16 for better compatibility
75
+ # bnb_4bit_use_double_quant=True
76
+ # )
77
+ # else:
78
+ # bnb_config = None
79
+
80
+ # # Load model
81
+ # print(f"Loading model: {self.model_name}...")
82
+ # try:
83
+ # self.model = AutoModelForCausalLM.from_pretrained(
84
+ # self.model_name,
85
+ # quantization_config=bnb_config,
86
+ # device_map="auto",
87
+ # trust_remote_code=True,
88
+ # torch_dtype=torch.float16
89
+ # )
90
+ # except Exception as e:
91
+ # print(f"Error loading model: {e}")
92
+ # print("Attempting to load without quantization...")
93
+ # self.model = AutoModelForCausalLM.from_pretrained(
94
+ # self.model_name,
95
+ # device_map="auto",
96
+ # trust_remote_code=True,
97
+ # torch_dtype=torch.float16,
98
+ # low_cpu_mem_usage=True
99
+ # )
100
+
101
+ # # Enable gradient checkpointing to save memory
102
+ # if hasattr(self.model, 'gradient_checkpointing_enable'):
103
+ # self.model.gradient_checkpointing_enable()
104
+
105
+ # # Prepare model for k-bit training
106
+ # if self.use_4bit:
107
+ # print("Preparing model for 4-bit training...")
108
+ # self.model = prepare_model_for_kbit_training(self.model)
109
+
110
+ # # LoRA configuration - optimized for counseling task
111
+ # print("Applying LoRA configuration...")
112
+
113
+ # # Find the target modules dynamically
114
+ # target_modules = self.find_target_modules()
115
+
116
+ # lora_config = LoraConfig(
117
+ # r=16, # Reduced rank for stability
118
+ # lora_alpha=32, # Alpha parameter for LoRA scaling
119
+ # target_modules=target_modules,
120
+ # lora_dropout=0.05,
121
+ # bias="none",
122
+ # task_type=TaskType.CAUSAL_LM,
123
+ # inference_mode=False
124
+ # )
125
+
126
+ # # Apply LoRA
127
+ # self.model = get_peft_model(self.model, lora_config)
128
+
129
+ # # Print trainable parameters
130
+ # self.model.print_trainable_parameters()
131
+
132
+ # def find_target_modules(self):
133
+ # """Find linear modules to apply LoRA to"""
134
+ # target_modules = []
135
+ # for name, module in self.model.named_modules():
136
+ # if isinstance(module, torch.nn.Linear):
137
+ # # Extract the module name
138
+ # names = name.split('.')
139
+ # if len(names) > 0:
140
+ # target_modules.append(names[-1])
141
+
142
+ # # Remove duplicates and filter common patterns
143
+ # target_modules = list(set(target_modules))
144
+
145
+ # # Common patterns for transformer models
146
+ # common_targets = ["q_proj", "v_proj", "k_proj", "o_proj",
147
+ # "gate_proj", "up_proj", "down_proj",
148
+ # "fc1", "fc2", "query", "key", "value", "dense"]
149
+
150
+ # # Filter to only include common targets if they exist
151
+ # final_targets = [t for t in target_modules if any(ct in t.lower() for ct in common_targets)]
152
+
153
+ # # If no common targets found, use all linear layers
154
+ # if not final_targets:
155
+ # final_targets = target_modules[:6] # Limit to prevent too many parameters
156
+
157
+ # print(f"LoRA target modules: {final_targets}")
158
+ # return final_targets if final_targets else ["q_proj", "v_proj"] # Fallback
159
+
160
+ # def load_and_process_datasets(self, data_path: str):
161
+ # """Load and process datasets without multiprocessing issues"""
162
+
163
+ # print(f"Loading datasets from {data_path}...")
164
+
165
+ # # Load train dataset
166
+ # train_texts = []
167
+ # with open(f'{data_path}/train.jsonl', 'r', encoding='utf-8') as f:
168
+ # for line in tqdm(f, desc="Loading training data"):
169
+ # data = json.loads(line)
170
+ # train_texts.append(data['text'])
171
+
172
+ # # Load validation dataset
173
+ # val_texts = []
174
+ # with open(f'{data_path}/validation.jsonl', 'r', encoding='utf-8') as f:
175
+ # for line in tqdm(f, desc="Loading validation data"):
176
+ # data = json.loads(line)
177
+ # val_texts.append(data['text'])
178
+
179
+ # print(f"Loaded {len(train_texts)} training examples")
180
+ # print(f"Loaded {len(val_texts)} validation examples")
181
+
182
+ # # Tokenize datasets in batches (avoiding multiprocessing)
183
+ # print("Tokenizing training dataset...")
184
+ # train_encodings = self.tokenize_texts(train_texts)
185
+
186
+ # print("Tokenizing validation dataset...")
187
+ # val_encodings = self.tokenize_texts(val_texts)
188
+
189
+ # # Create datasets
190
+ # self.train_dataset = Dataset.from_dict(train_encodings)
191
+ # self.val_dataset = Dataset.from_dict(val_encodings)
192
+
193
+ # # Set format for PyTorch
194
+ # self.train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
195
+ # self.val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
196
+
197
+ # # Clean up memory
198
+ # del train_texts, val_texts, train_encodings, val_encodings
199
+ # gc.collect()
200
+
201
+ # def tokenize_texts(self, texts: List[str], batch_size: int = 100):
202
+ # """Tokenize texts in batches to avoid memory issues"""
203
+ # all_input_ids = []
204
+ # all_attention_masks = []
205
+
206
+ # for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
207
+ # batch_texts = texts[i:i + batch_size]
208
+
209
+ # # Tokenize batch
210
+ # encodings = self.tokenizer(
211
+ # batch_texts,
212
+ # truncation=True,
213
+ # padding='max_length',
214
+ # max_length=512,
215
+ # return_tensors='pt'
216
+ # )
217
+
218
+ # # Convert to lists
219
+ # all_input_ids.extend(encodings['input_ids'].tolist())
220
+ # all_attention_masks.extend(encodings['attention_mask'].tolist())
221
+
222
+ # # Create labels (same as input_ids for language modeling)
223
+ # labels = all_input_ids.copy()
224
+
225
+ # return {
226
+ # 'input_ids': all_input_ids,
227
+ # 'attention_mask': all_attention_masks,
228
+ # 'labels': labels
229
+ # }
230
+
231
+ # def setup_training_args(self, output_dir: str = "./counselor_model_2b"):
232
+ # """Setup training arguments optimized for counseling task"""
233
+
234
+ # print("Setting up training arguments...")
235
+
236
+ # # Calculate batch sizes based on available memory
237
+ # if torch.cuda.is_available():
238
+ # gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
239
+ # if gpu_memory < 16: # Less than 16GB
240
+ # batch_size = 1
241
+ # gradient_accumulation = 16
242
+ # elif gpu_memory < 24: # Less than 24GB
243
+ # batch_size = 2
244
+ # gradient_accumulation = 8
245
+ # else: # 24GB or more
246
+ # batch_size = 4
247
+ # gradient_accumulation = 4
248
+ # else:
249
+ # batch_size = 1
250
+ # gradient_accumulation = 16
251
+
252
+ # print(f"Using batch_size={batch_size}, gradient_accumulation={gradient_accumulation}")
253
+
254
+ # self.training_args = TrainingArguments(
255
+ # output_dir=output_dir,
256
+ # num_train_epochs=3,
257
+ # per_device_train_batch_size=batch_size,
258
+ # per_device_eval_batch_size=batch_size,
259
+ # gradient_accumulation_steps=gradient_accumulation,
260
+ # gradient_checkpointing=True,
261
+ # warmup_steps=100,
262
+ # learning_rate=5e-5, # Conservative learning rate
263
+ # fp16=True,
264
+ # logging_steps=50,
265
+ # eval_strategy="steps",
266
+ # eval_steps=200,
267
+ # save_strategy="steps",
268
+ # save_steps=400,
269
+ # save_total_limit=2,
270
+ # load_best_model_at_end=True,
271
+ # metric_for_best_model="eval_loss",
272
+ # greater_is_better=False,
273
+ # report_to="none", # Disable all reporting
274
+ # push_to_hub=False,
275
+ # optim="adamw_torch", # Use standard optimizer
276
+ # lr_scheduler_type="linear",
277
+ # weight_decay=0.01,
278
+ # max_grad_norm=1.0,
279
+ # remove_unused_columns=False,
280
+ # label_names=["labels"],
281
+ # dataloader_num_workers=0, # Disable multiprocessing in dataloader
282
+ # dataloader_pin_memory=False, # Disable pinned memory to avoid issues
283
+ # )
284
+
285
+ # def train(self):
286
+ # """Execute training"""
287
+
288
+ # print("Initializing trainer...")
289
+
290
+ # # Data collator for language modeling
291
+ # data_collator = DataCollatorForLanguageModeling(
292
+ # tokenizer=self.tokenizer,
293
+ # mlm=False,
294
+ # pad_to_multiple_of=8
295
+ # )
296
+
297
+ # # Custom training to handle potential issues
298
+ # try:
299
+ # # Initialize trainer
300
+ # trainer = Trainer(
301
+ # model=self.model,
302
+ # args=self.training_args,
303
+ # train_dataset=self.train_dataset,
304
+ # eval_dataset=self.val_dataset,
305
+ # data_collator=data_collator,
306
+ # tokenizer=self.tokenizer,
307
+ # )
308
+
309
+ # # Start training
310
+ # print("="*50)
311
+ # print("Starting fine-tuning...")
312
+ # print("="*50)
313
+
314
+ # # Train with error handling
315
+ # train_result = trainer.train()
316
+
317
+ # # Save the final model
318
+ # print("\nSaving fine-tuned model...")
319
+ # trainer.save_model(f"{self.training_args.output_dir}/final_model_2b")
320
+ # self.tokenizer.save_pretrained(f"{self.training_args.output_dir}/final_model_2b")
321
+
322
+ # # Save training metrics
323
+ # with open(f"{self.training_args.output_dir}/training_metrics.json", 'w') as f:
324
+ # json.dump(train_result.metrics, f, indent=2)
325
+
326
+ # print("\n" + "="*50)
327
+ # print("Training completed successfully!")
328
+ # print(f"Model saved to: {self.training_args.output_dir}/final_model_2b")
329
+ # print("="*50)
330
+
331
+ # return trainer
332
+
333
+ # except Exception as e:
334
+ # print(f"Error during training: {e}")
335
+ # print("Attempting to save checkpoint...")
336
+
337
+ # # Try to save whatever we have
338
+ # try:
339
+ # self.model.save_pretrained(f"{self.training_args.output_dir}/checkpoint_emergency")
340
+ # self.tokenizer.save_pretrained(f"{self.training_args.output_dir}/checkpoint_emergency")
341
+ # print(f"Emergency checkpoint saved to: {self.training_args.output_dir}/checkpoint_emergency")
342
+ # except:
343
+ # print("Could not save emergency checkpoint")
344
+
345
+ # raise e
346
+
347
+ # def test_model(model_path: str, tokenizer_path: str):
348
+ # """Test the fine-tuned model with a sample input"""
349
+
350
+ # print("\n" + "="*50)
351
+ # print("Testing fine-tuned model...")
352
+ # print("="*50)
353
+
354
+ # # Load model and tokenizer
355
+ # from peft import PeftModel, PeftConfig
356
+
357
+ # tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
358
+
359
+ # # Try to load as PEFT model
360
+ # try:
361
+ # config = PeftConfig.from_pretrained(model_path)
362
+ # model = AutoModelForCausalLM.from_pretrained(
363
+ # config.base_model_name_or_path,
364
+ # torch_dtype=torch.float16,
365
+ # device_map="auto"
366
+ # )
367
+ # model = PeftModel.from_pretrained(model, model_path)
368
+ # except:
369
+ # # Load as regular model
370
+ # model = AutoModelForCausalLM.from_pretrained(
371
+ # model_path,
372
+ # torch_dtype=torch.float16,
373
+ # device_map="auto"
374
+ # )
375
+
376
+ # model.eval()
377
+
378
+ # # Test input
379
+ # test_input = "こんにちは。最近ストレスを感じています。"
380
+
381
+ # # Generate response
382
+ # inputs = tokenizer(test_input, return_tensors="pt")
383
+ # inputs = {k: v.cuda() if torch.cuda.is_available() else v for k, v in inputs.items()}
384
+
385
+ # with torch.no_grad():
386
+ # outputs = model.generate(
387
+ # **inputs,
388
+ # max_new_tokens=100,
389
+ # temperature=0.1,
390
+ # do_sample=True,
391
+ # top_p=0.9
392
+ # )
393
+
394
+ # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
395
+ # print(f"Input: {test_input}")
396
+ # print(f"Response: {response}")
397
+ # print("="*50)
398
+
399
+ # # Main training script
400
+ # if __name__ == "__main__":
401
+ # import argparse
402
+
403
+ # parser = argparse.ArgumentParser(description='Fine-tune LFM model for counseling')
404
+ # parser.add_argument('--model_name', type=str, default='gpt2', # Using GPT2 as fallback
405
+ # help='Base model name (use gpt2 if liquid model fails)')
406
+ # parser.add_argument('--data_path', type=str, default='./processed_data_score80',
407
+ # help='Path to processed data')
408
+ # parser.add_argument('--output_dir', type=str, default='./counselor_model_2b',
409
+ # help='Output directory for fine-tuned model')
410
+ # parser.add_argument('--use_4bit', action='store_true', default=False,
411
+ # help='Use 4-bit quantization (set to False for stability)')
412
+ # parser.add_argument('--test_only', action='store_true',
413
+ # help='Only test existing model')
414
+
415
+ # args = parser.parse_args()
416
+
417
+ # if args.test_only:
418
+ # # Test existing model
419
+ # test_model(
420
+ # f"{args.output_dir}/final_model_2b",
421
+ # f"{args.output_dir}/final_model_2b"
422
+ # )
423
+ # else:
424
+ # # Check if CUDA is available
425
+ # if not torch.cuda.is_available():
426
+ # print("Warning: CUDA is not available. Training will be very slow on CPU.")
427
+ # print("It's highly recommended to use a GPU for training.")
428
+ # response = input("Do you want to continue anyway? (y/n): ")
429
+ # if response.lower() != 'y':
430
+ # exit()
431
+
432
+ # try:
433
+ # # Clear GPU cache
434
+ # if torch.cuda.is_available():
435
+ # torch.cuda.empty_cache()
436
+
437
+ # # Initialize fine-tuner
438
+ # print(f"Initializing fine-tuner with model: {args.model_name}")
439
+ # finetuner = LFMCounselorFineTuner(
440
+ # model_name=args.model_name,
441
+ # use_4bit=args.use_4bit
442
+ # )
443
+
444
+ # # Setup model
445
+ # print("\nSetting up model and tokenizer...")
446
+ # finetuner.setup_model_and_tokenizer()
447
+
448
+ # # Load datasets (using new method without multiprocessing)
449
+ # print("\nLoading and processing datasets...")
450
+ # finetuner.load_and_process_datasets(args.data_path)
451
+
452
+ # # Setup training arguments
453
+ # print("\nSetting up training arguments...")
454
+ # finetuner.setup_training_args(args.output_dir)
455
+
456
+ # # Train
457
+ # trainer = finetuner.train()
458
+
459
+ # # Test the model
460
+ # print("\nTesting the fine-tuned model...")
461
+ # test_model(
462
+ # f"{args.output_dir}/final_model_2b",
463
+ # f"{args.output_dir}/final_model_2b"
464
+ # )
465
+
466
+ # print("\n✅ Fine-tuning completed successfully!")
467
+ # print(f"📁 Model saved to: {args.output_dir}/final_model_2b")
468
+ # print("\nNext steps:")
469
+ # print("1. Test more: python finetune_lfm.py --test_only")
470
+ # print("2. Run benchmarking: python benchmark_model.py")
471
+ # print("3. Optimize for mobile: python optimize_for_mobile.py")
472
+
473
+ # except KeyboardInterrupt:
474
+ # print("\n\nTraining interrupted by user.")
475
+ # print("Partial model may be saved in checkpoints.")
476
+ # except Exception as e:
477
+ # print(f"\n❌ Error during fine-tuning: {e}")
478
+ # import traceback
479
+ # traceback.print_exc()
480
+ # print("\nTroubleshooting tips:")
481
+ # print("1. Try reducing batch size")
482
+ # print("2. Try without 4-bit quantization: remove --use_4bit")
483
+ # print("3. Try with a smaller model like gpt2")
484
+ # print("4. Ensure you have enough GPU memory")
485
+
486
+
487
+
488
+ ###### wandb login ######
489
+
490
+ import torch
491
+ from transformers import (
492
+ AutoModelForCausalLM,
493
+ AutoTokenizer,
494
+ TrainingArguments,
495
+ Trainer,
496
+ DataCollatorForLanguageModeling,
497
+ BitsAndBytesConfig,
498
+ TrainerCallback
499
+ )
500
+ from peft import (
501
+ LoraConfig,
502
+ get_peft_model,
503
+ prepare_model_for_kbit_training,
504
+ TaskType
505
+ )
506
+ from datasets import load_dataset, Dataset
507
+ import os
508
+ from typing import Dict, List, Optional
509
+ import numpy as np
510
+ from tqdm import tqdm
511
+ import json
512
+ import gc
513
+ import warnings
514
+ import wandb
515
+ from datetime import datetime
516
+
517
+ warnings.filterwarnings('ignore')
518
+
519
+ class LFMCounselorFineTuner:
520
+ def __init__(self, model_name: str = "LiquidAI/LFM2-2.6B", use_4bit: bool = True):
521
+ """
522
+ Initialize the fine-tuner for LFM models
523
+
524
+ Args:
525
+ model_name: Name of the base model
526
+ use_4bit: Whether to use 4-bit quantization for memory efficiency
527
+ """
528
+ self.model_name = model_name
529
+ self.use_4bit = use_4bit
530
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
531
+
532
+ print(f"Using device: {self.device}")
533
+ gpu_memory = 0
534
+ if torch.cuda.is_available():
535
+ gpu_name = torch.cuda.get_device_name(0)
536
+ gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
537
+ print(f"GPU: {gpu_name}")
538
+ print(f"GPU Memory: {gpu_memory:.2f} GB")
539
+
540
+ # Initialize WandB (always enabled)
541
+ try:
542
+ # Create a unique run name with timestamp
543
+ run_name = f"lfm-counselor-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
544
+
545
+ # Initialize wandb with comprehensive config
546
+ wandb.init(
547
+ project="liquid-counselor-hackathon",
548
+ name=run_name,
549
+ config={
550
+ "model_name": model_name,
551
+ "use_4bit_quantization": use_4bit,
552
+ "device": str(self.device),
553
+ "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU",
554
+ "gpu_memory_gb": gpu_memory,
555
+ "framework": "transformers",
556
+ "peft_method": "LoRA",
557
+ "task": "japanese_counseling",
558
+ "dataset": "KokoroChat"
559
+ },
560
+ tags=["counseling", "japanese", "lfm", "finetune", "hackathon"]
561
+ )
562
+ print(f"✅ WandB initialized: {wandb.run.name}")
563
+ print(f"📊 View run at: {wandb.run.get_url()}")
564
+ self.wandb_enabled = True
565
+ except Exception as e:
566
+ print(f"⚠️ WandB initialization failed: {e}")
567
+ print("Continuing without WandB logging...")
568
+ self.wandb_enabled = False
569
+ os.environ["WANDB_DISABLED"] = "true"
570
+
571
+ def setup_model_and_tokenizer(self):
572
+ """Setup model with quantization and LoRA"""
573
+
574
+ print("Loading tokenizer...")
575
+ # Tokenizer setup
576
+ try:
577
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
578
+ except:
579
+ # Fallback to a known working tokenizer if model-specific one fails
580
+ print("Using fallback tokenizer...")
581
+ self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
582
+
583
+ # Add padding token if it doesn't exist
584
+ if self.tokenizer.pad_token is None:
585
+ self.tokenizer.pad_token = self.tokenizer.eos_token
586
+ if self.tokenizer.eos_token is None:
587
+ self.tokenizer.eos_token = "</s>"
588
+ self.tokenizer.pad_token = "</s>"
589
+
590
+ self.tokenizer.padding_side = "right"
591
+
592
+ # Quantization config for memory efficiency
593
+ if self.use_4bit:
594
+ print("Setting up 4-bit quantization...")
595
+ bnb_config = BitsAndBytesConfig(
596
+ load_in_4bit=True,
597
+ bnb_4bit_quant_type="nf4",
598
+ bnb_4bit_compute_dtype=torch.float16,
599
+ bnb_4bit_use_double_quant=True
600
+ )
601
+ else:
602
+ bnb_config = None
603
+
604
+ # Load model
605
+ print(f"Loading model: {self.model_name}...")
606
+ try:
607
+ self.model = AutoModelForCausalLM.from_pretrained(
608
+ self.model_name,
609
+ quantization_config=bnb_config,
610
+ device_map="auto",
611
+ trust_remote_code=True,
612
+ torch_dtype=torch.float16
613
+ )
614
+ except Exception as e:
615
+ print(f"Error loading model: {e}")
616
+ print("Attempting to load without quantization...")
617
+ self.model = AutoModelForCausalLM.from_pretrained(
618
+ self.model_name,
619
+ device_map="auto",
620
+ trust_remote_code=True,
621
+ torch_dtype=torch.float16,
622
+ low_cpu_mem_usage=True
623
+ )
624
+
625
+ # Enable gradient checkpointing to save memory
626
+ if hasattr(self.model, 'gradient_checkpointing_enable'):
627
+ self.model.gradient_checkpointing_enable()
628
+
629
+ # Prepare model for k-bit training
630
+ if self.use_4bit:
631
+ print("Preparing model for 4-bit training...")
632
+ self.model = prepare_model_for_kbit_training(self.model)
633
+
634
+ # LoRA configuration - optimized for counseling task
635
+ print("Applying LoRA configuration...")
636
+
637
+ # Find the target modules dynamically
638
+ target_modules = self.find_target_modules()
639
+
640
+ lora_config = LoraConfig(
641
+ r=16, # Reduced rank for stability
642
+ lora_alpha=32, # Alpha parameter for LoRA scaling
643
+ target_modules=target_modules,
644
+ lora_dropout=0.05,
645
+ bias="none",
646
+ task_type=TaskType.CAUSAL_LM,
647
+ inference_mode=False
648
+ )
649
+
650
+ # Apply LoRA
651
+ self.model = get_peft_model(self.model, lora_config)
652
+
653
+ # Get trainable parameters info
654
+ trainable_params = 0
655
+ all_params = 0
656
+ for _, param in self.model.named_parameters():
657
+ all_params += param.numel()
658
+ if param.requires_grad:
659
+ trainable_params += param.numel()
660
+
661
+ trainable_percentage = 100 * trainable_params / all_params if all_params > 0 else 0
662
+
663
+ print(f"Trainable parameters: {trainable_params:,} / {all_params:,} ({trainable_percentage:.2f}%)")
664
+
665
+ # Log model architecture to WandB
666
+ if self.wandb_enabled:
667
+ wandb.config.update({
668
+ "lora_r": lora_config.r,
669
+ "lora_alpha": lora_config.lora_alpha,
670
+ "lora_dropout": lora_config.lora_dropout,
671
+ "lora_target_modules": target_modules,
672
+ "total_parameters": all_params,
673
+ "trainable_parameters": trainable_params,
674
+ "trainable_percentage": trainable_percentage
675
+ })
676
+
677
+ self.model.print_trainable_parameters()
678
+
679
+ def find_target_modules(self):
680
+ """Find linear modules to apply LoRA to"""
681
+ target_modules = []
682
+ for name, module in self.model.named_modules():
683
+ if isinstance(module, torch.nn.Linear):
684
+ # Extract the module name
685
+ names = name.split('.')
686
+ if len(names) > 0:
687
+ target_modules.append(names[-1])
688
+
689
+ # Remove duplicates and filter common patterns
690
+ target_modules = list(set(target_modules))
691
+
692
+ # Common patterns for transformer models
693
+ common_targets = ["q_proj", "v_proj", "k_proj", "o_proj",
694
+ "gate_proj", "up_proj", "down_proj",
695
+ "fc1", "fc2", "query", "key", "value", "dense"]
696
+
697
+ # Filter to only include common targets if they exist
698
+ final_targets = [t for t in target_modules if any(ct in t.lower() for ct in common_targets)]
699
+
700
+ # If no common targets found, use all linear layers
701
+ if not final_targets:
702
+ final_targets = target_modules[:6] # Limit to prevent too many parameters
703
+
704
+ print(f"LoRA target modules: {final_targets}")
705
+ return final_targets if final_targets else ["q_proj", "v_proj"] # Fallback
706
+
707
+ def load_and_process_datasets(self, data_path: str):
708
+ """Load and process datasets without multiprocessing issues"""
709
+
710
+ print(f"Loading datasets from {data_path}...")
711
+
712
+ # Load train dataset
713
+ train_texts = []
714
+ train_scores = []
715
+ train_topics = []
716
+
717
+ with open(f'{data_path}/train.jsonl', 'r', encoding='utf-8') as f:
718
+ for line in tqdm(f, desc="Loading training data"):
719
+ data = json.loads(line)
720
+ train_texts.append(data['text'])
721
+ train_scores.append(data.get('score', 0))
722
+ train_topics.append(data.get('topic', 'Unknown'))
723
+
724
+ # Load validation dataset
725
+ val_texts = []
726
+ val_scores = []
727
+ val_topics = []
728
+
729
+ with open(f'{data_path}/validation.jsonl', 'r', encoding='utf-8') as f:
730
+ for line in tqdm(f, desc="Loading validation data"):
731
+ data = json.loads(line)
732
+ val_texts.append(data['text'])
733
+ val_scores.append(data.get('score', 0))
734
+ val_topics.append(data.get('topic', 'Unknown'))
735
+
736
+ print(f"Loaded {len(train_texts)} training examples")
737
+ print(f"Loaded {len(val_texts)} validation examples")
738
+
739
+ # Log dataset statistics to WandB
740
+ if self.wandb_enabled:
741
+ # Calculate score statistics
742
+ train_score_stats = {
743
+ "train_examples": len(train_texts),
744
+ "train_avg_score": float(np.mean(train_scores)),
745
+ "train_min_score": float(np.min(train_scores)),
746
+ "train_max_score": float(np.max(train_scores)),
747
+ "train_std_score": float(np.std(train_scores))
748
+ }
749
+
750
+ val_score_stats = {
751
+ "val_examples": len(val_texts),
752
+ "val_avg_score": float(np.mean(val_scores)),
753
+ "val_min_score": float(np.min(val_scores)),
754
+ "val_max_score": float(np.max(val_scores)),
755
+ "val_std_score": float(np.std(val_scores))
756
+ }
757
+
758
+ wandb.config.update(train_score_stats)
759
+ wandb.config.update(val_score_stats)
760
+
761
+ # Log score distribution histogram
762
+ wandb.log({
763
+ "train_score_distribution": wandb.Histogram(train_scores),
764
+ "val_score_distribution": wandb.Histogram(val_scores)
765
+ })
766
+
767
+ # Log topic distribution
768
+ train_topic_counts = {}
769
+ for topic in train_topics:
770
+ train_topic_counts[topic] = train_topic_counts.get(topic, 0) + 1
771
+
772
+ # Create a bar chart for topics (top 20)
773
+ if len(train_topic_counts) > 0:
774
+ top_topics = sorted(train_topic_counts.items(), key=lambda x: x[1], reverse=True)[:20]
775
+ wandb.log({
776
+ "topic_distribution": wandb.plot.bar(
777
+ wandb.Table(data=[[k, v] for k, v in top_topics],
778
+ columns=["Topic", "Count"]),
779
+ "Topic", "Count", title="Training Topic Distribution (Top 20)"
780
+ )
781
+ })
782
+
783
+ # Tokenize datasets in batches (avoiding multiprocessing)
784
+ print("Tokenizing training dataset...")
785
+ train_encodings = self.tokenize_texts(train_texts)
786
+
787
+ print("Tokenizing validation dataset...")
788
+ val_encodings = self.tokenize_texts(val_texts)
789
+
790
+ # Create datasets
791
+ self.train_dataset = Dataset.from_dict(train_encodings)
792
+ self.val_dataset = Dataset.from_dict(val_encodings)
793
+
794
+ # Set format for PyTorch
795
+ self.train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
796
+ self.val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
797
+
798
+ # Clean up memory
799
+ del train_texts, val_texts, train_encodings, val_encodings
800
+ gc.collect()
801
+
802
+ def tokenize_texts(self, texts: List[str], batch_size: int = 100):
803
+ """Tokenize texts in batches to avoid memory issues"""
804
+ all_input_ids = []
805
+ all_attention_masks = []
806
+
807
+ for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
808
+ batch_texts = texts[i:i + batch_size]
809
+
810
+ # Tokenize batch
811
+ encodings = self.tokenizer(
812
+ batch_texts,
813
+ truncation=True,
814
+ padding='max_length',
815
+ max_length=512,
816
+ return_tensors='pt'
817
+ )
818
+
819
+ # Convert to lists
820
+ all_input_ids.extend(encodings['input_ids'].tolist())
821
+ all_attention_masks.extend(encodings['attention_mask'].tolist())
822
+
823
+ # Create labels (same as input_ids for language modeling)
824
+ labels = all_input_ids.copy()
825
+
826
+ return {
827
+ 'input_ids': all_input_ids,
828
+ 'attention_mask': all_attention_masks,
829
+ 'labels': labels
830
+ }
831
+
832
+ def setup_training_args(self, output_dir: str = "./counselor_model_2b"):
833
+ """Setup training arguments optimized for counseling task"""
834
+
835
+ print("Setting up training arguments...")
836
+
837
+ # Calculate batch sizes based on available memory
838
+ if torch.cuda.is_available():
839
+ gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
840
+ if gpu_memory < 16: # Less than 16GB
841
+ batch_size = 1
842
+ gradient_accumulation = 16
843
+ elif gpu_memory < 24: # Less than 24GB
844
+ batch_size = 2
845
+ gradient_accumulation = 8
846
+ else: # 24GB or more
847
+ batch_size = 4
848
+ gradient_accumulation = 4
849
+ else:
850
+ batch_size = 1
851
+ gradient_accumulation = 16
852
+
853
+ print(f"Using batch_size={batch_size}, gradient_accumulation={gradient_accumulation}")
854
+
855
+ # Update WandB config with training hyperparameters
856
+ if self.wandb_enabled:
857
+ wandb.config.update({
858
+ "batch_size": batch_size,
859
+ "gradient_accumulation_steps": gradient_accumulation,
860
+ "effective_batch_size": batch_size * gradient_accumulation,
861
+ "num_epochs": 3,
862
+ "learning_rate": 5e-5,
863
+ "warmup_steps": 100,
864
+ "weight_decay": 0.01,
865
+ "max_grad_norm": 1.0,
866
+ "lr_scheduler": "linear",
867
+ "optimizer": "adamw_torch",
868
+ "fp16": True,
869
+ "max_length": 512
870
+ })
871
+
872
+ # Set report_to based on wandb availability
873
+ report_to = "wandb" if self.wandb_enabled else "none"
874
+
875
+ self.training_args = TrainingArguments(
876
+ output_dir=output_dir,
877
+ num_train_epochs=3,
878
+ per_device_train_batch_size=batch_size,
879
+ per_device_eval_batch_size=batch_size,
880
+ gradient_accumulation_steps=gradient_accumulation,
881
+ gradient_checkpointing=True,
882
+ warmup_steps=100,
883
+ learning_rate=5e-5,
884
+ fp16=True,
885
+ logging_steps=50,
886
+ logging_first_step=True,
887
+ eval_strategy="steps",
888
+ eval_steps=200,
889
+ save_strategy="steps",
890
+ save_steps=400,
891
+ save_total_limit=2,
892
+ load_best_model_at_end=True,
893
+ metric_for_best_model="eval_loss",
894
+ greater_is_better=False,
895
+ report_to=report_to,
896
+ run_name=wandb.run.name if self.wandb_enabled and wandb.run else "local_run",
897
+ push_to_hub=False,
898
+ optim="adamw_torch",
899
+ lr_scheduler_type="linear",
900
+ weight_decay=0.01,
901
+ max_grad_norm=1.0,
902
+ remove_unused_columns=False,
903
+ label_names=["labels"],
904
+ dataloader_num_workers=0,
905
+ dataloader_pin_memory=False,
906
+ )
907
+
908
+ def train(self):
909
+ """Execute training"""
910
+
911
+ print("Initializing trainer...")
912
+
913
+ # Data collator for language modeling
914
+ data_collator = DataCollatorForLanguageModeling(
915
+ tokenizer=self.tokenizer,
916
+ mlm=False,
917
+ pad_to_multiple_of=8
918
+ )
919
+
920
+ # Custom callback for additional metrics (properly inheriting from TrainerCallback)
921
+ class CustomMetricsCallback(TrainerCallback):
922
+ def on_log(self, args, state, control, logs=None, **kwargs):
923
+ if logs and self.wandb_enabled:
924
+ # Add perplexity metrics
925
+ if "loss" in logs:
926
+ logs["perplexity"] = np.exp(logs["loss"])
927
+ if "eval_loss" in logs:
928
+ logs["eval_perplexity"] = np.exp(logs["eval_loss"])
929
+ return control
930
+
931
+ # Create callback instance with wandb_enabled flag
932
+ custom_callback = CustomMetricsCallback()
933
+ custom_callback.wandb_enabled = self.wandb_enabled
934
+
935
+ # Custom training to handle potential issues
936
+ try:
937
+ # Initialize trainer with callbacks
938
+ trainer = Trainer(
939
+ model=self.model,
940
+ args=self.training_args,
941
+ train_dataset=self.train_dataset,
942
+ eval_dataset=self.val_dataset,
943
+ data_collator=data_collator,
944
+ tokenizer=self.tokenizer,
945
+ callbacks=[custom_callback] if self.wandb_enabled else [],
946
+ )
947
+
948
+ # Calculate total training steps
949
+ total_steps = len(self.train_dataset) // (self.training_args.per_device_train_batch_size * self.training_args.gradient_accumulation_steps) * self.training_args.num_train_epochs
950
+
951
+ # Start training
952
+ print("="*50)
953
+ print("Starting fine-tuning...")
954
+ print(f"Total training samples: {len(self.train_dataset)}")
955
+ print(f"Total validation samples: {len(self.val_dataset)}")
956
+ print(f"Total training steps: {total_steps}")
957
+ print("="*50)
958
+
959
+ # Log training start
960
+ if self.wandb_enabled:
961
+ wandb.log({"training_status": "started", "total_steps": total_steps})
962
+
963
+ # Train with error handling
964
+ train_result = trainer.train()
965
+
966
+ # Save the final model
967
+ print("\nSaving fine-tuned model...")
968
+ trainer.save_model(f"{self.training_args.output_dir}/final_model_2b")
969
+ self.tokenizer.save_pretrained(f"{self.training_args.output_dir}/final_model_2b")
970
+
971
+ # Save training metrics
972
+ with open(f"{self.training_args.output_dir}/training_metrics.json", 'w') as f:
973
+ json.dump(train_result.metrics, f, indent=2)
974
+
975
+ # Final evaluation
976
+ print("\nRunning final evaluation...")
977
+ eval_results = trainer.evaluate()
978
+
979
+ # Save evaluation metrics
980
+ with open(f"{self.training_args.output_dir}/eval_metrics.json", 'w') as f:
981
+ json.dump(eval_results, f, indent=2)
982
+
983
+ # Log final metrics to WandB
984
+ if self.wandb_enabled:
985
+ # Log final metrics
986
+ wandb.run.summary.update({
987
+ "final_train_loss": train_result.metrics.get("train_loss", 0),
988
+ "final_eval_loss": eval_results.get("eval_loss", 0),
989
+ "final_eval_perplexity": np.exp(eval_results.get("eval_loss", 0)),
990
+ "total_training_time": train_result.metrics.get("train_runtime", 0),
991
+ "training_samples_per_second": train_result.metrics.get("train_samples_per_second", 0),
992
+ "training_status": "completed"
993
+ })
994
+
995
+ # Create a summary table
996
+ summary_table = wandb.Table(
997
+ columns=["Metric", "Value"],
998
+ data=[
999
+ ["Final Training Loss", f"{train_result.metrics.get('train_loss', 0):.4f}"],
1000
+ ["Final Eval Loss", f"{eval_results.get('eval_loss', 0):.4f}"],
1001
+ ["Final Perplexity", f"{np.exp(eval_results.get('eval_loss', 0)):.2f}"],
1002
+ ["Training Time (seconds)", f"{train_result.metrics.get('train_runtime', 0):.0f}"],
1003
+ ["Training Samples/Second", f"{train_result.metrics.get('train_samples_per_second', 0):.2f}"]
1004
+ ]
1005
+ )
1006
+ wandb.log({"training_summary": summary_table})
1007
+
1008
+ # Save model artifact
1009
+ try:
1010
+ artifact = wandb.Artifact(
1011
+ name=f"counselor-model-{wandb.run.id}",
1012
+ type="model",
1013
+ description="Fine-tuned Japanese counseling model",
1014
+ metadata={
1015
+ "base_model": self.model_name,
1016
+ "final_loss": float(eval_results.get("eval_loss", 0)),
1017
+ "final_perplexity": float(np.exp(eval_results.get("eval_loss", 0))),
1018
+ "dataset": "KokoroChat"
1019
+ }
1020
+ )
1021
+ artifact.add_dir(f"{self.training_args.output_dir}/final_model_2b")
1022
+ wandb.log_artifact(artifact)
1023
+ except Exception as e:
1024
+ print(f"Warning: Could not save model artifact: {e}")
1025
+
1026
+ print("\n" + "="*50)
1027
+ print("✅ Training completed successfully!")
1028
+ print(f"📁 Model saved to: {self.training_args.output_dir}/final_model_2b")
1029
+ print(f"📉 Final eval loss: {eval_results.get('eval_loss', 0):.4f}")
1030
+ print(f"📊 Final perplexity: {np.exp(eval_results.get('eval_loss', 0)):.2f}")
1031
+ if self.wandb_enabled and wandb.run:
1032
+ print(f"🔗 View results at: {wandb.run.get_url()}")
1033
+ print("="*50)
1034
+
1035
+ return trainer
1036
+
1037
+ except Exception as e:
1038
+ print(f"❌ Error during training: {e}")
1039
+
1040
+ # Log error to WandB
1041
+ if self.wandb_enabled:
1042
+ wandb.run.summary["training_status"] = "failed"
1043
+ wandb.run.summary["error"] = str(e)
1044
+
1045
+ print("Attempting to save checkpoint...")
1046
+
1047
+ # Try to save whatever we have
1048
+ try:
1049
+ self.model.save_pretrained(f"{self.training_args.output_dir}/checkpoint_emergency")
1050
+ self.tokenizer.save_pretrained(f"{self.training_args.output_dir}/checkpoint_emergency")
1051
+ print(f"💾 Emergency checkpoint saved to: {self.training_args.output_dir}/checkpoint_emergency")
1052
+ except:
1053
+ print("❌ Could not save emergency checkpoint")
1054
+
1055
+ raise e
1056
+ finally:
1057
+ # Ensure WandB run is finished
1058
+ if self.wandb_enabled:
1059
+ wandb.finish()
1060
+
1061
+ # def test_model(model_path: str, tokenizer_path: str):
1062
+ # """Test the fine-tuned model with sample inputs"""
1063
+
1064
+ # print("\n" + "="*50)
1065
+ # print("Testing fine-tuned model...")
1066
+ # print("="*50)
1067
+
1068
+ # # Load model and tokenizer
1069
+ # from peft import PeftModel, PeftConfig
1070
+
1071
+ # tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
1072
+ # if tokenizer.pad_token is None:
1073
+ # tokenizer.pad_token = tokenizer.eos_token
1074
+
1075
+ # # Try to load as PEFT model
1076
+ # try:
1077
+ # config = PeftConfig.from_pretrained(model_path)
1078
+ # model = AutoModelForCausalLM.from_pretrained(
1079
+ # config.base_model_name_or_path,
1080
+ # torch_dtype=torch.float16,
1081
+ # device_map="auto"
1082
+ # )
1083
+ # model = PeftModel.from_pretrained(model, model_path)
1084
+ # except:
1085
+ # # Load as regular model
1086
+ # model = AutoModelForCausalLM.from_pretrained(
1087
+ # model_path,
1088
+ # torch_dtype=torch.float16,
1089
+ # device_map="auto"
1090
+ # )
1091
+
1092
+ # model.eval()
1093
+
1094
+ # # Test inputs
1095
+ # test_cases = [
1096
+ # "こんにちは。最近ストレスを感じています。",
1097
+ # "仕事がうまくいかなくて悩んでいます。",
1098
+ # "人間関係で困っています。どうすればいいでしょうか。"
1099
+ # ]
1100
+
1101
+ # print("Sample conversations:")
1102
+ # print("-" * 50)
1103
+
1104
+ def test_model(model_path: str, tokenizer_path: str):
1105
+ """Test the fine-tuned model with sample inputs"""
1106
+
1107
+ print("\n" + "="*50)
1108
+ print("Testing fine-tuned model...")
1109
+ print("="*50)
1110
+
1111
+ # Load model and tokenizer with proper local path handling
1112
+ from peft import PeftModel, PeftConfig
1113
+ import os
1114
+
1115
+ # Fix tokenizer loading for local paths
1116
+ try:
1117
+ # Check if tokenizer files exist in the path
1118
+ if os.path.exists(os.path.join(tokenizer_path, "tokenizer_config.json")):
1119
+ print(f"Loading tokenizer from {tokenizer_path}")
1120
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
1121
+ else:
1122
+ print(f"Tokenizer not found at {tokenizer_path}, using base model tokenizer")
1123
+ # Fallback to base model tokenizer
1124
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
1125
+ except Exception as e:
1126
+ print(f"Error loading tokenizer: {e}")
1127
+ print("Using fallback GPT-2 tokenizer")
1128
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
1129
+
1130
+ if tokenizer.pad_token is None:
1131
+ tokenizer.pad_token = tokenizer.eos_token
1132
+
1133
+ # Try to load model
1134
+ try:
1135
+ # Check if it's a PEFT model
1136
+ adapter_config_path = os.path.join(model_path, "adapter_config.json")
1137
+ if os.path.exists(adapter_config_path):
1138
+ print("Loading as PEFT model...")
1139
+ config = PeftConfig.from_pretrained(model_path)
1140
+ base_model = AutoModelForCausalLM.from_pretrained(
1141
+ config.base_model_name_or_path,
1142
+ torch_dtype=torch.float16,
1143
+ device_map="auto",
1144
+ trust_remote_code=True
1145
+ )
1146
+ model = PeftModel.from_pretrained(base_model, model_path)
1147
+ else:
1148
+ # Load as regular model
1149
+ print("Loading as regular model...")
1150
+ model = AutoModelForCausalLM.from_pretrained(
1151
+ model_path,
1152
+ torch_dtype=torch.float16,
1153
+ device_map="auto",
1154
+ local_files_only=True,
1155
+ trust_remote_code=True
1156
+ )
1157
+ except Exception as e:
1158
+ print(f"Error loading model: {e}")
1159
+ raise
1160
+
1161
+ model.eval()
1162
+
1163
+ # Test inputs
1164
+ test_cases = [
1165
+ "こんにちは。最近ストレスを感じています。",
1166
+ "仕事がうまくいかなくて悩んでいます。",
1167
+ "人間関係で困っています。どうすればいいでしょうか。"
1168
+ ]
1169
+
1170
+ print("Sample conversations:")
1171
+ print("-" * 50)
1172
+
1173
+ for test_input in test_cases:
1174
+ # Generate response
1175
+ inputs = tokenizer(test_input, return_tensors="pt", truncation=True, max_length=512)
1176
+ inputs = {k: v.cuda() if torch.cuda.is_available() else v for k, v in inputs.items()}
1177
+
1178
+ with torch.no_grad():
1179
+ outputs = model.generate(
1180
+ **inputs,
1181
+ max_new_tokens=150,
1182
+ temperature=0.1,
1183
+ do_sample=True,
1184
+ top_p=0.9,
1185
+ pad_token_id=tokenizer.pad_token_id
1186
+ )
1187
+
1188
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
1189
+ response = response[len(test_input):].strip() # Remove input from response
1190
+
1191
+ print(f"Client: {test_input}")
1192
+ print(f"Counselor: {response[:200]}...")
1193
+ print("-" * 50)
1194
+
1195
+ print("="*50)
1196
+
1221
+ # Main training script
1222
+ if __name__ == "__main__":
1223
+ import argparse
1224
+
1225
+ parser = argparse.ArgumentParser(description='Fine-tune LFM model for counseling')
1226
+ parser.add_argument('--model_name', type=str, default='LiquidAI/LFM2-2.6B',
1227
+ help='Base model name')
1228
+ parser.add_argument('--data_path', type=str, default='./processed_data_score80',
1229
+ help='Path to processed data')
1230
+ parser.add_argument('--output_dir', type=str, default='./counselor_model_2b',
1231
+ help='Output directory for fine-tuned model')
1232
+ parser.add_argument('--use_4bit', action='store_true', default=False,
1233
+ help='Use 4-bit quantization')
1234
+ parser.add_argument('--wandb_api_key', type=str, default=None,
1235
+ help='WandB API key (optional, can use wandb login instead)')
1236
+ parser.add_argument('--test_only', action='store_true',
1237
+ help='Only test existing model')
1238
+
1239
+ args = parser.parse_args()
1240
+
1241
+ # Set WandB API key if provided
1242
+ if args.wandb_api_key:
1243
+ os.environ["WANDB_API_KEY"] = args.wandb_api_key
1244
+
1245
+ if args.test_only:
1246
+ # Test existing model
1247
+ test_model(
1248
+ f"{args.output_dir}/final_model_2b",
1249
+ f"{args.output_dir}/final_model_2b"
1250
+ )
1251
+ else:
1252
+ # Check if CUDA is available
1253
+ if not torch.cuda.is_available():
1254
+ print("⚠️ Warning: CUDA is not available. Training will be very slow on CPU.")
1255
+ print("It's highly recommended to use a GPU for training.")
1256
+ response = input("Do you want to continue anyway? (y/n): ")
1257
+ if response.lower() != 'y':
1258
+ exit()
1259
+
1260
+ try:
1261
+ # Clear GPU cache
1262
+ if torch.cuda.is_available():
1263
+ torch.cuda.empty_cache()
1264
+
1265
+ # Initialize fine-tuner (WandB is enabled by default)
1266
+ print(f"🚀 Initializing fine-tuner with model: {args.model_name}")
1267
+ finetuner = LFMCounselorFineTuner(
1268
+ model_name=args.model_name,
1269
+ use_4bit=args.use_4bit
1270
+ )
1271
+
1272
+ # Setup model
1273
+ print("\n🔧 Setting up model and tokenizer...")
1274
+ finetuner.setup_model_and_tokenizer()
1275
+
1276
+ # Load datasets
1277
+ print("\n📚 Loading and processing datasets...")
1278
+ finetuner.load_and_process_datasets(args.data_path)
1279
+
1280
+ # Setup training arguments
1281
+ print("\n⚙️ Setting up training arguments...")
1282
+ finetuner.setup_training_args(args.output_dir)
1283
+
1284
+ # Train
1285
+ trainer = finetuner.train()
1286
+
1287
+ # Test the model
1288
+ print("\n🧪 Testing the fine-tuned model...")
1289
+ test_model(
1290
+ f"{args.output_dir}/final_model_2b_v2",
1291
+ f"{args.output_dir}/final_model_2b_v2"
1292
+ )
1293
+
1294
+ print("\n✅ Fine-tuning completed successfully!")
1295
+ print(f"📁 Model saved to: {args.output_dir}/final_model_2b_v2")
1296
+ print("\n📋 Next steps:")
1297
+ print("1. Test more: python finetune_lfm.py --test_only")
1298
+ print("2. Run benchmarking: python benchmark_model.py")
1299
+ print("3. Optimize for mobile: python optimize_for_mobile.py")
1300
+
1301
+ except KeyboardInterrupt:
1302
+ print("\n\n⚠️ Training interrupted by user.")
1303
+ print("Partial model may be saved in checkpoints.")
1304
+ if wandb.run:
1305
+ wandb.finish()
1306
+ except Exception as e:
1307
+ print(f"\n❌ Error during fine-tuning: {e}")
1308
+ import traceback
1309
+ traceback.print_exc()
1310
+ if wandb.run:
1311
+ wandb.finish()
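For quick verification outside the training script, a minimal sketch of loading the saved LoRA adapter for inference. The adapter path and base model name follow the defaults used above (./counselor_model_2b/final_model_2b and LiquidAI/LFM2-2.6B); adjust them if your run used different values.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

adapter_path = "./counselor_model_2b/final_model_2b"  # default save location used above
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
base = AutoModelForCausalLM.from_pretrained(
    "LiquidAI/LFM2-2.6B",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base, adapter_path).eval()

prompt = "こんにちは。最近ストレスを感じています。"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(output[0], skip_special_tokens=True))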
finetune_lfm_complete_history.py ADDED
@@ -0,0 +1,801 @@
1
+ """
2
+ Fine-tuning Script for LFM2-2.6B with Complete Dialogue History
3
+ Following KokoroChat methodology - uses entire conversation context
4
+ Filename: finetune_lfm_complete_history.py
5
+ """
6
+
7
+ import torch
8
+ from transformers import (
9
+ AutoModelForCausalLM,
10
+ AutoTokenizer,
11
+ TrainingArguments,
12
+ Trainer,
13
+ DataCollatorForLanguageModeling,
14
+ BitsAndBytesConfig,
15
+ TrainerCallback
16
+ )
17
+ from peft import (
18
+ LoraConfig,
19
+ get_peft_model,
20
+ prepare_model_for_kbit_training,
21
+ TaskType,
22
+ PeftModel,
23
+ PeftConfig
24
+ )
25
+ from datasets import load_dataset, Dataset
26
+ import os
27
+ from typing import Dict, List, Optional
28
+ import numpy as np
29
+ from tqdm import tqdm
30
+ import json
31
+ import gc
32
+ import warnings
33
+ import wandb
34
+ from datetime import datetime
35
+
36
+ warnings.filterwarnings('ignore')
37
+
38
+ # Enable TF32 for H100 optimization
39
+ torch.backends.cuda.matmul.allow_tf32 = True
40
+ torch.backends.cudnn.allow_tf32 = True
41
+
42
+ class LFMKokoroChatFineTuner:
43
+ def __init__(
44
+ self,
45
+ model_name: str = "LiquidAI/LFM2-2.6B",
46
+ use_4bit: bool = False, # H100 has enough memory
47
+ max_seq_length: int = 2048 # Increased for complete dialogue history
48
+ ):
49
+ """
50
+ Initialize the fine-tuner for LFM models with complete dialogue history support
51
+
52
+ Args:
53
+ model_name: Name of the base model
54
+ use_4bit: Whether to use 4-bit quantization
55
+ max_seq_length: Maximum sequence length for complete dialogues
56
+ """
57
+ self.model_name = model_name
58
+ self.use_4bit = use_4bit
59
+ self.max_seq_length = max_seq_length
60
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
61
+
62
+ print("="*80)
63
+ print("🚀 LFM Fine-tuning with Complete Dialogue History (KokoroChat Method)")
64
+ print("="*80)
65
+ print(f"Model: {model_name}")
66
+ print(f"Device: {self.device}")
67
+ print(f"Max sequence length: {max_seq_length}")
68
+
69
+ # GPU information
70
+ if torch.cuda.is_available():
71
+ num_gpus = torch.cuda.device_count()
72
+ print(f"Number of GPUs: {num_gpus}")
73
+ for i in range(num_gpus):
74
+ gpu_name = torch.cuda.get_device_name(i)
75
+ gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1e9
76
+ print(f" GPU {i}: {gpu_name} ({gpu_memory:.2f} GB)")
77
+
78
+ # Initialize WandB
79
+ self.init_wandb()
80
+
81
+ def init_wandb(self):
82
+ """Initialize WandB for experiment tracking"""
83
+ try:
84
+ run_name = f"lfm-kokoro-complete-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
85
+
86
+ wandb.init(
87
+ project="lfm-kokoro-complete-history",
88
+ name=run_name,
89
+ config={
90
+ "model_name": self.model_name,
91
+ "use_4bit_quantization": self.use_4bit,
92
+ "max_seq_length": self.max_seq_length,
93
+ "device": str(self.device),
94
+ "num_gpus": torch.cuda.device_count() if torch.cuda.is_available() else 0,
95
+ "methodology": "Complete dialogue history (KokoroChat)",
96
+ "framework": "transformers + peft",
97
+ "task": "japanese_counseling"
98
+ },
99
+ tags=["counseling", "japanese", "lfm", "complete-history", "kokoro"]
100
+ )
101
+
102
+ print(f"✅ WandB initialized: {wandb.run.name}")
103
+ print(f"📊 View run at: {wandb.run.get_url()}")
104
+ self.wandb_enabled = True
105
+
106
+ except Exception as e:
107
+ print(f"⚠️ WandB initialization failed: {e}")
108
+ self.wandb_enabled = False
109
+ os.environ["WANDB_DISABLED"] = "true"
110
+
111
+ def setup_model_and_tokenizer(self):
112
+ """Setup model with quantization and LoRA"""
113
+
114
+ print("\n📚 Setting up model and tokenizer...")
115
+
116
+ # Load tokenizer
117
+ print("Loading tokenizer...")
118
+ try:
119
+ self.tokenizer = AutoTokenizer.from_pretrained(
120
+ self.model_name,
121
+ trust_remote_code=True
122
+ )
123
+ except:
124
+ print("Using fallback tokenizer...")
125
+ self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
126
+
127
+ # Set special tokens
128
+ if self.tokenizer.pad_token is None:
129
+ self.tokenizer.pad_token = self.tokenizer.eos_token
130
+ if self.tokenizer.eos_token is None:
131
+ self.tokenizer.eos_token = "</s>"
132
+ self.tokenizer.pad_token = "</s>"
133
+
134
+ self.tokenizer.padding_side = "left" # Important for batch generation
135
+
136
+ # Quantization config
137
+ if self.use_4bit:
138
+ print("Setting up 4-bit quantization...")
139
+ bnb_config = BitsAndBytesConfig(
140
+ load_in_4bit=True,
141
+ bnb_4bit_quant_type="nf4",
142
+ bnb_4bit_compute_dtype=torch.bfloat16, # BF16 for H100
143
+ bnb_4bit_use_double_quant=True
144
+ )
145
+ else:
146
+ bnb_config = None
147
+
148
+ # Load model
149
+ print(f"Loading model: {self.model_name}...")
150
+ model_kwargs = {
151
+ "trust_remote_code": True,
152
+ "torch_dtype": torch.bfloat16, # BF16 for H100
153
+ "device_map": "auto",
154
+ }
155
+
156
+ if bnb_config:
157
+ model_kwargs["quantization_config"] = bnb_config
158
+
159
+ try:
160
+ self.model = AutoModelForCausalLM.from_pretrained(
161
+ self.model_name,
162
+ **model_kwargs
163
+ )
164
+ except Exception as e:
165
+ print(f"Error loading model: {e}")
166
+ print("Attempting without device_map...")
167
+ model_kwargs.pop("device_map", None)
168
+ self.model = AutoModelForCausalLM.from_pretrained(
169
+ self.model_name,
170
+ **model_kwargs
171
+ )
172
+ self.model = self.model.to(self.device)
173
+
174
+ # Enable gradient checkpointing
175
+ if hasattr(self.model, 'gradient_checkpointing_enable'):
176
+ self.model.gradient_checkpointing_enable()
177
+
178
+ # Prepare for k-bit training if using quantization
179
+ if self.use_4bit:
180
+ print("Preparing model for 4-bit training...")
181
+ self.model = prepare_model_for_kbit_training(self.model)
182
+
183
+ # LoRA configuration optimized for dialogue with complete history
184
+ print("Applying LoRA configuration...")
185
+
186
+ # Find target modules
187
+ target_modules = self.find_target_modules()
188
+
189
+ # Higher rank for complex dialogue understanding
190
+ lora_config = LoraConfig(
191
+ r=64, # Increased for better dialogue understanding
192
+ lora_alpha=128,
193
+ target_modules=target_modules,
194
+ lora_dropout=0.05,
195
+ bias="none",
196
+ task_type=TaskType.CAUSAL_LM,
197
+ inference_mode=False
198
+ )
199
+
200
+ # Apply LoRA
201
+ self.model = get_peft_model(self.model, lora_config)
202
+
203
+ # Print trainable parameters
204
+ trainable_params = 0
205
+ all_params = 0
206
+ for _, param in self.model.named_parameters():
207
+ all_params += param.numel()
208
+ if param.requires_grad:
209
+ trainable_params += param.numel()
210
+
211
+ trainable_percentage = 100 * trainable_params / all_params if all_params > 0 else 0
212
+
213
+ print(f"Trainable parameters: {trainable_params:,} / {all_params:,} ({trainable_percentage:.2f}%)")
214
+
215
+ # Log to WandB
216
+ if self.wandb_enabled:
217
+ wandb.config.update({
218
+ "lora_r": lora_config.r,
219
+ "lora_alpha": lora_config.lora_alpha,
220
+ "lora_dropout": lora_config.lora_dropout,
221
+ "lora_target_modules": target_modules,
222
+ "total_parameters": all_params,
223
+ "trainable_parameters": trainable_params,
224
+ "trainable_percentage": trainable_percentage
225
+ })
226
+
227
+ self.model.print_trainable_parameters()
228
+
229
+ def find_target_modules(self):
230
+ """Find linear modules to apply LoRA to"""
231
+ target_modules = []
232
+ for name, module in self.model.named_modules():
233
+ if isinstance(module, torch.nn.Linear):
234
+ names = name.split('.')
235
+ if len(names) > 0:
236
+ target_modules.append(names[-1])
237
+
238
+ # Remove duplicates
239
+ target_modules = list(set(target_modules))
240
+
241
+ # Common patterns for transformer models
242
+ common_targets = ["q_proj", "v_proj", "k_proj", "o_proj",
243
+ "gate_proj", "up_proj", "down_proj",
244
+ "fc1", "fc2", "query", "key", "value", "dense"]
245
+
246
+ # Filter to common targets
247
+ final_targets = [t for t in target_modules if any(ct in t.lower() for ct in common_targets)]
248
+
249
+ if not final_targets:
250
+ # Fallback to specific modules for LFM
251
+ final_targets = ["q_proj", "v_proj", "k_proj", "o_proj"]
252
+
253
+ print(f"LoRA target modules: {final_targets}")
254
+ return final_targets
255
+
256
+ def load_and_process_datasets(self, data_path: str):
257
+ """
258
+ Load and process datasets with complete dialogue history
259
+ Handles the new data format with full conversation context
260
+ """
261
+
262
+ print(f"\n📚 Loading datasets from {data_path}...")
263
+
264
+ # Check for dataset statistics
265
+ stats_file = os.path.join(data_path, 'dataset_stats.json')
266
+ if os.path.exists(stats_file):
267
+ with open(stats_file, 'r') as f:
268
+ stats = json.load(f)
269
+ print("Dataset statistics:")
270
+ print(f" Average dialogue history: {stats['dialogue_history_stats']['mean_length']:.1f} turns")
271
+ print(f" Max dialogue history: {stats['dialogue_history_stats']['max_length']} turns")
272
+ print(f" Median dialogue history: {stats['dialogue_history_stats']['median_length']:.1f} turns")
273
+
274
+ # Load datasets
275
+ train_data = []
276
+ val_data = []
277
+
278
+ # Load training data
279
+ train_file = os.path.join(data_path, 'train.jsonl')
280
+ with open(train_file, 'r', encoding='utf-8') as f:
281
+ for line in tqdm(f, desc="Loading training data"):
282
+ item = json.loads(line)
283
+ train_data.append({
284
+ 'text': item['text'],
285
+ 'history_length': item.get('history_length', 0),
286
+ 'score': item.get('score', 100),
287
+ 'topic': item.get('topic', 'general')
288
+ })
289
+
290
+ # Load validation data
291
+ val_file = os.path.join(data_path, 'val.jsonl')
292
+ with open(val_file, 'r', encoding='utf-8') as f:
293
+ for line in tqdm(f, desc="Loading validation data"):
294
+ item = json.loads(line)
295
+ val_data.append({
296
+ 'text': item['text'],
297
+ 'history_length': item.get('history_length', 0),
298
+ 'score': item.get('score', 100),
299
+ 'topic': item.get('topic', 'general')
300
+ })
301
+
302
+ print(f"Loaded {len(train_data)} training examples")
303
+ print(f"Loaded {len(val_data)} validation examples")
304
+
305
+ # Analyze dialogue history lengths
306
+ train_history_lengths = [d['history_length'] for d in train_data]
307
+ val_history_lengths = [d['history_length'] for d in val_data]
308
+
309
+ print(f"\nDialogue history length distribution:")
310
+ print(f" Training - Mean: {np.mean(train_history_lengths):.1f}, Max: {max(train_history_lengths)}")
311
+ print(f" Validation - Mean: {np.mean(val_history_lengths):.1f}, Max: {max(val_history_lengths)}")
312
+
313
+ # Log to WandB
314
+ if self.wandb_enabled:
315
+ wandb.config.update({
316
+ "train_examples": len(train_data),
317
+ "val_examples": len(val_data),
318
+ "avg_train_history_length": float(np.mean(train_history_lengths)),
319
+ "max_train_history_length": int(max(train_history_lengths)),
320
+ "avg_val_history_length": float(np.mean(val_history_lengths)),
321
+ "max_val_history_length": int(max(val_history_lengths))
322
+ })
323
+
324
+ # Log history length distribution
325
+ wandb.log({
326
+ "train_history_distribution": wandb.Histogram(train_history_lengths),
327
+ "val_history_distribution": wandb.Histogram(val_history_lengths)
328
+ })
329
+
330
+ # Tokenize datasets
331
+ print("\nTokenizing datasets with complete dialogue history...")
332
+ print(f"Using max sequence length: {self.max_seq_length}")
333
+
334
+ # Extract texts for tokenization
335
+ train_texts = [d['text'] for d in train_data]
336
+ val_texts = [d['text'] for d in val_data]
337
+
338
+ # Tokenize with longer context for complete history
339
+ train_encodings = self.tokenize_texts(train_texts, desc="Tokenizing training data")
340
+ val_encodings = self.tokenize_texts(val_texts, desc="Tokenizing validation data")
341
+
342
+ # Create datasets
343
+ self.train_dataset = Dataset.from_dict(train_encodings)
344
+ self.val_dataset = Dataset.from_dict(val_encodings)
345
+
346
+ # Set format for PyTorch
347
+ self.train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
348
+ self.val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
349
+
350
+ # Clean up memory
351
+ del train_texts, val_texts, train_encodings, val_encodings, train_data, val_data
352
+ gc.collect()
353
+
354
+ print("✅ Datasets loaded and tokenized")
355
+
356
+ def tokenize_texts(self, texts: List[str], batch_size: int = 50, desc: str = "Tokenizing"):
357
+ """
358
+ Tokenize texts in batches with support for longer sequences
359
+ """
360
+ all_input_ids = []
361
+ all_attention_masks = []
362
+
363
+ # Process in smaller batches for long sequences
364
+ for i in tqdm(range(0, len(texts), batch_size), desc=desc):
365
+ batch_texts = texts[i:i + batch_size]
366
+
367
+ # Tokenize batch with longer max length
368
+ encodings = self.tokenizer(
369
+ batch_texts,
370
+ truncation=True,
371
+ padding='max_length',
372
+ max_length=self.max_seq_length,
373
+ return_tensors='pt'
374
+ )
375
+
376
+ # Convert to lists
377
+ all_input_ids.extend(encodings['input_ids'].tolist())
378
+ all_attention_masks.extend(encodings['attention_mask'].tolist())
379
+
380
+ # Create labels (same as input_ids for causal LM)
381
+ labels = all_input_ids.copy()
382
+
383
+ return {
384
+ 'input_ids': all_input_ids,
385
+ 'attention_mask': all_attention_masks,
386
+ 'labels': labels
387
+ }
388
+
389
+ def setup_training_args(self, output_dir: str = "./lfm_kokoro_complete"):
390
+ """Setup training arguments optimized for complete dialogue history"""
391
+
392
+ print("\n⚙️ Setting up training arguments...")
393
+
394
+ # Calculate batch sizes based on sequence length and GPU memory
395
+ if torch.cuda.is_available():
396
+ gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
397
+ num_gpus = torch.cuda.device_count()
398
+
399
+ # Adjust batch size based on sequence length and GPU memory
400
+ if self.max_seq_length >= 2048:
401
+ if gpu_memory >= 80: # H100 80GB
402
+ batch_size = 4
403
+ gradient_accumulation = 4
404
+ elif gpu_memory >= 40:
405
+ batch_size = 2
406
+ gradient_accumulation = 8
407
+ else:
408
+ batch_size = 1
409
+ gradient_accumulation = 16
410
+ else:
411
+ batch_size = 8
412
+ gradient_accumulation = 2
413
+
414
+ # Adjust for multiple GPUs
415
+ if num_gpus > 1:
416
+ batch_size = batch_size * num_gpus
417
+ gradient_accumulation = max(1, gradient_accumulation // num_gpus)
418
+ else:
419
+ batch_size = 1
420
+ gradient_accumulation = 32
421
+
422
+ print(f"Batch configuration:")
423
+ print(f" Per device batch size: {batch_size}")
424
+ print(f" Gradient accumulation steps: {gradient_accumulation}")
425
+ print(f" Effective batch size: {batch_size * gradient_accumulation}")
426
+
427
+ # Update WandB config
428
+ if self.wandb_enabled:
429
+ wandb.config.update({
430
+ "batch_size": batch_size,
431
+ "gradient_accumulation_steps": gradient_accumulation,
432
+ "effective_batch_size": batch_size * gradient_accumulation,
433
+ "num_epochs": 3,
434
+ "learning_rate": 2e-4,
435
+ "warmup_ratio": 0.1,
436
+ "weight_decay": 0.01,
437
+ "max_grad_norm": 1.0,
438
+ "lr_scheduler": "cosine",
439
+ "optimizer": "adamw_torch"
440
+ })
441
+
442
+ self.training_args = TrainingArguments(
443
+ output_dir=output_dir,
444
+ num_train_epochs=3,
445
+ per_device_train_batch_size=batch_size,
446
+ per_device_eval_batch_size=batch_size,
447
+ gradient_accumulation_steps=gradient_accumulation,
448
+ gradient_checkpointing=True,
449
+ warmup_ratio=0.1,
450
+ learning_rate=2e-4,
451
+ bf16=True, # Use BF16 for H100
452
+ tf32=True, # Enable TF32 for H100
453
+ logging_steps=10,
454
+ logging_first_step=True,
455
+ eval_strategy="steps",
456
+ eval_steps=100,
457
+ save_strategy="steps",
458
+ save_steps=200,
459
+ save_total_limit=3,
460
+ load_best_model_at_end=True,
461
+ metric_for_best_model="eval_loss",
462
+ greater_is_better=False,
463
+ report_to="wandb" if self.wandb_enabled else "none",
464
+ run_name=wandb.run.name if self.wandb_enabled and wandb.run else "local_run",
465
+ optim="adamw_torch",
466
+ lr_scheduler_type="cosine",
467
+ weight_decay=0.01,
468
+ max_grad_norm=1.0,
469
+ remove_unused_columns=False,
470
+ label_names=["labels"],
471
+ dataloader_num_workers=4,
472
+ dataloader_pin_memory=True,
473
+ ddp_find_unused_parameters=False if torch.cuda.device_count() > 1 else None,
474
+ )
475
+
476
+ def train(self):
477
+ """Execute training with complete dialogue history"""
478
+
479
+ print("\n🎯 Starting training with complete dialogue history...")
480
+
481
+ # Data collator
482
+ data_collator = DataCollatorForLanguageModeling(
483
+ tokenizer=self.tokenizer,
484
+ mlm=False,
485
+ pad_to_multiple_of=8
486
+ )
487
+
488
+ # Custom callback for metrics
489
+ class MetricsCallback(TrainerCallback):
490
+ def __init__(self, wandb_enabled):
491
+ self.wandb_enabled = wandb_enabled
492
+
493
+ def on_log(self, args, state, control, logs=None, **kwargs):
494
+ if logs and self.wandb_enabled:
495
+ # Add perplexity
496
+ if "loss" in logs:
497
+ logs["perplexity"] = np.exp(logs["loss"])
498
+ if "eval_loss" in logs:
499
+ logs["eval_perplexity"] = np.exp(logs["eval_loss"])
500
+
501
+ # Log to WandB
502
+ wandb.log(logs, step=state.global_step)
503
+
504
+ return control
505
+
506
+ # Initialize trainer
507
+ trainer = Trainer(
508
+ model=self.model,
509
+ args=self.training_args,
510
+ train_dataset=self.train_dataset,
511
+ eval_dataset=self.val_dataset,
512
+ data_collator=data_collator,
513
+ tokenizer=self.tokenizer,
514
+ callbacks=[MetricsCallback(self.wandb_enabled)] if self.wandb_enabled else [],
515
+ )
516
+
517
+ # Calculate total steps
518
+ total_steps = len(self.train_dataset) // (
519
+ self.training_args.per_device_train_batch_size *
520
+ self.training_args.gradient_accumulation_steps
521
+ ) * self.training_args.num_train_epochs
522
+
523
+ print("="*60)
524
+ print("Training Information:")
525
+ print(f" Total training samples: {len(self.train_dataset)}")
526
+ print(f" Total validation samples: {len(self.val_dataset)}")
527
+ print(f" Total training steps: {total_steps}")
528
+ print(f" Max sequence length: {self.max_seq_length}")
529
+ print("="*60)
530
+
531
+ # Log training start
532
+ if self.wandb_enabled:
533
+ wandb.log({
534
+ "training_status": "started",
535
+ "total_steps": total_steps,
536
+ "max_seq_length": self.max_seq_length
537
+ })
538
+
539
+ try:
540
+ # Train
541
+ print("\n🚀 Training started...")
542
+ train_result = trainer.train()
543
+
544
+ # Save model
545
+ print("\n💾 Saving fine-tuned model...")
546
+ final_model_path = os.path.join(self.training_args.output_dir, "final_model")
547
+ trainer.save_model(final_model_path)
548
+ self.tokenizer.save_pretrained(final_model_path)
549
+
550
+ # Save training metrics
551
+ with open(os.path.join(self.training_args.output_dir, "training_metrics.json"), 'w') as f:
552
+ json.dump(train_result.metrics, f, indent=2)
553
+
554
+ # Final evaluation
555
+ print("\n📊 Running final evaluation...")
556
+ eval_results = trainer.evaluate()
557
+
558
+ # Save evaluation metrics
559
+ with open(os.path.join(self.training_args.output_dir, "eval_metrics.json"), 'w') as f:
560
+ json.dump(eval_results, f, indent=2)
561
+
562
+ # Log final metrics
563
+ if self.wandb_enabled:
564
+ wandb.run.summary.update({
565
+ "final_train_loss": train_result.metrics.get("train_loss", 0),
566
+ "final_eval_loss": eval_results.get("eval_loss", 0),
567
+ "final_eval_perplexity": np.exp(eval_results.get("eval_loss", 0)),
568
+ "total_training_time": train_result.metrics.get("train_runtime", 0),
569
+ "training_samples_per_second": train_result.metrics.get("train_samples_per_second", 0),
570
+ "training_status": "completed"
571
+ })
572
+
573
+ # Save model artifact
574
+ artifact = wandb.Artifact(
575
+ name=f"kokoro-model-complete-{wandb.run.id}",
576
+ type="model",
577
+ description="LFM model fine-tuned with complete dialogue history",
578
+ metadata={
579
+ "base_model": self.model_name,
580
+ "final_loss": float(eval_results.get("eval_loss", 0)),
581
+ "final_perplexity": float(np.exp(eval_results.get("eval_loss", 0))),
582
+ "max_seq_length": self.max_seq_length,
583
+ "methodology": "Complete dialogue history (KokoroChat)"
584
+ }
585
+ )
586
+ artifact.add_dir(final_model_path)
587
+ wandb.log_artifact(artifact)
588
+
589
+ print("\n" + "="*60)
590
+ print("✅ Training completed successfully!")
591
+ print(f"📁 Model saved to: {final_model_path}")
592
+ print(f"📉 Final eval loss: {eval_results.get('eval_loss', 0):.4f}")
593
+ print(f"📊 Final perplexity: {np.exp(eval_results.get('eval_loss', 0)):.2f}")
594
+ if self.wandb_enabled and wandb.run:
595
+ print(f"🔗 View results at: {wandb.run.get_url()}")
596
+ print("="*60)
597
+
598
+ return trainer
599
+
600
+ except Exception as e:
601
+ print(f"❌ Error during training: {e}")
602
+
603
+ if self.wandb_enabled:
604
+ wandb.run.summary["training_status"] = "failed"
605
+ wandb.run.summary["error"] = str(e)
606
+
607
+ # Save emergency checkpoint
608
+ try:
609
+ emergency_path = os.path.join(self.training_args.output_dir, "emergency_checkpoint")
610
+ self.model.save_pretrained(emergency_path)
611
+ self.tokenizer.save_pretrained(emergency_path)
612
+ print(f"💾 Emergency checkpoint saved to: {emergency_path}")
613
+ except:
614
+ print("❌ Could not save emergency checkpoint")
615
+
616
+ raise e
617
+
618
+ finally:
619
+ if self.wandb_enabled:
620
+ wandb.finish()
621
+
622
+ def test_model_with_complete_history(model_path: str):
623
+ """Test the fine-tuned model with complete dialogue history examples"""
624
+
625
+ print("\n" + "="*60)
626
+ print("🧪 Testing model with complete dialogue history")
627
+ print("="*60)
628
+
629
+ # Load tokenizer and model
630
+ tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
631
+
632
+ # Check if it's a PEFT model
633
+ adapter_config_path = os.path.join(model_path, "adapter_config.json")
634
+ if os.path.exists(adapter_config_path):
635
+ print("Loading as PEFT model...")
636
+ config = PeftConfig.from_pretrained(model_path)
637
+ base_model = AutoModelForCausalLM.from_pretrained(
638
+ config.base_model_name_or_path,
639
+ torch_dtype=torch.bfloat16,
640
+ device_map="auto",
641
+ trust_remote_code=True
642
+ )
643
+ model = PeftModel.from_pretrained(base_model, model_path)
644
+ else:
645
+ print("Loading as regular model...")
646
+ model = AutoModelForCausalLM.from_pretrained(
647
+ model_path,
648
+ torch_dtype=torch.bfloat16,
649
+ device_map="auto",
650
+ local_files_only=True,
651
+ trust_remote_code=True
652
+ )
653
+
654
+ model.eval()
655
+
656
+ # Test with dialogue history examples
657
+ test_cases = [
658
+ {
659
+ "history": "クライアント: こんにちは。最近ストレスを感じています。\nカウンセラー: こんにちは。ストレスを感じていらっしゃるのですね。どのような状況でストレスを感じることが多いですか?\n",
660
+ "current": "クライアント: 仕事が忙しくて、休む時間がありません。"
661
+ },
662
+ {
663
+ "history": "",
664
+ "current": "クライアント: 人間関係で悩んでいます。"
665
+ }
666
+ ]
667
+
668
+ print("Testing with complete dialogue history:\n")
669
+
670
+ for i, test_case in enumerate(test_cases, 1):
671
+ print(f"Test Case {i}:")
672
+ print("-" * 40)
673
+
674
+ # Format input with complete history
675
+ if test_case["history"]:
676
+ prompt = f"""### Instruction:
677
+ あなたは専門的な訓練を受けた心理カウンセラーです。
678
+ 以下の完全な対話履歴を踏まえて、カウンセラーとして適切な応答を生成してください。
679
+
680
+ ### Dialogue History:
681
+ {test_case["history"]}{test_case["current"]}
682
+
683
+ ### Response:
684
+ """
685
+ else:
686
+ prompt = f"""### Instruction:
687
+ あなたは専門的な訓練を受けた心理カウンセラーです。
688
+
689
+ ### Dialogue History:
690
+ {test_case["current"]}
691
+
692
+ ### Response:
693
+ """
694
+
695
+ # Generate response
696
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
697
+ inputs = {k: v.cuda() if torch.cuda.is_available() else v for k, v in inputs.items()}
698
+
699
+ with torch.no_grad():
700
+ outputs = model.generate(
701
+ **inputs,
702
+ max_new_tokens=150,
703
+ temperature=0.1,  # sampling requires a strictly positive temperature
704
+ do_sample=True,
705
+ top_p=0.9,
706
+ pad_token_id=tokenizer.pad_token_id
707
+ )
708
+
709
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
710
+ response = response.split("### Response:")[-1].strip() if "### Response:" in response else response
711
+
712
+ # print(f"History Length: {len(test_case['history'].split('\\n')) if test_case['history'] else 0} turns")
713
+ print("History Length: {} turns".format(len(test_case['history'].split('\\n')) if test_case['history'] else 0))
714
+
715
+ print(f"Current Input: {test_case['current']}")
716
+ print(f"Generated Response: {response[:300]}...")
717
+ print()
718
+
719
+ print("="*60)
720
+
721
+ # Main execution
722
+ if __name__ == "__main__":
723
+ import argparse
724
+
725
+ parser = argparse.ArgumentParser(description='Fine-tune LFM model with complete dialogue history')
726
+ parser.add_argument('--model_name', type=str, default='LiquidAI/LFM2-2.6B',
727
+ help='Base model name')
728
+ parser.add_argument('--data_path', type=str, default='./kokoro_processed_data',
729
+ help='Path to processed data with complete dialogue history')
730
+ parser.add_argument('--output_dir', type=str, default='./lfm_kokoro_complete',
731
+ help='Output directory for fine-tuned model')
732
+ parser.add_argument('--max_seq_length', type=int, default=2048,
733
+ help='Maximum sequence length for complete dialogues')
734
+ parser.add_argument('--use_4bit', action='store_true',
735
+ help='Use 4-bit quantization')
736
+ parser.add_argument('--test_only', action='store_true',
737
+ help='Only test existing model')
738
+
739
+ args = parser.parse_args()
740
+
741
+ if args.test_only:
742
+ # Test existing model
743
+ test_model_with_complete_history(
744
+ os.path.join(args.output_dir, "final_model")
745
+ )
746
+ else:
747
+ # Check CUDA availability
748
+ if not torch.cuda.is_available():
749
+ print("⚠️ Warning: CUDA is not available. Training will be slow.")
750
+ response = input("Continue? (y/n): ")
751
+ if response.lower() != 'y':
752
+ exit()
753
+
754
+ try:
755
+ # Clear GPU cache
756
+ if torch.cuda.is_available():
757
+ torch.cuda.empty_cache()
758
+
759
+ # Initialize fine-tuner
760
+ print(f"🚀 Initializing fine-tuner for complete dialogue history")
761
+ finetuner = LFMKokoroChatFineTuner(
762
+ model_name=args.model_name,
763
+ use_4bit=args.use_4bit,
764
+ max_seq_length=args.max_seq_length
765
+ )
766
+
767
+ # Setup model
768
+ finetuner.setup_model_and_tokenizer()
769
+
770
+ # Load datasets
771
+ finetuner.load_and_process_datasets(args.data_path)
772
+
773
+ # Setup training arguments
774
+ finetuner.setup_training_args(args.output_dir)
775
+
776
+ # Train
777
+ trainer = finetuner.train()
778
+
779
+ # Test the model
780
+ print("\n🧪 Testing the fine-tuned model...")
781
+ test_model_with_complete_history(
782
+ os.path.join(args.output_dir, "final_model")
783
+ )
784
+
785
+ print("\n✅ Fine-tuning with complete dialogue history completed!")
786
+ print(f"📁 Model saved to: {args.output_dir}/final_model")
787
+ print("\n📋 Next steps:")
788
+ print(f"1. Test more: python {__file__} --test_only --output_dir {args.output_dir}")
789
+ print("2. Run benchmarking with complete history support")
790
+ print("3. Deploy for production use")
791
+
792
+ except KeyboardInterrupt:
793
+ print("\n\n⚠️ Training interrupted by user.")
794
+ if wandb.run:
795
+ wandb.finish()
796
+ except Exception as e:
797
+ print(f"\n❌ Error: {e}")
798
+ import traceback
799
+ traceback.print_exc()
800
+ if wandb.run:
801
+ wandb.finish()
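The test function above assumes training examples were rendered into an "### Instruction / ### Dialogue History / ### Response" prompt. The actual rendering lives in preprocess_kokoro_method.py; the sketch below only illustrates the assumed format, with a hypothetical helper name and sample turns:

def build_example(turns, counselor_reply):
    """Render one training example; `turns` is a list of (speaker, utterance) pairs."""
    history = "".join(f"{speaker}: {utterance}\n" for speaker, utterance in turns)
    return (
        "### Instruction:\n"
        "あなたは専門的な訓練を受けた心理カウンセラーです。\n"
        "以下の完全な対話履歴を踏まえて、カウンセラーとして適切な応答を生成してください。\n\n"
        "### Dialogue History:\n"
        f"{history}\n"
        "### Response:\n"
        f"{counselor_reply}"
    )

example_text = build_example(
    [("クライアント", "最近ストレスを感じています。"),
     ("カウンセラー", "どのような状況でストレスを感じることが多いですか?")],
    "お仕事の場面が多いのですね。もう少し詳しく聞かせてください。",
)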
finetune_trl_supervised.py ADDED
@@ -0,0 +1,221 @@
1
+ """
2
+ Minimal Working Fine-tuning Script - No Complex Dependencies
3
+ Filename: finetune_minimal.py
4
+ """
5
+
6
+ import torch
7
+ import os
8
+ import json
9
+ from torch.utils.data import Dataset, DataLoader
10
+ from tqdm import tqdm
11
+ import numpy as np
12
+
13
+ # Fix the import issues by reinstalling
14
+ import subprocess
15
+ import sys
16
+
17
+ def fix_environment():
18
+ """Fix the broken environment"""
19
+ print("Fixing environment...")
20
+ subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "torchvision"], check=False)
21
+ subprocess.run([sys.executable, "-m", "pip", "install", "--no-deps", "transformers==4.36.0"], check=False)
22
+ subprocess.run([sys.executable, "-m", "pip", "install", "peft==0.7.0", "accelerate==0.25.0"], check=False)
23
+
24
+ # Uncomment if needed
25
+ # fix_environment()
26
+
27
+ # Now import after fixing
28
+ from transformers import AutoModelForCausalLM, AutoTokenizer
29
+ from peft import LoraConfig, get_peft_model, TaskType
30
+
31
+ class SimpleDataset(Dataset):
32
+ def __init__(self, data_path, tokenizer, max_length=1024):
33
+ self.data = []
34
+ with open(data_path, 'r') as f:
35
+ for line in f:
36
+ item = json.loads(line)
37
+ self.data.append(item['text'])
38
+
39
+ self.tokenizer = tokenizer
40
+ self.max_length = max_length
41
+
42
+ def __len__(self):
43
+ return len(self.data)
44
+
45
+ def __getitem__(self, idx):
46
+ text = self.data[idx]
47
+ encoded = self.tokenizer(
48
+ text,
49
+ truncation=True,
50
+ padding='max_length',
51
+ max_length=self.max_length,
52
+ return_tensors='pt'
53
+ )
54
+ return {
55
+ 'input_ids': encoded['input_ids'].squeeze(),
56
+ 'attention_mask': encoded['attention_mask'].squeeze()
57
+ }
58
+
59
+ def train_simple():
60
+ """Simple training without complex dependencies"""
61
+
62
+ # Configuration
63
+ model_name = "LiquidAI/LFM2-2.6B"
64
+ data_dir = "./kokoro_processed_data"
65
+ output_dir = "./lfm_minimal_output"
66
+ batch_size = 4
67
+ learning_rate = 2e-4
68
+ num_epochs = 2
69
+ max_length = 1024
70
+
71
+ os.makedirs(output_dir, exist_ok=True)
72
+
73
+ print("="*60)
74
+ print("Minimal Fine-tuning Script")
75
+ print("="*60)
76
+
77
+ # Device
78
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
79
+ print(f"Device: {device}")
80
+
81
+ # Load tokenizer
82
+ print("Loading tokenizer...")
83
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
84
+ if tokenizer.pad_token is None:
85
+ tokenizer.pad_token = tokenizer.eos_token
86
+
87
+ # Load model
88
+ print("Loading model...")
89
+ model = AutoModelForCausalLM.from_pretrained(
90
+ model_name,
91
+ torch_dtype=torch.bfloat16,
92
+ device_map="auto",
93
+ trust_remote_code=True
94
+ )
95
+
96
+ # Apply LoRA
97
+ print("Applying LoRA...")
98
+ peft_config = LoraConfig(
99
+ r=32,
100
+ lora_alpha=64,
101
+ target_modules=["q_proj", "v_proj"],
102
+ lora_dropout=0.05,
103
+ bias="none",
104
+ task_type=TaskType.CAUSAL_LM
105
+ )
106
+
107
+ model = get_peft_model(model, peft_config)
108
+ model.print_trainable_parameters()
109
+
110
+ # Load dataset
111
+ print("Loading dataset...")
112
+ train_dataset = SimpleDataset(
113
+ os.path.join(data_dir, "train.jsonl"),
114
+ tokenizer,
115
+ max_length
116
+ )
117
+
118
+ train_loader = DataLoader(
119
+ train_dataset,
120
+ batch_size=batch_size,
121
+ shuffle=True
122
+ )
123
+
124
+ # Optimizer
125
+ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
126
+
127
+ # Training loop
128
+ print(f"\nStarting training for {num_epochs} epochs...")
129
+ model.train()
130
+
131
+ global_step = 0
132
+ for epoch in range(num_epochs):
133
+ print(f"\nEpoch {epoch+1}/{num_epochs}")
134
+
135
+ total_loss = 0
136
+ progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
137
+
138
+ for batch in progress_bar:
139
+ global_step += 1
140
+
141
+ # Move to device
142
+ input_ids = batch['input_ids'].to(device)
143
+ attention_mask = batch['attention_mask'].to(device)
144
+
145
+ # Forward pass
146
+ outputs = model(
147
+ input_ids=input_ids,
148
+ attention_mask=attention_mask,
149
+ labels=input_ids
150
+ )
151
+
152
+ loss = outputs.loss
153
+ total_loss += loss.item()
154
+
155
+ # Backward pass
156
+ loss.backward()
157
+
158
+ # Update weights every 4 steps (gradient accumulation)
159
+ if global_step % 4 == 0:
160
+ optimizer.step()
161
+ optimizer.zero_grad()
162
+
163
+ # Update progress bar
164
+ progress_bar.set_postfix({'loss': loss.item()})
165
+
166
+ # Save checkpoint
167
+ if global_step % 500 == 0:
168
+ print(f"\nSaving checkpoint at step {global_step}...")
169
+ model.save_pretrained(os.path.join(output_dir, f"checkpoint-{global_step}"))
170
+ tokenizer.save_pretrained(os.path.join(output_dir, f"checkpoint-{global_step}"))
171
+
172
+ avg_loss = total_loss / len(train_loader)
173
+ print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")
174
+
175
+ # Save final model
176
+ print("\nSaving final model...")
177
+ model.save_pretrained(os.path.join(output_dir, "final_model"))
178
+ tokenizer.save_pretrained(os.path.join(output_dir, "final_model"))
179
+
180
+ print(f"\n✅ Training complete! Model saved to {output_dir}/final_model")
181
+
182
+ # Test the model
183
+ print("\nTesting model...")
184
+ test_model(os.path.join(output_dir, "final_model"))
185
+
186
+ def test_model(model_path):
187
+ """Test the fine-tuned model"""
188
+
189
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
190
+ model = AutoModelForCausalLM.from_pretrained(
191
+ model_path,
192
+ torch_dtype=torch.bfloat16,
193
+ device_map="auto"
194
+ )
195
+
196
+ test_input = "最近ストレスを感じています。"
197
+ prompt = f"""### Instruction:
198
+ あなたは心理カウンセラーです。
199
+
200
+ ### Input:
201
+ {test_input}
202
+
203
+ ### Response:
204
+ """
205
+
206
+ inputs = tokenizer(prompt, return_tensors="pt")
207
+
208
+ with torch.no_grad():
209
+ outputs = model.generate(
210
+ inputs.input_ids.cuda(),
211
+ max_new_tokens=100,
212
+ temperature=0.7,
213
+ do_sample=True
214
+ )
215
+
216
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
217
+ print(f"\nTest Input: {test_input}")
218
+ print(f"Response: {response.split('### Response:')[-1].strip()}")
219
+
220
+ if __name__ == "__main__":
221
+ train_simple()
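One caveat in the minimal loop above: loss.backward() runs on every batch, but the optimizer only steps every 4 batches and the loss is never divided by the accumulation factor, so the accumulated gradient is roughly 4x larger than intended. A sketch of the usual normalization, reusing model, optimizer, train_loader and device from train_simple() above:

accumulation_steps = 4  # matches the `global_step % 4 == 0` check above

optimizer.zero_grad()
for step, batch in enumerate(train_loader, start=1):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
    # Scale so the summed gradients match one large batch of size batch_size * accumulation_steps
    (outputs.loss / accumulation_steps).backward()
    if step % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()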
merge_model.py ADDED
@@ -0,0 +1,74 @@
1
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
2
+ # from peft import PeftModel
3
+ # import torch
4
+
5
+ # print("Loading base model...")
6
+ # base_model = AutoModelForCausalLM.from_pretrained(
7
+ # "./models/LFM2-1.2B",
8
+ # torch_dtype=torch.bfloat16,
9
+ # device_map="auto",
10
+ # trust_remote_code=True
11
+ # )
12
+
13
+ # print("Loading LoRA adapters...")
14
+ # model = PeftModel.from_pretrained(base_model, "./counselor_model/final_model")
15
+
16
+ # print("Merging adapters with base model...")
17
+ # merged_model = model.merge_and_unload()
18
+
19
+ # print("Saving merged model...")
20
+ # merged_model.save_pretrained("./counselor_model-merged", safe_serialization=True)
21
+
22
+ # tokenizer = AutoTokenizer.from_pretrained("./models/LFM2-1.2B")
23
+ # tokenizer.save_pretrained("./counselor_model-merged")
24
+
25
+ # print("Model merge complete!")
26
+
27
+ import torch
28
+ from transformers import AutoModelForCausalLM, AutoTokenizer
29
+ from peft import PeftModel, PeftConfig
30
+ import os
31
+
32
+ def merge_and_save_model(
33
+ base_model_name: str = "LiquidAI/LFM2-2.6B",
34
+ adapter_path: str = "./lfm_minimal_output/final_model",
35
+ output_path: str = "./merged_counselor_minimal_2b"
36
+ ):
37
+ """
38
+ Properly merge LoRA weights with base model
39
+ """
40
+ print("Loading base model...")
41
+ # Load the base model
42
+ base_model = AutoModelForCausalLM.from_pretrained(
43
+ base_model_name,
44
+ torch_dtype=torch.float16,
45
+ device_map="auto",
46
+ trust_remote_code=True
47
+ )
48
+
49
+ print("Loading LoRA adapter...")
50
+ # Load the PEFT model (LoRA adapter)
51
+ model = PeftModel.from_pretrained(
52
+ base_model,
53
+ adapter_path,
54
+ torch_dtype=torch.float16,
55
+ )
56
+
57
+ print("Merging weights...")
58
+ # Merge LoRA weights with base model
59
+ model = model.merge_and_unload()
60
+
61
+ print(f"Saving merged model to {output_path}...")
62
+ # Save the merged model
63
+ model.save_pretrained(output_path)
64
+
65
+ # Also save the tokenizer
66
+ tokenizer = AutoTokenizer.from_pretrained(adapter_path)
67
+ tokenizer.save_pretrained(output_path)
68
+
69
+ print("✅ Model merged and saved successfully!")
70
+ return model, tokenizer
71
+
72
+ # Run the merge
73
+ if __name__ == "__main__":
74
+ merge_and_save_model()
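
A quick, hypothetical smoke test for the merged checkpoint produced above. The path mirrors the `output_path` default in `merge_and_save_model`; the prompt text and generation settings are illustrative only.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    merged_path = "./merged_counselor_minimal_2b"  # matches the output_path default above
    tokenizer = AutoTokenizer.from_pretrained(merged_path)
    model = AutoModelForCausalLM.from_pretrained(
        merged_path,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Illustrative prompt in the same "### Instruction / Input / Response" format used for training.
    prompt = "### Instruction:\nあなたは心理カウンセラーです。\n\n### Input:\n最近眠れません。\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=100, temperature=0.7, do_sample=True)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
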
preprocess_kokoro_method.py ADDED
@@ -0,0 +1,651 @@
1
+ """
2
+ Fixed Data Preprocessing for directory of JSON files with client-counselor dialogues
3
+ Following KokoroChat methodology with COMPLETE dialogue history
4
+ Filename: preprocess_kokoro_method.py
5
+ """
6
+
7
+ import json
8
+ import os
9
+ from typing import List, Dict, Tuple, Optional, Any
10
+ from tqdm import tqdm
11
+ import random
12
+ from collections import defaultdict
13
+ import numpy as np
14
+ from pathlib import Path
15
+ import glob
16
+
17
+ class KokoroChatDirectoryPreprocessor:
18
+ def __init__(self,
19
+ input_dir: str = "./raw_counseling_data",
20
+ output_dir: str = "./kokoro_processed_data",
21
+ min_score: int = 70,
22
+ train_ratio: float = 0.8,
23
+ val_ratio: float = 0.1,
24
+ test_ratio: float = 0.1):
25
+ """
26
+ Initialize preprocessor for directory of JSON files
27
+
28
+ Args:
29
+ input_dir: Directory containing JSON files with conversations
30
+ output_dir: Directory to save processed data
31
+ min_score: Minimum score threshold for filtering (if scores exist)
32
+ train_ratio: Ratio for training data
33
+ val_ratio: Ratio for validation data
34
+ test_ratio: Ratio for test data
35
+ """
36
+ self.input_dir = input_dir
37
+ self.output_dir = output_dir
38
+ self.min_score = min_score
39
+ self.train_ratio = train_ratio
40
+ self.val_ratio = val_ratio
41
+ self.test_ratio = test_ratio
42
+
43
+ os.makedirs(output_dir, exist_ok=True)
44
+
45
+ # Track statistics
46
+ self.total_conversations = 0
47
+ self.total_utterances = 0
48
+ self.skipped_files = 0
49
+
50
+ def load_json_file(self, filepath: str) -> Optional[Dict]:
51
+ """Load a single JSON file"""
52
+ try:
53
+ with open(filepath, 'r', encoding='utf-8') as f:
54
+ data = json.load(f)
55
+ return data
56
+ except Exception as e:
57
+ print(f"⚠️ Error loading {filepath}: {e}")
58
+ self.skipped_files += 1
59
+ return None
60
+
61
+ def safe_get_value(self, obj: Any, default: Any = None) -> Any:
62
+ """Safely get a value, handling nested dicts and lists"""
63
+ if isinstance(obj, dict):
64
+ # If it's a dict, try to get a meaningful string representation
65
+ if 'name' in obj:
66
+ return str(obj['name'])
67
+ elif 'value' in obj:
68
+ return str(obj['value'])
69
+ elif 'text' in obj:
70
+ return str(obj['text'])
71
+ else:
72
+ # Return first string value found or convert to string
73
+ for v in obj.values():
74
+ if isinstance(v, str):
75
+ return v
76
+ return str(list(obj.values())[0]) if obj else default
77
+ elif isinstance(obj, list):
78
+ # If it's a list, join elements or return first element
79
+ if obj:
80
+ return str(obj[0]) if len(obj) == 1 else ', '.join(str(x) for x in obj)
81
+ return default
82
+ elif obj is None:
83
+ return default
84
+ else:
85
+ return str(obj)
86
+
87
+ def extract_dialogue_from_json(self, data: Dict, filepath: str) -> List[Dict]:
88
+ """
89
+ Extract dialogue from various JSON formats
90
+ Handles different possible structures
91
+ """
92
+ conversations = []
93
+
94
+ # Try different possible structures
95
+ if isinstance(data, list):
96
+ # If the JSON is directly a list of utterances
97
+ conversations.append({
98
+ 'dialogue': data,
99
+ 'id': os.path.basename(filepath).replace('.json', ''),
100
+ 'score': 100, # Default score
101
+ 'topic': 'general',
102
+ 'source_file': filepath
103
+ })
104
+
105
+ elif isinstance(data, dict):
106
+ # Extract score safely
107
+ score = data.get('score', 100)
108
+ if isinstance(score, dict):
109
+ score = score.get('value', 100) if 'value' in score else 100
110
+ try:
111
+ score = float(score)
112
+ except:
113
+ score = 100
114
+
115
+ # Extract topic safely
116
+ topic = self.safe_get_value(data.get('topic', 'general'), 'general')
117
+
118
+ # Check for different possible keys
119
+ if 'dialogue' in data:
120
+ conversations.append({
121
+ 'dialogue': data['dialogue'],
122
+ 'id': data.get('id', os.path.basename(filepath).replace('.json', '')),
123
+ 'score': score,
124
+ 'topic': topic,
125
+ 'source_file': filepath
126
+ })
127
+
128
+ elif 'messages' in data:
129
+ conversations.append({
130
+ 'dialogue': data['messages'],
131
+ 'id': data.get('id', os.path.basename(filepath).replace('.json', '')),
132
+ 'score': score,
133
+ 'topic': topic,
134
+ 'source_file': filepath
135
+ })
136
+
137
+ elif 'utterances' in data:
138
+ conversations.append({
139
+ 'dialogue': data['utterances'],
140
+ 'id': data.get('id', os.path.basename(filepath).replace('.json', '')),
141
+ 'score': score,
142
+ 'topic': topic,
143
+ 'source_file': filepath
144
+ })
145
+
146
+ elif 'conversations' in data:
147
+ # Multiple conversations in one file
148
+ for conv in data['conversations']:
149
+ if isinstance(conv, dict) and any(key in conv for key in ['dialogue', 'messages', 'utterances']):
150
+ dialogue_key = 'dialogue' if 'dialogue' in conv else ('messages' if 'messages' in conv else 'utterances')
151
+
152
+ # Extract score and topic safely for each conversation
153
+ conv_score = conv.get('score', score)
154
+ if isinstance(conv_score, dict):
155
+ conv_score = conv_score.get('value', 100) if 'value' in conv_score else 100
156
+ try:
157
+ conv_score = float(conv_score)
158
+ except:
159
+ conv_score = 100
160
+
161
+ conv_topic = self.safe_get_value(conv.get('topic', topic), 'general')
162
+
163
+ conversations.append({
164
+ 'dialogue': conv[dialogue_key],
165
+ 'id': conv.get('id', f"{os.path.basename(filepath)}_{len(conversations)}"),
166
+ 'score': conv_score,
167
+ 'topic': conv_topic,
168
+ 'source_file': filepath
169
+ })
170
+
171
+ else:
172
+ # Try to find any list that looks like dialogue
173
+ for key, value in data.items():
174
+ if isinstance(value, list) and len(value) > 0:
175
+ # Check if it looks like dialogue data
176
+ if isinstance(value[0], dict) and any(k in value[0] for k in ['speaker', 'role', 'text', 'content', 'utterance']):
177
+ conversations.append({
178
+ 'dialogue': value,
179
+ 'id': data.get('id', os.path.basename(filepath).replace('.json', '')),
180
+ 'score': score,
181
+ 'topic': topic,
182
+ 'source_file': filepath
183
+ })
184
+ break
185
+
186
+ return conversations
187
+
188
+ def normalize_utterance(self, utterance: Dict) -> Optional[Dict]:
189
+ """
190
+ Normalize utterance format from various possible structures
191
+ Returns: {'speaker': str, 'text': str} or None
192
+ """
193
+ # Determine speaker
194
+ speaker = None
195
+ if 'speaker' in utterance:
196
+ speaker = utterance['speaker']
197
+ elif 'role' in utterance:
198
+ speaker = utterance['role']
199
+ elif 'sender' in utterance:
200
+ speaker = utterance['sender']
201
+ elif 'from' in utterance:
202
+ speaker = utterance['from']
203
+ elif 'type' in utterance:
204
+ speaker = utterance['type']
205
+
206
+ # Determine text content
207
+ text = None
208
+ if 'text' in utterance:
209
+ text = utterance['text']
210
+ elif 'content' in utterance:
211
+ text = utterance['content']
212
+ elif 'message' in utterance:
213
+ text = utterance['message']
214
+ elif 'utterance' in utterance:
215
+ text = utterance['utterance']
216
+ elif 'response' in utterance:
217
+ text = utterance['response']
218
+
219
+ if speaker and text:
220
+ # Normalize speaker labels
221
+ speaker_lower = str(speaker).lower()
222
+ if speaker_lower in ['client', 'user', 'patient', 'クライアント', '相談者', 'c']:
223
+ normalized_speaker = 'client'
224
+ elif speaker_lower in ['counselor', 'therapist', 'assistant', 'カウンセラー', '相談員', 's', 'system']:
225
+ normalized_speaker = 'counselor'
226
+ else:
227
+ # Try to infer from position or content
228
+ normalized_speaker = 'client' if 'client' in speaker_lower else 'counselor'
229
+
230
+ return {
231
+ 'speaker': normalized_speaker,
232
+ 'text': str(text).strip()
233
+ }
234
+
235
+ return None
236
+
237
+ def merge_consecutive_utterances(self, dialogue: List[Dict]) -> List[Dict]:
238
+ """
239
+ Merge consecutive utterances from the same speaker
240
+ Following KokoroChat paper methodology
241
+ """
242
+ if not dialogue:
243
+ return []
244
+
245
+ merged = []
246
+ current_utterance = None
247
+
248
+ for utt in dialogue:
249
+ normalized = self.normalize_utterance(utt)
250
+ if not normalized:
251
+ continue
252
+
253
+ if current_utterance is None:
254
+ current_utterance = normalized
255
+ elif current_utterance['speaker'] == normalized['speaker']:
256
+ # Same speaker - merge utterances
257
+ current_utterance['text'] += ' ' + normalized['text']
258
+ else:
259
+ # Different speaker - save current and start new
260
+ merged.append(current_utterance)
261
+ current_utterance = normalized
262
+
263
+ # Don't forget the last utterance
264
+ if current_utterance:
265
+ merged.append(current_utterance)
266
+
267
+ return merged
268
+
269
+ def create_training_examples(self, conversation: Dict) -> List[Dict]:
270
+ """
271
+ Create training examples with COMPLETE dialogue history
272
+ Following the paper: Dt = {uC1, uS2, uC3, ..., uCt} -> uSt+1
273
+ """
274
+ examples = []
275
+
276
+ # Get dialogue
277
+ dialogue = conversation.get('dialogue', [])
278
+ if not dialogue:
279
+ return []
280
+
281
+ # Merge consecutive utterances from same speaker
282
+ merged_dialogue = self.merge_consecutive_utterances(dialogue)
283
+
284
+ if not merged_dialogue:
285
+ return []
286
+
287
+ # Create examples with COMPLETE history
288
+ for i in range(len(merged_dialogue)):
289
+ current = merged_dialogue[i]
290
+
291
+ # Only create examples where counselor responds
292
+ if current['speaker'] == 'counselor':
293
+ # Get COMPLETE dialogue history from beginning
294
+ complete_history = merged_dialogue[:i]
295
+
296
+ # Skip if no history or if history doesn't start with client
297
+ if not complete_history or complete_history[0]['speaker'] != 'client':
298
+ continue
299
+
300
+ # Ensure topic is a string
301
+ topic = conversation.get('topic', 'general')
302
+ if not isinstance(topic, str):
303
+ topic = self.safe_get_value(topic, 'general')
304
+
305
+ # Create training example
306
+ example = {
307
+ 'dialogue_history': complete_history,
308
+ 'response': current['text'],
309
+ 'score': conversation.get('score', 100),
310
+ 'topic': topic,
311
+ 'conversation_id': conversation.get('id', 'unknown'),
312
+ 'source_file': conversation.get('source_file', 'unknown'),
313
+ 'turn_number': i,
314
+ 'history_length': len(complete_history)
315
+ }
316
+
317
+ examples.append(example)
318
+
319
+ return examples
320
+
321
+ def format_for_training(self, example: Dict, format_type: str = 'simple') -> str:
322
+ """
323
+ Format example for training
324
+
325
+ Args:
326
+ format_type: 'simple' or 'llama' format
327
+ """
328
+ # Build complete dialogue history
329
+ history_text = ""
330
+ for turn in example['dialogue_history']:
331
+ speaker = "クライアント" if turn['speaker'] == 'client' else "カウンセラー"
332
+ history_text += f"{speaker}: {turn['text']}\n"
333
+
334
+ if format_type == 'llama':
335
+ # Llama-style format with special tokens
336
+ formatted = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
337
+ あなたは専門的な訓練を受けた心理カウンセラーです。クライアントの感情に共感し、適切な支援を提供してください。
338
+ これまでの対話履歴全体を考慮して、適切な応答を生成してください。<|eot_id|>
339
+
340
+ <|start_header_id|>user<|end_header_id|>
341
+ 以下は、クライアントとカウンセラーの完全な対話履歴です。
342
+ この履歴全体を踏まえて、次のカウンセラーの応答を生成してください。
343
+
344
+ 完全な対話履歴:
345
+ {history_text}
346
+ 次のカウンセラーの応答を生成してください。<|eot_id|>
347
+
348
+ <|start_header_id|>assistant<|end_header_id|>
349
+ {example['response']}<|eot_id|>"""
350
+
351
+ else:
352
+ # Simple format for models without special tokens
353
+ formatted = f"""### Instruction:
354
+ あなたは専門的な訓練を受けた心理カウンセラーです。
355
+ 以下の完全な対話履歴を踏まえて、カウンセラーとして適切な応答を生成してください。
356
+
357
+ ### Dialogue History:
358
+ {history_text}
359
+ ### Response:
360
+ {example['response']}"""
361
+
362
+ return formatted
363
+
364
+ def process_directory(self, format_type: str = 'simple'):
365
+ """Process all JSON files in the input directory"""
366
+ print(f"🔍 Scanning directory: {self.input_dir}")
367
+
368
+ # Find all JSON files
369
+ json_files = []
370
+ for pattern in ['*.json', '*.jsonl']:
371
+ json_files.extend(glob.glob(os.path.join(self.input_dir, '**', pattern), recursive=True))
372
+
373
+ print(f"Found {len(json_files)} JSON files")
374
+
375
+ if not json_files:
376
+ print("❌ No JSON files found in the directory!")
377
+ return
378
+
379
+ # Process each file
380
+ all_conversations = []
381
+
382
+ for filepath in tqdm(json_files, desc="Loading JSON files"):
383
+ # Handle both .json and .jsonl files
384
+ if filepath.endswith('.jsonl'):
385
+ # JSONL file - each line is a separate JSON object
386
+ with open(filepath, 'r', encoding='utf-8') as f:
387
+ for line_num, line in enumerate(f):
388
+ try:
389
+ data = json.loads(line)
390
+ conversations = self.extract_dialogue_from_json(data, f"{filepath}_line{line_num}")
391
+ all_conversations.extend(conversations)
392
+ except:
393
+ continue
394
+ else:
395
+ # Regular JSON file
396
+ data = self.load_json_file(filepath)
397
+ if data:
398
+ conversations = self.extract_dialogue_from_json(data, filepath)
399
+ all_conversations.extend(conversations)
400
+
401
+ print(f"✅ Loaded {len(all_conversations)} conversations from {len(json_files) - self.skipped_files} files")
402
+ print(f"⚠️ Skipped {self.skipped_files} files due to errors")
403
+
404
+ # Filter by score
405
+ conversations_before_filter = len(all_conversations)
406
+ filtered_conversations = [
407
+ conv for conv in all_conversations
408
+ if conv.get('score', 100) >= self.min_score
409
+ ]
410
+ conversations_after_filter = len(filtered_conversations)
411
+
412
+ print(f"📊 Score filtering (>= {self.min_score}):")
413
+ print(f" Before: {conversations_before_filter} conversations")
414
+ print(f" After: {conversations_after_filter} conversations")
415
+ print(f" Filtered out: {conversations_before_filter - conversations_after_filter} conversations")
416
+
417
+ # Create training examples
418
+ all_examples = []
419
+ history_lengths = []
420
+
421
+ for conv in tqdm(filtered_conversations, desc="Creating training examples"):
422
+ examples = self.create_training_examples(conv)
423
+ all_examples.extend(examples)
424
+ history_lengths.extend([ex['history_length'] for ex in examples])
425
+
426
+ if not all_examples:
427
+ print("❌ No training examples created!")
428
+ return
429
+
430
+ print(f"✅ Created {len(all_examples)} training examples from {len(filtered_conversations)} conversations")
431
+ print(f"📊 Dialogue history statistics:")
432
+ print(f" - Mean length: {np.mean(history_lengths):.1f} turns")
433
+ print(f" - Median length: {np.median(history_lengths):.1f} turns")
434
+ print(f" - Max length: {max(history_lengths)} turns")
435
+ print(f" - Min length: {min(history_lengths)} turns")
436
+
437
+ # Shuffle and split
438
+ random.shuffle(all_examples)
439
+
440
+ train_size = int(self.train_ratio * len(all_examples))
441
+ val_size = int(self.val_ratio * len(all_examples))
442
+
443
+ train_data = all_examples[:train_size]
444
+ val_data = all_examples[train_size:train_size + val_size]
445
+ test_data = all_examples[train_size + val_size:]
446
+
447
+ print(f"\n📂 Split sizes:")
448
+ print(f" Train: {len(train_data)} ({self.train_ratio*100:.0f}%)")
449
+ print(f" Val: {len(val_data)} ({self.val_ratio*100:.0f}%)")
450
+ print(f" Test: {len(test_data)} ({self.test_ratio*100:.0f}%)")
451
+
452
+ # Save splits
453
+ self.save_split(train_data, 'train', format_type)
454
+ self.save_split(val_data, 'val', format_type)
455
+ self.save_split(test_data, 'test', format_type)
456
+
457
+ # Save statistics
458
+ self.save_statistics(
459
+ train_data, val_data, test_data,
460
+ all_conversations, filtered_conversations,
461
+ history_lengths
462
+ )
463
+
464
+ print(f"\n✅ Processing complete! Data saved to {self.output_dir}")
465
+
466
+ def save_split(self, data: List[Dict], split_name: str, format_type: str = 'simple'):
467
+ """Save processed data split"""
468
+ output_file = os.path.join(self.output_dir, f"{split_name}.jsonl")
469
+
470
+ with open(output_file, 'w', encoding='utf-8') as f:
471
+ for example in tqdm(data, desc=f"Saving {split_name} data"):
472
+ formatted_text = self.format_for_training(example, format_type)
473
+
474
+ # Ensure topic is string
475
+ topic = example.get('topic', 'general')
476
+ if not isinstance(topic, str):
477
+ topic = self.safe_get_value(topic, 'general')
478
+
479
+ output_item = {
480
+ 'text': formatted_text,
481
+ 'dialogue_history': example['dialogue_history'],
482
+ 'response': example['response'],
483
+ 'score': example['score'],
484
+ 'topic': topic,
485
+ 'conversation_id': example['conversation_id'],
486
+ 'source_file': example['source_file'],
487
+ 'turn_number': example['turn_number'],
488
+ 'history_length': example['history_length']
489
+ }
490
+
491
+ f.write(json.dumps(output_item, ensure_ascii=False) + '\n')
492
+
493
+ print(f"✅ Saved {split_name} data to {output_file}")
494
+
495
+ def save_statistics(self, train_data, val_data, test_data,
496
+ all_conversations, filtered_conversations, history_lengths):
497
+ """Save comprehensive statistics"""
498
+ # Calculate topic distribution (safely)
499
+ topic_counts = defaultdict(int)
500
+ for example in train_data:
501
+ topic = example.get('topic', 'general')
502
+ if not isinstance(topic, str):
503
+ topic = self.safe_get_value(topic, 'general')
504
+ topic_counts[topic] += 1
505
+
506
+ # Calculate source file distribution
507
+ source_counts = defaultdict(int)
508
+ for example in train_data:
509
+ source_file = os.path.basename(example.get('source_file', 'unknown'))
510
+ source_counts[source_file] += 1
511
+
512
+ # Score statistics for filtered conversations
513
+ scores = [conv.get('score', 100) for conv in filtered_conversations]
514
+
515
+ stats = {
516
+ 'preprocessing_info': {
517
+ 'input_directory': self.input_dir,
518
+ 'output_directory': self.output_dir,
519
+ 'total_files_processed': len(set(conv.get('source_file', 'unknown') for conv in all_conversations)),
520
+ 'total_conversations_loaded': len(all_conversations),
521
+ 'conversations_after_filtering': len(filtered_conversations),
522
+ 'conversations_filtered_out': len(all_conversations) - len(filtered_conversations),
523
+ 'total_training_examples': len(train_data) + len(val_data) + len(test_data),
524
+ 'min_score_threshold': self.min_score,
525
+ 'methodology': 'KokoroChat paper - complete dialogue history'
526
+ },
527
+ 'score_filtering': {
528
+ 'threshold': self.min_score,
529
+ 'before_filtering': len(all_conversations),
530
+ 'after_filtering': len(filtered_conversations),
531
+ 'filtered_out': len(all_conversations) - len(filtered_conversations),
532
+ 'percentage_kept': (len(filtered_conversations) / len(all_conversations) * 100) if all_conversations else 0
533
+ },
534
+ 'score_statistics': {
535
+ 'mean': float(np.mean(scores)),
536
+ 'std': float(np.std(scores)),
537
+ 'min': float(min(scores)),
538
+ 'max': float(max(scores)),
539
+ 'median': float(np.median(scores)),
540
+ 'percentile_25': float(np.percentile(scores, 25)),
541
+ 'percentile_75': float(np.percentile(scores, 75))
542
+ },
543
+ 'split_sizes': {
544
+ 'train': len(train_data),
545
+ 'val': len(val_data),
546
+ 'test': len(test_data),
547
+ 'train_ratio': self.train_ratio,
548
+ 'val_ratio': self.val_ratio,
549
+ 'test_ratio': self.test_ratio
550
+ },
551
+ 'dialogue_history_stats': {
552
+ 'mean_length': float(np.mean(history_lengths)),
553
+ 'std_length': float(np.std(history_lengths)),
554
+ 'min_length': int(min(history_lengths)),
555
+ 'max_length': int(max(history_lengths)),
556
+ 'median_length': float(np.median(history_lengths)),
557
+ 'percentile_25': float(np.percentile(history_lengths, 25)),
558
+ 'percentile_75': float(np.percentile(history_lengths, 75)),
559
+ 'percentile_95': float(np.percentile(history_lengths, 95))
560
+ },
561
+ 'topic_distribution': dict(sorted(topic_counts.items(), key=lambda kv: kv[1], reverse=True)[:20]), # Top 20 topics by count
562
+ 'source_file_distribution': dict(sorted(source_counts.items(), key=lambda kv: kv[1], reverse=True)[:20]), # Top 20 source files by count
563
+ 'history_length_bins': {
564
+ '1-5_turns': sum(1 for l in history_lengths if l <= 5),
565
+ '6-10_turns': sum(1 for l in history_lengths if 5 < l <= 10),
566
+ '11-15_turns': sum(1 for l in history_lengths if 10 < l <= 15),
567
+ '16-20_turns': sum(1 for l in history_lengths if 15 < l <= 20),
568
+ '21-30_turns': sum(1 for l in history_lengths if 20 < l <= 30),
569
+ '31-50_turns': sum(1 for l in history_lengths if 30 < l <= 50),
570
+ '50+_turns': sum(1 for l in history_lengths if l > 50)
571
+ }
572
+ }
573
+
574
+ stats_file = os.path.join(self.output_dir, 'dataset_stats.json')
575
+ with open(stats_file, 'w', encoding='utf-8') as f:
576
+ json.dump(stats, f, ensure_ascii=False, indent=2)
577
+
578
+ print(f"\n📊 Statistics saved to {stats_file}")
579
+
580
+ # Print summary
581
+ print("\n" + "="*70)
582
+ print("📈 DATASET STATISTICS SUMMARY")
583
+ print("="*70)
584
+ print(f"Files processed: {stats['preprocessing_info']['total_files_processed']}")
585
+ print(f"Conversations loaded: {stats['preprocessing_info']['total_conversations_loaded']}")
586
+ print(f"After score filtering (>={self.min_score}): {stats['preprocessing_info']['conversations_after_filtering']}")
587
+ print(f"Training examples created: {stats['preprocessing_info']['total_training_examples']}")
588
+ print(f"\nScore Statistics (after filtering):")
589
+ print(f" Mean: {stats['score_statistics']['mean']:.1f}")
590
+ print(f" Median: {stats['score_statistics']['median']:.1f}")
591
+ print(f" Range: {stats['score_statistics']['min']:.0f} - {stats['score_statistics']['max']:.0f}")
592
+ print(f"\nDialogue History Length Distribution:")
593
+ for bin_name, count in stats['history_length_bins'].items():
594
+ percentage = (count / len(history_lengths)) * 100 if history_lengths else 0
595
+ print(f" {bin_name}: {count} ({percentage:.1f}%)")
596
+ print("="*70)
597
+
598
+
599
+ def main():
600
+ import argparse
601
+
602
+ parser = argparse.ArgumentParser(
603
+ description='Preprocess directory of JSON files with counseling dialogues'
604
+ )
605
+ parser.add_argument(
606
+ '--input_dir',
607
+ type=str,
608
+ default='./KokoroChat/kokorochat_dialogues',
609
+ help='Directory containing JSON files with conversations'
610
+ )
611
+ parser.add_argument(
612
+ '--output_dir',
613
+ type=str,
614
+ default='./kokoro_processed_data',
615
+ help='Output directory for processed data'
616
+ )
617
+ parser.add_argument(
618
+ '--min_score',
619
+ type=int,
620
+ default=70,
621
+ help='Minimum score threshold (if scores exist in data)'
622
+ )
623
+ parser.add_argument(
624
+ '--format',
625
+ type=str,
626
+ choices=['simple', 'llama'],
627
+ default='simple',
628
+ help='Output format type'
629
+ )
630
+
631
+ args = parser.parse_args()
632
+
633
+ # Initialize preprocessor
634
+ preprocessor = KokoroChatDirectoryPreprocessor(
635
+ input_dir=args.input_dir,
636
+ output_dir=args.output_dir,
637
+ min_score=args.min_score
638
+ )
639
+
640
+ print("🚀 Starting preprocessing with COMPLETE dialogue history")
641
+ print(" Following KokoroChat paper methodology")
642
+ print("="*70)
643
+
644
+ # Process directory
645
+ preprocessor.process_directory(format_type=args.format)
646
+
647
+ print("\n✅ Preprocessing complete!")
648
+
649
+
650
+ if __name__ == "__main__":
651
+ main()
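
A minimal sketch of how the preprocessor above might be invoked programmatically instead of via the CLI. The module name follows this file, the directory defaults follow `main()`, and the comment on the output record layout reflects `save_split` above.

    from preprocess_kokoro_method import KokoroChatDirectoryPreprocessor

    preprocessor = KokoroChatDirectoryPreprocessor(
        input_dir="./KokoroChat/kokorochat_dialogues",
        output_dir="./kokoro_processed_data",
        min_score=70,
    )
    preprocessor.process_directory(format_type="simple")

    # Each line of ./kokoro_processed_data/train.jsonl is one JSON object with
    # 'text' (the formatted prompt plus response), 'dialogue_history', 'response',
    # 'score', 'topic', 'conversation_id', and bookkeeping fields.
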
score_analysis_threshold_60.png ADDED

Git LFS Details

  • SHA256: db92bcddc596a139b29fd09421a87035d96547c5ff020732e2662e2c366e7d79
  • Pointer size: 131 Bytes
  • Size of remote file: 343 kB
score_distribution.png ADDED

Git LFS Details

  • SHA256: 4a943a076098bbe516b6cb0cb71a39c8a5d704b6c7b60398b89747065665fd15
  • Pointer size: 131 Bytes
  • Size of remote file: 353 kB
training_config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "model_name_or_path": "LiquidAI/LFM2-2.6B",
3
+ "use_lora": true,
4
+ "lora_r": 64,
5
+ "lora_alpha": 128,
6
+ "lora_dropout": 0.05,
7
+ "data_path": "./kokoro_processed_data",
8
+ "max_seq_length": 2048,
9
+ "response_template": "### Response:",
10
+ "output_dir": "./lfm_trl_finetuned",
11
+ "num_train_epochs": 3,
12
+ "per_device_train_batch_size": 4,
13
+ "per_device_eval_batch_size": 4,
14
+ "gradient_accumulation_steps": 4,
15
+ "learning_rate": 2e-4,
16
+ "warmup_ratio": 0.1,
17
+ "logging_steps": 10,
18
+ "save_steps": 100,
19
+ "eval_steps": 100,
20
+ "bf16": true,
21
+ "tf32": true,
22
+ "seed": 42
23
+ }
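
The training script that consumes this config is not included in the upload, so the snippet below is only a sketch of how the hyperparameter fields could be mapped onto Hugging Face `TrainingArguments`; the LoRA, data, and response-template keys would be handled separately by whatever trainer reads the file.

    import json
    from transformers import TrainingArguments

    with open("training_config.json") as f:
        cfg = json.load(f)

    # Map only the fields that TrainingArguments understands directly.
    args = TrainingArguments(
        output_dir=cfg["output_dir"],
        num_train_epochs=cfg["num_train_epochs"],
        per_device_train_batch_size=cfg["per_device_train_batch_size"],
        per_device_eval_batch_size=cfg["per_device_eval_batch_size"],
        gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
        learning_rate=cfg["learning_rate"],
        warmup_ratio=cfg["warmup_ratio"],
        logging_steps=cfg["logging_steps"],
        save_steps=cfg["save_steps"],
        eval_steps=cfg["eval_steps"],
        bf16=cfg["bf16"],
        tf32=cfg["tf32"],
        seed=cfg["seed"],
    )
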