Upload folder using huggingface_hub
Browse files- data_preparation.py +133 -0
- dataprocessing_multiturn.py +177 -0
- finetune_lfm2.6b.py +303 -0
data_preparation.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# data_preparation.py
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import List, Dict, Tuple
|
| 7 |
+
import numpy as np
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from sklearn.model_selection import train_test_split
|
| 10 |
+
|
| 11 |
+
class KokoroChatProcessor:
    """Load KokoroChat counseling dialogues and turn them into instruction-tuning examples.

    Typical flow: load_all_conversations() -> create_training_examples()
    -> prepare_for_finetuning().
    """

    def __init__(self, data_path: str):
        # Root directory searched recursively for *.json dialogue files.
        self.data_path = Path(data_path)
        self.conversations: List[Dict] = []   # raw parsed JSON documents
        self.processed_data: List[Dict] = []  # flattened training examples

    def load_all_conversations(self) -> List[Dict]:
        """Load all JSON files from the KokoroChat dataset into self.conversations."""
        json_files = list(self.data_path.glob("**/*.json"))
        print(f"Found {len(json_files)} conversation files")

        for json_file in tqdm(json_files, desc="Loading conversations"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.conversations.append(data)
            except Exception as e:
                # Best-effort load: report and skip unreadable/corrupt files.
                print(f"Error loading {json_file}: {e}")

        return self.conversations

    def create_training_examples(self) -> List[Dict]:
        """Convert conversations to (input, output) training examples.

        BUG FIX: the previous implementation paired dialogue[i] (counselor) with
        dialogue[i + 1] (client), i.e. the "output" counselor utterance came
        BEFORE the client "input" in the conversation, and the context
        (dialogue[:i+1]) already contained the answer.  We now scan every
        adjacent client -> counselor pair, so the output is the counselor's
        actual response and the context holds only the turns before the client
        message.  Scanning all adjacencies (not every other index) also keeps
        pairs when roles do not alternate strictly.
        """
        for conv_data in tqdm(self.conversations, desc="Processing conversations"):
            dialogue = conv_data.get('dialogue', [])
            topic = conv_data.get('topic', {})
            review = conv_data.get('review_by_client_jp', {})

            for i in range(len(dialogue) - 1):
                client_msg = dialogue[i]
                counselor_msg = dialogue[i + 1]

                if client_msg['role'] == 'client' and counselor_msg['role'] == 'counselor':
                    # Context is everything said BEFORE the client message we answer.
                    context = self._build_context(dialogue[:i])

                    training_example = {
                        'instruction': "あなたは共感的で専門的な心理カウンセラーです。クライアントの悩みに寄り添い、適切なサポートを提供してください。",
                        'input': f"クライアント: {client_msg['utterance']}",
                        'output': counselor_msg['utterance'],
                        'context': context,
                        'topic': topic.get('main_jp', ''),
                        'quality_score': self._calculate_quality_score(review)
                    }

                    self.processed_data.append(training_example)

        return self.processed_data

    def _build_context(self, dialogue_history: List[Dict], max_turns: int = 5) -> str:
        """Render the most recent max_turns exchanges of history as role-tagged lines."""
        context_parts = []
        # Keep at most max_turns exchanges (2 utterances per exchange).
        start_idx = max(0, len(dialogue_history) - max_turns * 2)

        for msg in dialogue_history[start_idx:]:
            role = "カウンセラー" if msg['role'] == 'counselor' else "クライアント"
            context_parts.append(f"{role}: {msg['utterance']}")

        return "\n".join(context_parts)

    def _calculate_quality_score(self, review: Dict) -> float:
        """Return a 0..1 quality score from the client review ('点数' is assumed 0-100)."""
        if not review or review.get('点数') is None:
            return 0.5  # Default middle score when no review is available
        return review.get('点数', 50) / 100.0

    def prepare_for_finetuning(self, test_size: float = 0.1, val_size: float = 0.1):
        """Split high-quality examples into train/val/test prompt dicts.

        Returns:
            (train, val, test) lists of {'text': prompt} dicts.
        """
        # Keep only examples whose review score exceeds 0.6.
        high_quality = [ex for ex in self.processed_data if ex['quality_score'] > 0.6]
        print(f"Selected {len(high_quality)} high-quality examples")

        # Deterministic splits (fixed random_state) for reproducibility.
        train_data, test_data = train_test_split(high_quality, test_size=test_size, random_state=42)
        train_data, val_data = train_test_split(train_data, test_size=val_size, random_state=42)

        def format_example(ex):
            # Single-string prompt in the 指示/コンテキスト/入力/応答 layout.
            prompt = f"""### 指示:
{ex['instruction']}

### コンテキスト:
{ex['context']}

### 入力:
{ex['input']}

### 応答:
{ex['output']}"""
            return {'text': prompt}

        train_formatted = [format_example(ex) for ex in train_data]
        val_formatted = [format_example(ex) for ex in val_data]
        test_formatted = [format_example(ex) for ex in test_data]

        return train_formatted, val_formatted, test_formatted
|
| 115 |
+
|
| 116 |
+
# Execute data preparation: load, convert, split.
processor = KokoroChatProcessor('KokoroChat/data')
processor.load_all_conversations()
processor.create_training_examples()
train_data, val_data, test_data = processor.prepare_for_finetuning()

# Persist all three splits in a single pickle for the training script.
import pickle

splits = {
    'train': train_data,
    'val': val_data,
    'test': test_data,
}
with open('processed_data.pkl', 'wb') as dump_file:
    pickle.dump(splits, dump_file)

# Report split sizes so a truncated run is easy to spot in the log.
for label, split in (("Training", train_data), ("Validation", val_data), ("Test", test_data)):
    print(f"{label} examples: {len(split)}")
|
dataprocessing_multiturn.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# prepare_dataset_multiturn.py
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from datasets import Dataset, Features, Value
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
def parse_kokorochat_with_context(json_file_path, context_window=4, max_history_tokens=1500):
    """
    Parse one KokoroChat JSON file into (examples, score), keeping dialogue history.

    Each example answers a client utterance with the counselor utterance that
    immediately follows it, plus up to ``context_window`` preceding turns.

    Args:
        json_file_path: Path to the JSON file.
        context_window: Number of previous turns to include (default: 4 = 2 exchanges).
        max_history_tokens: Approximate token budget for history (~3 chars/token
            for Japanese); pairs with longer history are skipped.

    Returns:
        (list of example dicts, overall client review score).
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as src:
            data = json.load(src)
    except Exception:
        # Unreadable / malformed file behaves like an empty, zero-score dialogue.
        return [], 0

    dialogue = data.get('dialogue', [])
    total_score = data.get('review_by_client_en', {}).get('score', 0)
    topic = data.get('topic', {})
    main_topic = topic.get('main_en', '')
    sub_topic = topic.get('sub', '')
    dialogue_id = Path(json_file_path).stem

    examples = []
    # Walk adjacent turn pairs; only client -> counselor adjacencies become pairs.
    for i, (turn, reply) in enumerate(zip(dialogue, dialogue[1:])):
        if turn['role'] != 'client' or reply['role'] != 'counselor':
            continue

        client_msg = turn['utterance'].strip()
        counselor_msg = reply['utterance'].strip()
        if len(client_msg) <= 5 or len(counselor_msg) <= 5:
            continue  # drop near-empty utterances

        history = dialogue[max(0, i - context_window):i]
        # Rough length check: ~3 characters per Japanese token.
        history_chars = sum(len(h['utterance']) for h in history)
        if history_chars >= max_history_tokens * 3:
            continue  # history too long for the sequence budget

        examples.append({
            'history': history,
            'client': client_msg,
            'counselor': counselor_msg,
            'quality_score': total_score,
            'topic_main': main_topic,
            'topic_sub': sub_topic,
            'dialogue_id': dialogue_id,
        })

    return examples, total_score
|
| 64 |
+
|
| 65 |
+
def format_conversation_for_lfm2(conversation):
    """
    Render a conversation (history + current exchange) as an LFM2 ChatML string.

    History turns map client -> user and counselor -> assistant; any other role
    is dropped.  The final counselor reply is the supervised target and is
    terminated with <|endoftext|>.
    """
    parts = [
        "<|im_start|>system\n",
        "あなたは経験豊富な心理カウンセラーです。クライアントの話を傾聴し、共感的で支援的な応答をしてください。<|im_end|>\n",
    ]

    role_map = {'client': 'user', 'counselor': 'assistant'}
    for turn in conversation['history']:
        chat_role = role_map.get(turn['role'])
        if chat_role is not None:
            parts.append(f"<|im_start|>{chat_role}\n{turn['utterance']}<|im_end|>\n")

    # Current exchange: the pair the model is actually trained on.
    parts.append(f"<|im_start|>user\n{conversation['client']}<|im_end|>\n")
    parts.append(f"<|im_start|>assistant\n{conversation['counselor']}<|im_end|><|endoftext|>")

    return "".join(parts)
|
| 85 |
+
|
| 86 |
+
def create_training_dataset_multiturn(
    data_dir="./KokoroChat/data",
    min_score=70,
    context_window=4
):
    """
    Build a HuggingFace dataset of multi-turn counseling examples.

    Args:
        data_dir: Directory scanned recursively for KokoroChat JSON files.
        min_score: Minimum client review score (0-100) a dialogue must reach
            for its examples to be kept (recommend 85 for top quality).
        context_window: Number of previous turns kept as conversation context.

    Returns:
        A train/test DatasetDict (also written to ./kokorochat_processed_multiturn),
        or None when no dialogue qualifies.
    """
    json_files = list(Path(data_dir).rglob("*.json"))
    print(f"Found {len(json_files)} JSON files")

    collected = []
    score_distribution = []

    print("\nProcessing files with multi-turn context...")
    for idx, json_file in enumerate(json_files):
        if idx % 1000 == 0:
            print(f"Processed {idx}/{len(json_files)} files...")

        try:
            convs, score = parse_kokorochat_with_context(
                json_file,
                context_window=context_window
            )
        except Exception:
            continue  # skip files the parser cannot handle

        score_distribution.append(score)
        if score >= min_score:
            collected.extend(convs)

    qualifying = sum(1 for s in score_distribution if s >= min_score)
    print(f"\n=== Processing Results ===")
    print(f"High-quality files (>= {min_score}): {qualifying}")
    print(f"Total conversation examples: {len(collected)}")

    if not collected:
        print(f"❌ No conversations found! Try lowering min_score (current: {min_score})")
        return None

    # Flatten each conversation into the columnar record the dataset expects.
    formatted_data = [
        {
            'text': format_conversation_for_lfm2(conv),
            'quality_score': conv['quality_score'],
            'topic_main': conv['topic_main'],
            'topic_sub': conv['topic_sub'],
            'has_context': len(conv['history']) > 0,
        }
        for conv in collected
    ]

    # Explicit schema keeps column dtypes stable across runs.
    features = Features({
        'text': Value('string'),
        'quality_score': Value('int64'),
        'topic_main': Value('string'),
        'topic_sub': Value('string'),
        'has_context': Value('bool'),
    })

    df = pd.DataFrame(formatted_data)
    dataset = Dataset.from_pandas(df, features=features).train_test_split(
        test_size=0.1, seed=42
    )

    print(f"\n=== Final Dataset ===")
    print(f"Training samples: {len(dataset['train'])}")
    print(f"Validation samples: {len(dataset['test'])}")
    print(f"Examples with context: {sum(df['has_context'])}")

    dataset.save_to_disk("./kokorochat_processed_multiturn")
    print("\n✅ Multi-turn dataset saved to ./kokorochat_processed_multiturn")

    # Show one formatted example (truncated) as a sanity check.
    print("\n=== Sample Training Example (with context) ===")
    sample = dataset['train'][5]['text']
    print(sample[:1000] + "\n..." if len(sample) > 1000 else sample)

    return dataset
|
| 171 |
+
|
| 172 |
+
if __name__ == "__main__":
    # Build the multi-turn dataset from the raw KokoroChat dialogue dump.
    dataset = create_training_dataset_multiturn(
        data_dir="./KokoroChat/kokorochat_dialogues",
        min_score=60,       # roughly the top 30% of dialogues by client score
        context_window=4,   # carry up to 4 previous turns as context
    )
|
finetune_lfm2.6b.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# finetune_lfm2_2.6b_FIXED.py
|
| 2 |
+
import torch
|
| 3 |
+
from transformers import (
|
| 4 |
+
AutoTokenizer,
|
| 5 |
+
AutoModelForCausalLM,
|
| 6 |
+
TrainingArguments,
|
| 7 |
+
Trainer,
|
| 8 |
+
BitsAndBytesConfig,
|
| 9 |
+
GPT2Tokenizer
|
| 10 |
+
)
|
| 11 |
+
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| 12 |
+
from datasets import load_from_disk
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from typing import Any, Dict, List
|
| 15 |
+
import wandb
|
| 16 |
+
import os
|
| 17 |
+
import warnings
|
| 18 |
+
warnings.filterwarnings('ignore')
|
| 19 |
+
|
| 20 |
+
# ---------------------------------------------------------------------------
# Environment banner: surface the PyTorch/CUDA/GPU configuration up front so a
# mis-provisioned machine is obvious in the first lines of the log.
# ---------------------------------------------------------------------------
print("=" * 80)
print("LFM2-2.6B FINE-TUNING - FIXED VERSION")
print("=" * 80)
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

if torch.cuda.is_available():
    # Used later to pick the batch size / gradient-accumulation split.
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"GPU Memory: {gpu_memory_gb:.1f} GB")

# Import check only: fail fast before any training work if bitsandbytes is broken.
import bitsandbytes as bnb
print("✅ BitsAndBytes OK")

# Initialize W&B experiment tracking.
wandb.init(
    project="liquid-ai-hackathon-kokorochat",
    name="LFM2-2.6B-counselor-FIXED",
    config={
        "model": "LFM2-2.6B",
        "dataset": "KokoroChat-MultiTurn",
        "task": "psychological-counseling"
    }
)

print("\n" + "=" * 80)
print("LOADING MODEL (WITH FALLBACK)")
print("=" * 80)

LOCAL_MODEL_PATH = "./models/LFM2-2.6B"  # preferred: pre-downloaded snapshot
HF_MODEL_NAME = "LiquidAI/LFM2-2.6B"     # fallback: pull from the Hub

# 1. Load tokenizer with GPT2 fallback
print("\n1. Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        LOCAL_MODEL_PATH,
        trust_remote_code=True,
        local_files_only=True
    )
    print(" ✅ LFM2 tokenizer loaded!")
except Exception as e:
    # NOTE(review): GPT-2's byte-level BPE is a poor fit for Japanese text; if
    # this fallback ever triggers, training quality is questionable — confirm
    # falling back silently is acceptable rather than aborting here.
    print(f" ⚠️ LFM2 tokenizer failed")
    print(" 🔄 Using GPT2 tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    print(" ✅ GPT2 tokenizer loaded!")

# Reuse EOS as PAD; the data collator below masks pad positions out of the labels.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 2. QLoRA config: 4-bit NF4 quantization with bf16 compute and double quant.
print("\n2. Configuring QLoRA...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# 3. Load model with proper fallback
print("\n3. Loading LFM2-2.6B model...")

# LFM2 ships custom modeling code; a weights-only snapshot cannot be loaded,
# so check for the remote-code files first.
print(" 📥 Checking for custom model files...")

custom_files = ["modeling_lfm2.py", "configuration_lfm2.py"]
has_custom_files = all(
    os.path.exists(os.path.join(LOCAL_MODEL_PATH, f))
    for f in custom_files
)

if not has_custom_files:
    print(" ⚠️ Custom model files missing in local directory")
    print(" 📥 Need to download from HuggingFace with custom code...")

    # Download the full snapshot (weights + custom code) into the local path.
    from huggingface_hub import snapshot_download

    print(" ⏳ Downloading model with custom code (one-time)...")
    snapshot_download(
        repo_id=HF_MODEL_NAME,
        local_dir=LOCAL_MODEL_PATH,
        local_dir_use_symlinks=False,
        ignore_patterns=[]  # Don't ignore anything
    )
    print(" ✅ Model downloaded with custom code!")

# Now load the model
print(" ⏳ Loading model (~2-4 minutes)...")

try:
    # Try local first; trust_remote_code is required for LFM2's custom classes.
    model = AutoModelForCausalLM.from_pretrained(
        LOCAL_MODEL_PATH,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,  # CRITICAL!
        torch_dtype=torch.bfloat16,
        local_files_only=False  # Allow downloading custom code if needed
    )
    print(" ✅ Model loaded from local!")

except Exception as e:
    print(f" ⚠️ Local load failed: {str(e)[:100]}")
    print(" 📥 Loading directly from HuggingFace...")

    # Load from HuggingFace Hub
    model = AutoModelForCausalLM.from_pretrained(
        HF_MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    )
    print(" ✅ Model loaded from HuggingFace!")

# Prepare quantized model for k-bit (QLoRA) training.
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing
print(" ✅ Model prepared!")

# 4. LoRA - 2.6B configuration
print("\n4. Applying LoRA (2.6B config)...")
lora_config = LoraConfig(
    r=64,  # Higher rank for 2.6B
    lora_alpha=128,
    # Adapt all attention and MLP projections.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print("\n📊 Trainable Parameters:")
model.print_trainable_parameters()

# 5. Load the multi-turn dataset produced by the preparation script.
print("\n5. Loading dataset...")
dataset = load_from_disk("./kokorochat_processed_multiturn")
print(f" ✅ Training: {len(dataset['train']):,}, Val: {len(dataset['test']):,}")
|
| 161 |
+
|
| 162 |
+
# 6. Data Collator (same as 1.2B)
|
| 163 |
+
@dataclass
class DataCollatorForCausalLM:
    """Collate raw-text features into padded, label-masked tensors for causal LM training.

    Labels are a copy of input_ids with pad positions set to -100 so the loss
    ignores padding.
    """
    tokenizer: Any
    max_length: int = 2048

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        encoded = self.tokenizer(
            [example["text"] for example in features],
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        labels = encoded["input_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100
        encoded["labels"] = labels
        return encoded
|
| 180 |
+
|
| 181 |
+
data_collator = DataCollatorForCausalLM(tokenizer=tokenizer)

# 7. Training Configuration - 2.6B optimized
print("\n6. Configuring training (2.6B optimized)...")

# NOTE(review): assumes CUDA is available; this line crashes on CPU-only hosts.
gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)

# Keep the effective batch size at 32 regardless of GPU size.
if gpu_memory_gb >= 70:
    per_device_batch = 2
    grad_accum = 16
    print(f" 🚀 {gpu_memory_gb:.0f}GB GPU → batch=2, accum=16")
else:
    per_device_batch = 1
    grad_accum = 32
    print(f" ⚡ {gpu_memory_gb:.0f}GB GPU → batch=1, accum=32")

training_args = TrainingArguments(
    output_dir="./lfm2-2.6b-checkpoints-fixed",

    # Batch (memory-adjusted for 2.6B)
    per_device_train_batch_size=per_device_batch,
    per_device_eval_batch_size=per_device_batch,
    gradient_accumulation_steps=grad_accum,

    # Learning (optimized for 2.6B)
    num_train_epochs=3,  # fewer epochs than the 1.2B run
    learning_rate=2e-4,  # lower than 1.2B's 3e-4 for stability
    warmup_steps=200,
    lr_scheduler_type="cosine",

    # Optimization / logging / checkpointing
    fp16=False,
    bf16=True,  # bf16 matches the 4-bit compute dtype configured above
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    optim="paged_adamw_8bit",  # paged 8-bit Adam keeps optimizer state small
    report_to="wandb",
    gradient_checkpointing=True,  # trade compute for activation memory
    max_grad_norm=0.3,
    logging_dir="./logs",
    remove_unused_columns=False,  # custom collator reads the raw 'text' column
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)

# Derived statistics for the banner below.
effective_batch = per_device_batch * grad_accum
steps_per_epoch = len(dataset['train']) // effective_batch
total_steps = steps_per_epoch * 3  # NOTE(review): 3 duplicates num_train_epochs above

print("\n" + "=" * 80)
print("📊 2.6B TRAINING CONFIGURATION")
print("=" * 80)
print(f"\n✅ Batch Config:")
print(f" Per-device: {per_device_batch}")
print(f" Gradient accum: {grad_accum}")
print(f" → Effective: {effective_batch}")

print(f"\n✅ Learning Config:")
print(f" Learning rate: 2e-4 (vs 3e-4 for 1.2B)")
print(f" Epochs: 3 (vs 4 for 1.2B)")
print(f" LoRA rank: 64 (vs 32 for 1.2B)")

print(f"\n✅ Training Stats:")
print(f" Training samples: {len(dataset['train']):,}")
print(f" Steps per epoch: {steps_per_epoch:,}")
print(f" Total steps: {total_steps:,}")

print(f"\n⏱️ Estimated Time:")
if gpu_memory_gb >= 80:
    print(f" ~5-8 hours on {gpu_memory_gb:.0f}GB GPU")
else:
    print(f" ~8-12 hours on {gpu_memory_gb:.0f}GB GPU")
|
| 259 |
+
|
| 260 |
+
# 8. Trainer (same setup as the 1.2B run)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

# 9. Start training
print("\n" + "=" * 80)
print("🚀 STARTING 2.6B TRAINING")
print("=" * 80)
print(f"📊 Monitor: https://wandb.ai/sandeeptechiot-ai/liquid-ai-hackathon-kokorochat\n")

try:
    trainer.train()
    print("\n✅ TRAINING COMPLETE!")

except KeyboardInterrupt:
    # Ctrl-C: keep whatever progress was made.
    # NOTE(review): execution falls through to the final save below after an
    # interrupt (no re-raise) — confirm that is intended.
    print("\n⚠️ Interrupted - saving...")
    trainer.save_model("./lfm2-2.6b-interrupted")

except Exception as e:
    # Anything else: print the traceback, then re-raise so the run fails loudly.
    print(f"\n❌ Error: {e}")
    import traceback
    traceback.print_exc()
    raise

# 10. Save the trainer output and the standalone LoRA adapter.
output_dir = "./lfm2-2.6b-counselor-final"
lora_dir = "./lfm2-2.6b-counselor-lora"

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)  # keep tokenizer alongside the model
model.save_pretrained(lora_dir)        # adapter-only weights for lightweight deployment

print(f"\n✅ Model saved to: {output_dir}")

wandb.finish()

print("\n" + "=" * 80)
print("🎉 2.6B TRAINING COMPLETE!")
print("=" * 80)