""" Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU) Instructions for Colab: 1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth) 2. Run the following installation commands in your first cell: !pip install unsloth "trl<0.9.0" !pip install openenv-core pydantic httpx !git clone !cd AutoMathReasoner && pip install -e . 3. Run the following Python script in the next cell. """ import collections import random import unsloth # Must be imported before trl/transformers/peft for patching. from datasets import Dataset import torch import numpy as np # Unsloth & TRL from unsloth import FastLanguageModel from trl import GRPOConfig, GRPOTrainer # AutoMathReasoner OpenEnv Client import sys sys.path.append("./AutoMathReasoner") from AutoMathReasoner.client import AutomathreasonerEnv from AutoMathReasoner.env.models import AutomathreasonerAction # 1. Configuration # Replace with your actual Hugging Face Space URL! HF_SPACE_URL = "https://your-username-automathreasoner.hf.space" env = AutomathreasonerEnv(url=HF_SPACE_URL) max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit lora_rank = 16 # T4 (and many non-Ampere GPUs) do not support bf16; pick precision dynamically. has_cuda = torch.cuda.is_available() use_bf16 = has_cuda and torch.cuda.is_bf16_supported() use_fp16 = has_cuda and not use_bf16 # 2. Load Model via Unsloth (optimized for Free Colab VRAM) print("Loading model via Unsloth...") model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Pre-quantized 4bit for fast download max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) # Enable LoRA fine-tuning model = FastLanguageModel.get_peft_model( model, r = lora_rank, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha = lora_rank, use_gradient_checkpointing = "unsloth", # Crucial for fitting into T4 ) # 3. Prepare Prompts from the Remote Environment print("Gathering initial prompts from HF Space environment...") initial_prompts = [] for _ in range(50): # Increased from 30 for better coverage # This fires an HTTP request to your Hugging Face Space obs = env.reset() initial_prompts.append({"prompt": obs.problem_text}) # Deduplicate seen = set() unique_prompts = [] for p in initial_prompts: if p["prompt"] not in seen: seen.add(p["prompt"]) unique_prompts.append(p) print(f" Generated {len(unique_prompts)} unique training prompts") dataset = Dataset.from_list(unique_prompts) # 4. Define Reward Function for TRL # Track stats for logging reward_stats = {"total_calls": 0, "total_correct": 0, "total_reward": 0.0} def compute_rewards(prompts, completions, **kwargs): """ Interfaces with the OpenEnv running on Hugging Face Spaces. Extracts the generation, passes it via HTTP to the env, and yields the dense reward. Improvements over v1: 1. Better answer parsing with multiple delimiter support 2. Confidence-weighted self-consistency bonus 3. Format compliance awareness 4. 
Progress logging """ rewards = [] parsed_actions = [] prompt_answers = collections.defaultdict(list) # Parse all completions for prompt, completion in zip(prompts, completions): try: if "Answer:" in completion: parts = completion.split("Answer:") reasoning = parts[0].strip() answer = parts[1].strip() if len(parts) > 1 else "" elif "answer:" in completion.lower(): idx = completion.lower().index("answer:") reasoning = completion[:idx].strip() answer = completion[idx + 7:].strip() else: lines = completion.strip().split('\n') if len(lines) > 1: reasoning = '\n'.join(lines[:-1]).strip() answer = lines[-1].strip() else: reasoning = completion answer = "" except Exception: reasoning = completion answer = "" parsed_actions.append((prompt, completion, reasoning, answer)) prompt_answers[prompt].append(answer) # Majority voting with confidence majority_answers = {} majority_confidence = {} for p, ans_list in prompt_answers.items(): if ans_list: counter = collections.Counter(ans_list) most_common = counter.most_common(1)[0] majority_answers[p] = most_common[0] majority_confidence[p] = most_common[1] / len(ans_list) for p, c, r, a in parsed_actions: action = AutomathreasonerAction(reasoning=r, final_answer=a) # Reset and step through HTTP API obs = env.reset() step_obs = env.step(action) r_total = step_obs.reward # Confidence-weighted self-consistency bonus majority = majority_answers.get(p, "") confidence = majority_confidence.get(p, 0.0) if (a == majority) and len(a) > 0 and confidence > 0.3: r_total += 0.05 + 0.10 * confidence r_total = max(-1.0, min(1.5, r_total)) rewards.append(r_total) # Stats reward_stats["total_calls"] += 1 is_correct = step_obs.metadata.get('is_correct', False) if hasattr(step_obs, 'metadata') else False reward_stats["total_correct"] += 1 if is_correct else 0 reward_stats["total_reward"] += r_total # Log every 30 calls if reward_stats["total_calls"] % 30 < len(prompts): n = reward_stats["total_calls"] avg_r = reward_stats["total_reward"] / max(1, n) acc = reward_stats["total_correct"] / max(1, n) print(f" šŸ“Š Colab Step {n}: AvgReward={avg_r:.3f}, Accuracy={acc:.2%}") return rewards # 5. Execute Training (T4-optimized parameters) training_args = GRPOConfig( output_dir="colab_outputs", # Learning rate — matched to dense reward signal learning_rate=5e-6, # Batch — T4 memory-safe per_device_train_batch_size=1, gradient_accumulation_steps=4, # Sequence lengths — room for math reasoning + hints max_prompt_length=192, # Was 128 max_completion_length=384, # Was 256 # GRPO group — K=8 (kept for T4 memory, was 4) num_generations=8, # Increased from 4, still T4-safe # Training duration max_steps=200, # Was 150 # Logging logging_steps=5, # Warmup warmup_ratio=0.08, # 8-bit optimizer saves VRAM optim="adamw_8bit", bf16=use_bf16, fp16=use_fp16, use_cpu=not has_cuda, ) trainer = GRPOTrainer( model=model, reward_funcs=[compute_rewards], args=training_args, train_dataset=dataset, ) print("šŸš€ Starting GRPO Training in Colab using Remote HF Environment...") print(f" Config: lr={training_args.learning_rate}, " f"generations={training_args.num_generations}, " f"max_steps={training_args.max_steps}") # Will show wandb/tensorboard logging so you can prove "it is actually learning" trainer.train() # Print final summary n = reward_stats["total_calls"] if n > 0: print(f"\nšŸ“ˆ Final Colab Training Summary:") print(f" Total reward calls: {n}") print(f" Overall accuracy: {reward_stats['total_correct'] / n:.2%}") print(f" Average reward: {reward_stats['total_reward'] / n:.4f}") # 6. 
# 6. Push to Hugging Face
# Optional: save the adapter locally or push it to the Hub once training finishes
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
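# A minimal local-save sketch (standard save_pretrained API); this writes only
# the LoRA adapter and tokenizer, not a merged full model. The directory name
# is a placeholder.
model.save_pretrained("automathreasoner_lora")
tokenizer.save_pretrained("automathreasoner_lora")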