"""Bubble AI — Gradio chat UI around the Hermes-3-Llama-3.1-8B text-generation pipeline."""
import os

# Must be set before the tokenizers library spawns worker threads.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import gradio as gr
import torch
from transformers import AutoTokenizer, pipeline

print("=== BUBBLE AI STARTING ===")

# Hermes-3-Llama-3.1-8B (already proven working)
model_name = "NousResearch/Hermes-3-Llama-3.1-8B"
print(f"Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Llama tokenizers ship without a pad token; reuse EOS so padding works.
    tokenizer.pad_token = tokenizer.eos_token

# BUG FIX: the original used dtype=torch.float16 together with device=-1 (CPU).
# Half precision is unsupported by most CPU kernels in transformers and either
# raises at generation time or is extremely slow — use float32 on CPU.
# Passing the already-loaded tokenizer avoids loading it a second time.
# NOTE(review): trust_remote_code executes code from the model repo; kept for
# backward compatibility, but confirm this model actually requires it.
pipe = pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    device=-1,  # CPU
    dtype=torch.float32,
    trust_remote_code=True,
)

print("✅ Model loaded successfully!")


def _to_messages(message, history):
    """Build the [{'role': ..., 'content': ...}] list for the chat template.

    Accepts both Gradio history formats: message dicts (type="messages")
    and legacy [user, assistant] pairs.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # gradio type="messages" format
            if turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # legacy (user, assistant) pair format
            user_msg, bot_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    return messages


def chat(message, history):
    """Generate a reply to *message*, returning the text (or an error string).

    BUG FIX: the original ignored *history*, so every turn lost all prior
    context. The full conversation is now rendered through the model's chat
    template when one is available; otherwise the raw message is used as
    before.
    """
    try:
        if getattr(tokenizer, "chat_template", None):
            prompt = tokenizer.apply_chat_template(
                _to_messages(message, history),
                tokenize=False,
                add_generation_prompt=True,
            )
        else:
            prompt = message

        outputs = pipe(
            prompt,
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            truncation=True,
            # Cleaner than manually stripping the echoed prompt: the pipeline
            # returns only the newly generated continuation.
            return_full_text=False,
        )

        # Handle pipeline output format
        if isinstance(outputs, list) and outputs:
            response = outputs[0]["generated_text"].strip()
        else:
            response = str(outputs).strip()

        return response or "Interesting, tell me more..."
    except Exception as e:
        # Surface the error in the chat window instead of crashing the UI.
        return f"Error: {str(e)}"


# Clean Gradio interface - no invalid parameters
demo = gr.ChatInterface(
    fn=chat,
    title="Bubble AI - Hermes 3",
    description="Claude 4.5 Opus-level conversational AI for your platform",
)

if __name__ == "__main__":
    demo.launch()