import os
import re
import sys
import warnings

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Suppress all warnings (including asyncio noise from Gradio's server).
warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"


class SuppressStderr:
    """Context manager that temporarily redirects stderr to os.devnull.

    Intended to hide spurious asyncio tracebacks printed at shutdown.
    NOTE(review): defined but not currently used anywhere in this script.
    """

    def __enter__(self):
        self._original_stderr = sys.stderr
        sys.stderr = open(os.devnull, "w")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stderr.close()
        sys.stderr = self._original_stderr


# ======================
# Load model
# ======================
MODEL_ID = "google/gemma-3-270m"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    device_map="cpu",
)
print("Model loaded successfully!")


# ======================
# Clean output
# ======================
def clean_output(text):
    """Collapse repeated phrases and keep only the first complete sentence.

    Small models tend to loop; the regex removes immediate repetitions of
    any pattern of 10+ characters before truncating to one sentence.
    """
    text = text.strip()
    # Remove repeated patterns (model looping).
    text = re.sub(r"(.{10,}?)\1+", r"\1", text)
    # Keep only the first complete sentence.
    sentences = re.split(r"[.!?]\s+", text)
    # FIX: guard against an empty first sentence — the original returned
    # a bare "." for empty/whitespace input because re.split always
    # yields at least one (possibly empty) element.
    if sentences and sentences[0]:
        first = sentences[0]
        return first if first.endswith((".", "!", "?")) else first + "."
    return text


# ======================
# Chat function
# ======================
def chat(message, history):
    """Generate a single-turn reply to *message*.

    *history* is accepted for Gradio-callback compatibility but is not fed
    to the model (each turn is generated independently).
    Returns a user-facing string; errors are reported as text, never raised.
    """
    if not message or not message.strip():
        return "Please enter a message."
    try:
        # FIX: build the prompt with the tokenizer's chat template so the
        # proper Gemma turn markers are used; fall back to the original
        # hand-built prompt if the tokenizer has no template.
        try:
            prompt = tokenizer.apply_chat_template(
                [{"role": "user", "content": message}],
                tokenize=False,
                add_generation_prompt=True,
            )
        except Exception:
            prompt = f"user\n{message}\n\nmodel\n"

        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=512
        )

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2,
            )

        # FIX: decode only the newly generated tokens. The original split
        # the full decode on the literal string "model", which returned the
        # wrong text whenever the user's message contained "model".
        prompt_len = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_len:]
        reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        reply = clean_output(reply)
        return reply if reply else "I couldn't generate a response. Please try again."
    except Exception as e:
        return f"Error generating response: {str(e)}"


# ======================
# UI with proper examples
# ======================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Gemma3 270M Cloud Chat")
    gr.Markdown("Gemma3 270M running on Hugging Face Spaces")

    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(
        label="Your message",
        placeholder="Type your message here...",
        lines=2,
    )

    with gr.Row():
        submit = gr.Button("Send", variant="primary")
        clear = gr.Button("Clear")

    gr.Markdown("### Try these examples:")
    with gr.Row():
        example1 = gr.Button("Hi, how are you?", size="sm")
        example2 = gr.Button("What is AI?", size="sm")
        example3 = gr.Button("Write hello world in Python", size="sm")

    # Chat interaction: append the (user, bot) pair and clear the textbox.
    def respond(message, chat_history):
        bot_message = chat(message, chat_history)
        chat_history.append((message, bot_message))
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    submit.click(respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

    # Example buttons only fill the textbox; they do not auto-submit.
    example1.click(lambda: "Hi, how are you?", None, msg)
    example2.click(lambda: "What is AI?", None, msg)
    example3.click(lambda: "Write hello world in Python", None, msg)


if __name__ == "__main__":
    import atexit

    def cleanup():
        """Best-effort: stop a running asyncio loop so the process exits cleanly."""
        try:
            import asyncio

            loop = asyncio.get_event_loop()
            if loop.is_running():
                loop.stop()
        # FIX: narrowed the bare `except:` — it also swallowed
        # SystemExit/KeyboardInterrupt; this remains deliberately
        # best-effort, so Exception is silenced.
        except Exception:
            pass

    atexit.register(cleanup)

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        quiet=True,  # Suppress Gradio startup messages
    )