import logging

import gradio as gr

logger = logging.getLogger(__name__)

# Hugging Face model repository to load at startup.
MODEL_NAME = "likhonhfai/mysterious-coding-model"


def load_model():
    """
    Attempt to lazily import transformers and torch and load the CodeAI model.

    Returns:
        tuple: (model, tokenizer) if loaded successfully, otherwise (None, None).
    """
    try:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        return model, tokenizer
    except Exception:
        # Deliberate best-effort fallback: the app still runs with canned
        # responses, but log the failure so it is diagnosable instead of
        # silently swallowing it.
        logger.exception("Failed to load model %s; using placeholder responses", MODEL_NAME)
        return None, None


# Load the model once at startup.
model, tokenizer = load_model()


def _build_prompt(message, history):
    """
    Flatten the chat history plus the new message into a single prompt string.

    Handles both Gradio history formats: legacy (user, assistant) tuple pairs
    and the newer openai-style {"role": ..., "content": ...} message dicts.
    """
    parts = []
    for turn in history:
        if isinstance(turn, dict):
            # Messages format (gr.ChatInterface with type="messages").
            role = "User" if turn.get("role") == "user" else "Assistant"
            parts.append(f"{role}: {turn.get('content', '')}\n")
        else:
            # Legacy tuple format: (user_msg, bot_msg).
            user_msg, bot_msg = turn
            parts.append(f"User: {user_msg}\nAssistant: {bot_msg}\n")
    parts.append(f"User: {message}\nAssistant:")
    return "".join(parts)


def _generate_reply(message, history):
    """
    Run the loaded model on the conversation and return the assistant's reply.

    Assumes ``model`` and ``tokenizer`` are non-None (checked by the caller).
    """
    import torch  # Safe to import since it was available during model loading.

    prompt = _build_prompt(message, history)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            inputs,
            max_new_tokens=256,
            # do_sample=True is required for temperature/top_p to take effect;
            # without it, generate() defaults to greedy decoding and ignores them.
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Keep only the text after the final "Assistant:" marker, if present.
    if "Assistant:" in output_text:
        return output_text.split("Assistant:")[-1].strip()
    return output_text.strip()


def _fallback_response(message):
    """Return a canned, keyword-matched reply used when the model is unavailable."""
    lower = message.lower()
    if "hello" in lower:
        return (
            "Hello! I'm a placeholder chatbot while the full CodeAI model loads. Ask me about long-context processing, "
            "multimodal understanding, or code generation."
        )
    if "code" in lower:
        return (
            "Our model excels at code generation, completion, bug fixing, refactoring and documentation. "
            "Try asking: 'write a python function to add two numbers'."
        )
    if "image" in lower:
        return "The CodeAI model supports image understanding tasks like visual question answering and image captioning."
    if "audio" in lower or "speech" in lower:
        return "Our model can process audio for speech recognition and audio understanding."
    if "thanks" in lower or "thank you" in lower:
        return "You're welcome! Let me know if you have more questions."
    return (
        "This is a demo placeholder response. The CodeAI model uses safetensors storage, supports 8-bit and mxfp4 "
        "mixed-precision variants, is compatible with the vLLM engine, and is trained using Hugging Face AutoTrain. "
        "It handles long contexts (up to 200,000 tokens) and performs text, image, audio, and multimodal reasoning tasks."
    )


def respond(message, history):
    """
    Generate a response using the loaded model, or a placeholder when it is unavailable.

    Args:
        message: The user's latest message.
        history: Prior conversation turns (tuple pairs or message dicts).

    Returns:
        str: The assistant's reply.
    """
    if model is not None and tokenizer is not None:
        return _generate_reply(message, history)
    return _fallback_response(message)


demo = gr.ChatInterface(
    fn=respond,
    title="Mysterious Coding Chatbot",
    description=(
        "Chat with our CodeAI model about coding, AI and more. The model supports long-context understanding, "
        "text, image and audio processing, and multimodal reasoning. If the full model can't load due to resource limits, "
        "the chatbot will provide informative placeholder responses."
    ),
)

if __name__ == "__main__":
    demo.launch()