# app.py
"""
Gemma3 (GGUF) - Gradio Space app (fallback-ready)
Updated: fix for Hugging Face InferenceClient.text_generation() signature
"""

import os
import time
import traceback

import gradio as gr

# -------------------------------------------------------------------------
# Try to import llama-cpp-python (native) — may fail in Spaces build
# -------------------------------------------------------------------------
LLAMA_AVAILABLE = False
llm = None
try:
    from llama_cpp import Llama
    LLAMA_AVAILABLE = True
except Exception as e:
    print("llama-cpp-python not available:", e)
    LLAMA_AVAILABLE = False

# -------------------------------------------------------------------------
# Try to import Hugging Face InferenceClient as fallback
# -------------------------------------------------------------------------
HF_AVAILABLE = False
hf_client = None
try:
    from huggingface_hub import InferenceClient
    # InferenceClient will pick HUGGINGFACE_HUB_TOKEN from env if set
    hf_client = InferenceClient()
    HF_AVAILABLE = True
except Exception as e:
    print("HF InferenceClient not available or not configured:", e)
    HF_AVAILABLE = False

# -------------------------------------------------------------------------
# Configuration (env vars)
# -------------------------------------------------------------------------
MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
GGUF_PATH = os.environ.get("GGUF_PATH", None)  # if the gguf is uploaded to the Space
HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "")  # optional override for HF inference model id
DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))

# -------------------------------------------------------------------------
# If llama-cpp available and a GGUF path is provided (or MODEL_REPO downloaded), load model
# -------------------------------------------------------------------------
if LLAMA_AVAILABLE:
    try:
        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
        if GGUF_PATH and os.path.exists(GGUF_PATH):
            model_path_to_try = GGUF_PATH
        elif os.path.exists(model_path_to_try):
            pass
        else:
            raise FileNotFoundError(f"No local .gguf found at GGUF_PATH or default ({model_path_to_try}). Set GGUF_PATH or upload the .gguf file into the Space.")

        print("Loading local model via llama-cpp-python from:", model_path_to_try)
        llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
        print("Loaded local model successfully.")
    except Exception as e:
        print("Failed to load local gguf with llama-cpp-python:", e)
        print(traceback.format_exc())
        llm = None
        LLAMA_AVAILABLE = False

# -------------------------------------------------------------------------
# Helper functions for inference
# -------------------------------------------------------------------------
def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    if not llm:
        return "Local model not loaded."
    try:
        resp = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
        return resp["choices"][0]["text"]
    except Exception as e:
        print("Error in local_generate:", e)
        return f"Local generation error: {e}"

def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    """
    Corrected HF usage:
    - Pass prompt as positional first arg to text_generation()
    - Use max_new_tokens (not max_tokens)
    - Optionally pass model=HF_INFERENCE_MODEL if set
    """
    if not HF_AVAILABLE or hf_client is None:
        return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK."

    try:
        kwargs = {
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature),
            # you can also set stream=True or details=True if desired
        }
        # include model override only if provided (avoid passing empty string)
        if HF_INFERENCE_MODEL:
            kwargs["model"] = HF_INFERENCE_MODEL

        # NOTE: text_generation expects the prompt as first positional arg.
        raw = hf_client.text_generation(prompt, **kwargs)

        # raw may be:
        #  - a simple string with generated text,
        #  - a TextGenerationOutput object (dataclass-like) or dict,
        #  - a list containing dict(s) depending on version/backends
        # Normalize to a string response:
        # case: simple str
        if isinstance(raw, str):
            return raw

        # case: list (e.g., [{"generated_text": "..."}])
        if isinstance(raw, list) and len(raw) > 0:
            first = raw[0]
            if isinstance(first, dict):
                # prefer keys commonly returned
                return first.get("generated_text") or first.get("text") or str(first)
            return str(first)

        # case: object with attribute generated_text or dict-like
        if hasattr(raw, "generated_text"):
            return getattr(raw, "generated_text")
        if isinstance(raw, dict):
            # try common keys
            return raw.get("generated_text") or raw.get("text") or str(raw)

        # fallback to string conversion
        return str(raw)

    except TypeError as te:
        # common mistake: wrong kw names (we tried to guard this), print helpful msg
        print("TypeError from hf_client.text_generation:", te)
        print(traceback.format_exc())
        return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
    except Exception as e:
        print("HF generation error:", e)
        print(traceback.format_exc())
        return f"Hugging Face generation error: {e}"

def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    prompt = (prompt or "").strip()
    if not prompt:
        return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
    # Prefer local if available
    if LLAMA_AVAILABLE and llm:
        return local_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    elif HF_AVAILABLE and hf_client:
        return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    else:
        return (
            "No model runtime is available.\n\n"
            "Options:\n"
            "1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n"
            "2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n"
        )

# -------------------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------------------
title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
description_text = """
**Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
"""

with gr.Blocks(title=title_text) as demo:
    gr.Markdown(f"# {title_text}")
    gr.Markdown(description_text)

    with gr.Row():
        with gr.Column(scale=3):
            prompt_input = gr.Textbox(lines=5, label="તમારો પ્રશ્ન / Prompt", placeholder="અહીં લખો... (Gujarati/English)")
            with gr.Row():
                max_tokens = gr.Slider(label="Max tokens", minimum=16, maximum=1024, step=16, value=DEFAULT_MAX_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, step=0.05, value=DEFAULT_TEMPERATURE)
            submit_btn = gr.Button("જવાબ આપો")
        with gr.Column(scale=2):
            status_md = gr.Markdown(
                f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
                f"- MODEL_REPO: `{MODEL_REPO}`\n"
                f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
            )
            tips = gr.Markdown("**Tips:** Reduce max tokens if you see OOM. Upload a smaller Q4 quantized GGUF for Spaces.")

    output_box = gr.Textbox(lines=10, label="જવાબ (Response)")
    submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])

if __name__ == "__main__":
    print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
    print("HF_AVAILABLE:", HF_AVAILABLE)
    print("MODEL_REPO:", MODEL_REPO)
    print("GGUF_PATH:", GGUF_PATH)
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))