# app.py """ Gemma3 (GGUF) - Gradio Space app (fallback-ready) Updated: fix for Hugging Face InferenceClient.text_generation() signature """ import os import time import traceback import gradio as gr # ------------------------------------------------------------------------- # Try to import llama-cpp-python (native) — may fail in Spaces build # ------------------------------------------------------------------------- LLAMA_AVAILABLE = False llm = None try: from llama_cpp import Llama LLAMA_AVAILABLE = True except Exception as e: print("llama-cpp-python not available:", e) LLAMA_AVAILABLE = False # ------------------------------------------------------------------------- # Try to import Hugging Face InferenceClient as fallback # ------------------------------------------------------------------------- HF_AVAILABLE = False hf_client = None try: from huggingface_hub import InferenceClient # InferenceClient will pick HUGGINGFACE_HUB_TOKEN from env if set hf_client = InferenceClient() HF_AVAILABLE = True except Exception as e: print("HF InferenceClient not available or not configured:", e) HF_AVAILABLE = False # ------------------------------------------------------------------------- # Configuration (env vars) # ------------------------------------------------------------------------- MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf") GGUF_PATH = os.environ.get("GGUF_PATH", None) # if the gguf is uploaded to the Space HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "") # optional override for HF inference model id DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256)) DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8)) # ------------------------------------------------------------------------- # If llama-cpp available and a GGUF path is provided (or MODEL_REPO downloaded), load model # ------------------------------------------------------------------------- if LLAMA_AVAILABLE: try: model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf") if GGUF_PATH and os.path.exists(GGUF_PATH): model_path_to_try = GGUF_PATH elif os.path.exists(model_path_to_try): pass else: raise FileNotFoundError(f"No local .gguf found at GGUF_PATH or default ({model_path_to_try}). Set GGUF_PATH or upload the .gguf file into the Space.") print("Loading local model via llama-cpp-python from:", model_path_to_try) llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2) print("Loaded local model successfully.") except Exception as e: print("Failed to load local gguf with llama-cpp-python:", e) print(traceback.format_exc()) llm = None LLAMA_AVAILABLE = False # ------------------------------------------------------------------------- # Helper functions for inference # ------------------------------------------------------------------------- def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE): if not llm: return "Local model not loaded." try: resp = llm.create_completion(prompt=prompt, max_tokens=max_tokens, temperature=temperature) return resp["choices"][0]["text"] except Exception as e: print("Error in local_generate:", e) return f"Local generation error: {e}" def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE): """ Corrected HF usage: - Pass prompt as positional first arg to text_generation() - Use max_new_tokens (not max_tokens) - Optionally pass model=HF_INFERENCE_MODEL if set """ if not HF_AVAILABLE or hf_client is None: return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK." try: kwargs = { "max_new_tokens": int(max_tokens), "temperature": float(temperature), # you can also set stream=True or details=True if desired } # include model override only if provided (avoid passing empty string) if HF_INFERENCE_MODEL: kwargs["model"] = HF_INFERENCE_MODEL # NOTE: text_generation expects the prompt as first positional arg. raw = hf_client.text_generation(prompt, **kwargs) # raw may be: # - a simple string with generated text, # - a TextGenerationOutput object (dataclass-like) or dict, # - a list containing dict(s) depending on version/backends # Normalize to a string response: # case: simple str if isinstance(raw, str): return raw # case: list (e.g., [{"generated_text": "..."}]) if isinstance(raw, list) and len(raw) > 0: first = raw[0] if isinstance(first, dict): # prefer keys commonly returned return first.get("generated_text") or first.get("text") or str(first) return str(first) # case: object with attribute generated_text or dict-like if hasattr(raw, "generated_text"): return getattr(raw, "generated_text") if isinstance(raw, dict): # try common keys return raw.get("generated_text") or raw.get("text") or str(raw) # fallback to string conversion return str(raw) except TypeError as te: # common mistake: wrong kw names (we tried to guard this), print helpful msg print("TypeError from hf_client.text_generation:", te) print(traceback.format_exc()) return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)" except Exception as e: print("HF generation error:", e) print(traceback.format_exc()) return f"Hugging Face generation error: {e}" def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE): prompt = (prompt or "").strip() if not prompt: return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)." # Prefer local if available if LLAMA_AVAILABLE and llm: return local_generate(prompt, max_tokens=max_tokens, temperature=temperature) elif HF_AVAILABLE and hf_client: return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature) else: return ( "No model runtime is available.\n\n" "Options:\n" "1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n" "2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n" ) # ------------------------------------------------------------------------- # Gradio UI # ------------------------------------------------------------------------- title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)" description_text = """ **Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API. """ with gr.Blocks(title=title_text) as demo: gr.Markdown(f"# {title_text}") gr.Markdown(description_text) with gr.Row(): with gr.Column(scale=3): prompt_input = gr.Textbox(lines=5, label="તમારો પ્રશ્ન / Prompt", placeholder="અહીં લખો... (Gujarati/English)") with gr.Row(): max_tokens = gr.Slider(label="Max tokens", minimum=16, maximum=1024, step=16, value=DEFAULT_MAX_TOKENS) temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, step=0.05, value=DEFAULT_TEMPERATURE) submit_btn = gr.Button("જવાબ આપો") with gr.Column(scale=2): status_md = gr.Markdown( f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n" f"- MODEL_REPO: `{MODEL_REPO}`\n" f"- HF model (inference): `{HF_INFERENCE_MODEL or ''}`\n" ) tips = gr.Markdown("**Tips:** Reduce max tokens if you see OOM. Upload a smaller Q4 quantized GGUF for Spaces.") output_box = gr.Textbox(lines=10, label="જવાબ (Response)") submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box]) if __name__ == "__main__": print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE) print("HF_AVAILABLE:", HF_AVAILABLE) print("MODEL_REPO:", MODEL_REPO) print("GGUF_PATH:", GGUF_PATH) demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))