# app.py
"""
Gemma3 (GGUF) - Gradio Space app (fallback-ready)
Updated: fix for Hugging Face InferenceClient.text_generation() signature
"""
import os
import time
import traceback
import gradio as gr
# -------------------------------------------------------------------------
# Try to import llama-cpp-python (native) — may fail in Spaces build
# -------------------------------------------------------------------------
LLAMA_AVAILABLE = False
llm = None
try:
    from llama_cpp import Llama
    LLAMA_AVAILABLE = True
except Exception as e:
    print("llama-cpp-python not available:", e)
    LLAMA_AVAILABLE = False
# -------------------------------------------------------------------------
# Try to import Hugging Face InferenceClient as fallback
# -------------------------------------------------------------------------
HF_AVAILABLE = False
hf_client = None
try:
    from huggingface_hub import InferenceClient
    # InferenceClient picks up HF_TOKEN (or the legacy HUGGING_FACE_HUB_TOKEN) from the env if set
    hf_client = InferenceClient()
    HF_AVAILABLE = True
except Exception as e:
    print("HF InferenceClient not available or not configured:", e)
    HF_AVAILABLE = False
# -------------------------------------------------------------------------
# Configuration (env vars)
# -------------------------------------------------------------------------
MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
GGUF_PATH = os.environ.get("GGUF_PATH", None) # if the gguf is uploaded to the Space
HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "") # optional override for HF inference model id
DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))
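# -------------------------------------------------------------------------
# Optional helper: fetch the GGUF from MODEL_REPO when no local copy exists.
# A minimal sketch, assuming the repo hosts a .gguf file whose name is given
# via GGUF_FILENAME (a hypothetical env var, not used elsewhere in this app);
# hf_hub_download returns the local cache path of the downloaded file. Not
# wired into the loader below.
# -------------------------------------------------------------------------
def try_download_gguf():
    try:
        from huggingface_hub import hf_hub_download
        filename = os.environ.get("GGUF_FILENAME", "gemma-3-4b-it-q4_0.gguf")  # assumed filename
        return hf_hub_download(repo_id=MODEL_REPO, filename=filename)
    except Exception as e:
        print("Could not download GGUF from", MODEL_REPO, ":", e)
        return None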
# -------------------------------------------------------------------------
# If llama-cpp available and a GGUF path is provided (or MODEL_REPO downloaded), load model
# -------------------------------------------------------------------------
if LLAMA_AVAILABLE:
    try:
        # Prefer an explicit GGUF_PATH; otherwise fall back to the default location.
        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
        if not os.path.exists(model_path_to_try):
            raise FileNotFoundError(
                f"No local .gguf found at GGUF_PATH or default ({model_path_to_try}). "
                "Set GGUF_PATH or upload the .gguf file into the Space."
            )
        print("Loading local model via llama-cpp-python from:", model_path_to_try)
        llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
        print("Loaded local model successfully.")
    except Exception as e:
        print("Failed to load local gguf with llama-cpp-python:", e)
        print(traceback.format_exc())
        llm = None
        LLAMA_AVAILABLE = False
# -------------------------------------------------------------------------
# Helper functions for inference
# -------------------------------------------------------------------------
def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    if not llm:
        return "Local model not loaded."
    try:
        # Cast explicitly: Gradio sliders may deliver floats.
        resp = llm.create_completion(prompt=prompt, max_tokens=int(max_tokens), temperature=float(temperature))
        return resp["choices"][0]["text"]
    except Exception as e:
        print("Error in local_generate:", e)
        return f"Local generation error: {e}"
def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    """
    Corrected HF usage:
    - Pass the prompt as the first positional arg to text_generation()
    - Use max_new_tokens (not max_tokens)
    - Optionally pass model=HF_INFERENCE_MODEL if set
    """
    if not HF_AVAILABLE or hf_client is None:
        return "Hugging Face Inference client not available. Set HF_TOKEN or enable the HF SDK."
    try:
        kwargs = {
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature),
            # stream=True or details=True can also be set here (see hf_generate_stream below)
        }
        # Include the model override only if provided (avoid passing an empty string).
        if HF_INFERENCE_MODEL:
            kwargs["model"] = HF_INFERENCE_MODEL
        # NOTE: text_generation expects the prompt as the first positional arg.
        raw = hf_client.text_generation(prompt, **kwargs)
        # Depending on the huggingface_hub version and backend, `raw` may be:
        #   - a plain string with the generated text,
        #   - a TextGenerationOutput object (dataclass-like) or a dict,
        #   - a list containing dict(s).
        # Normalize all of these to a string response.
        if isinstance(raw, str):
            return raw
        # case: list, e.g. [{"generated_text": "..."}]
        if isinstance(raw, list) and len(raw) > 0:
            first = raw[0]
            if isinstance(first, dict):
                # Prefer the keys commonly returned by the API.
                return first.get("generated_text") or first.get("text") or str(first)
            return str(first)
        # case: object with a generated_text attribute
        if hasattr(raw, "generated_text"):
            return raw.generated_text
        # case: dict-like
        if isinstance(raw, dict):
            return raw.get("generated_text") or raw.get("text") or str(raw)
        # Fallback: string conversion.
        return str(raw)
    except TypeError as te:
        # Common mistake: wrong keyword names (guarded above); print a helpful message.
        print("TypeError from hf_client.text_generation:", te)
        print(traceback.format_exc())
        return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
    except Exception as e:
        print("HF generation error:", e)
        print(traceback.format_exc())
        return f"Hugging Face generation error: {e}"
def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    prompt = (prompt or "").strip()
    if not prompt:
        return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
    # Prefer the local model if it is available.
    if LLAMA_AVAILABLE and llm:
        return local_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    elif HF_AVAILABLE and hf_client:
        return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    else:
        return (
            "No model runtime is available.\n\n"
            "Options:\n"
            "1) Upload a .gguf file into the Space and set the GGUF_PATH environment variable to its path,\n"
            "2) Set HF_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use the HF Inference API.\n"
        )
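# -------------------------------------------------------------------------
# Optional helper: wrap a raw prompt in Gemma's instruction-tuned turn
# markers. A minimal sketch, assuming the standard Gemma chat format
# (<start_of_turn>user ... <end_of_turn><start_of_turn>model); generate()
# above sends the raw prompt unchanged, so this is not wired in.
# -------------------------------------------------------------------------
def format_gemma_prompt(user_prompt: str) -> str:
    return f"<start_of_turn>user\n{user_prompt}<end_of_turn>\n<start_of_turn>model\n"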
# -------------------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------------------
title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
description_text = """
**Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
"""
with gr.Blocks(title=title_text) as demo:
    gr.Markdown(f"# {title_text}")
    gr.Markdown(description_text)
    with gr.Row():
        with gr.Column(scale=3):
            prompt_input = gr.Textbox(lines=5, label="તમારો પ્રશ્ન / Prompt", placeholder="અહીં લખો... / Type here... (Gujarati/English)")
            with gr.Row():
                max_tokens = gr.Slider(label="Max tokens", minimum=16, maximum=1024, step=16, value=DEFAULT_MAX_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, step=0.05, value=DEFAULT_TEMPERATURE)
            submit_btn = gr.Button("જવાબ આપો / Generate")
        with gr.Column(scale=2):
            status_md = gr.Markdown(
                f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
                f"- MODEL_REPO: `{MODEL_REPO}`\n"
                f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
            )
            tips = gr.Markdown("**Tips:** Reduce max tokens if you see OOM errors. Upload a smaller Q4-quantized GGUF for Spaces.")
    output_box = gr.Textbox(lines=10, label="જવાબ (Response)")
    submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])
if __name__ == "__main__":
    print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
    print("HF_AVAILABLE:", HF_AVAILABLE)
    print("MODEL_REPO:", MODEL_REPO)
    print("GGUF_PATH:", GGUF_PATH)
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))