# app.py
"""
Gemma3 (GGUF) - Gradio Space app (fallback-ready)
Updated: fix for Hugging Face InferenceClient.text_generation() signature
"""

import os
import time
import traceback

import gradio as gr

# -------------------------------------------------------------------------
# Try to import llama-cpp-python (native) — may fail in Spaces build
# -------------------------------------------------------------------------
LLAMA_AVAILABLE = False
llm = None
try:
    from llama_cpp import Llama
    LLAMA_AVAILABLE = True
except Exception as e:
    print("llama-cpp-python not available:", e)
    LLAMA_AVAILABLE = False

# -------------------------------------------------------------------------
# Try to import Hugging Face InferenceClient as fallback
# -------------------------------------------------------------------------
HF_AVAILABLE = False
hf_client = None
try:
    from huggingface_hub import InferenceClient
    # InferenceClient picks up the HF_TOKEN env var (e.g. a Space secret) if set
    hf_client = InferenceClient()
    HF_AVAILABLE = True
except Exception as e:
    print("HF InferenceClient not available or not configured:", e)
    HF_AVAILABLE = False

# -------------------------------------------------------------------------
# Configuration (env vars)
# -------------------------------------------------------------------------
MODEL_REPO = os.environ.get("MODEL_REPO", "google/gemma-3-4b-it-qat-q4_0-gguf")
GGUF_PATH = os.environ.get("GGUF_PATH", None)  # if the gguf is uploaded to the Space
HF_INFERENCE_MODEL = os.environ.get("HF_INFERENCE_MODEL", "")  # optional override for HF inference model id
DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", 256))
DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", 0.8))
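
# -------------------------------------------------------------------------
# Optional helper: download the GGUF from MODEL_REPO when no local file exists.
# A minimal sketch: GGUF_FILENAME is an assumption (check the actual file name
# in the MODEL_REPO listing), and gated repos require a valid token.
# -------------------------------------------------------------------------
GGUF_FILENAME = os.environ.get("GGUF_FILENAME", "gemma-3-4b-it-q4_0.gguf")  # assumed file name

def try_download_gguf():
    """Best-effort download of the GGUF from MODEL_REPO; returns a path or None."""
    try:
        from huggingface_hub import hf_hub_download
        return hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILENAME)
    except Exception as e:
        print(f"Could not download {GGUF_FILENAME} from {MODEL_REPO}:", e)
        return None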

# -------------------------------------------------------------------------
# If llama-cpp is available, load a model from GGUF_PATH, the default local path, or a download from MODEL_REPO
# -------------------------------------------------------------------------
if LLAMA_AVAILABLE:
    try:
        # Prefer an explicit GGUF_PATH, then the default local path, then a
        # best-effort download from MODEL_REPO (see try_download_gguf above).
        model_path_to_try = GGUF_PATH or os.path.join("/workspace", "model.gguf")
        if not os.path.exists(model_path_to_try):
            downloaded = try_download_gguf()
            if downloaded:
                model_path_to_try = downloaded
            else:
                raise FileNotFoundError(f"No local .gguf found at GGUF_PATH or default ({model_path_to_try}), and download from {MODEL_REPO} failed. Set GGUF_PATH or upload the .gguf file into the Space.")

        print("Loading local model via llama-cpp-python from:", model_path_to_try)
        llm = Llama(model_path=model_path_to_try, n_ctx=2048, n_threads=2)
        print("Loaded local model successfully.")
    except Exception as e:
        print("Failed to load local gguf with llama-cpp-python:", e)
        print(traceback.format_exc())
        llm = None
        LLAMA_AVAILABLE = False

# -------------------------------------------------------------------------
# Helper functions for inference
# -------------------------------------------------------------------------
def local_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    if not llm:
        return "Local model not loaded."
    try:
        # Gradio sliders deliver floats, so coerce max_tokens to int here.
        resp = llm.create_completion(prompt=prompt, max_tokens=int(max_tokens), temperature=float(temperature))
        return resp["choices"][0]["text"]
    except Exception as e:
        print("Error in local_generate:", e)
        return f"Local generation error: {e}"

def hf_generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    """
    Corrected HF usage:
    - Pass prompt as positional first arg to text_generation()
    - Use max_new_tokens (not max_tokens)
    - Optionally pass model=HF_INFERENCE_MODEL if set
    """
    if not HF_AVAILABLE or hf_client is None:
        return "Hugging Face Inference client not available. Set HUGGINGFACE_HUB_TOKEN or enable HF SDK."

    try:
        kwargs = {
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature),
            # stream=True or details=True are also supported; see hf_generate_stream below
        }
        # include model override only if provided (avoid passing empty string)
        if HF_INFERENCE_MODEL:
            kwargs["model"] = HF_INFERENCE_MODEL

        # NOTE: text_generation expects the prompt as first positional arg.
        raw = hf_client.text_generation(prompt, **kwargs)

        # raw may be:
        #  - a simple string with generated text,
        #  - a TextGenerationOutput object (dataclass-like) or dict,
        #  - a list containing dict(s) depending on version/backends
        # Normalize to a string response:
        # case: simple str
        if isinstance(raw, str):
            return raw

        # case: list (e.g., [{"generated_text": "..."}])
        if isinstance(raw, list) and len(raw) > 0:
            first = raw[0]
            if isinstance(first, dict):
                # prefer keys commonly returned
                return first.get("generated_text") or first.get("text") or str(first)
            return str(first)

        # case: object with attribute generated_text or dict-like
        if hasattr(raw, "generated_text"):
            return getattr(raw, "generated_text")
        if isinstance(raw, dict):
            # try common keys
            return raw.get("generated_text") or raw.get("text") or str(raw)

        # fallback to string conversion
        return str(raw)

    except TypeError as te:
        # usually caused by parameter names that differ across huggingface_hub versions
        print("TypeError from hf_client.text_generation:", te)
        print(traceback.format_exc())
        return f"Hugging Face generation TypeError: {te}. (Check huggingface_hub version & parameter names.)"
    except Exception as e:
        print("HF generation error:", e)
        print(traceback.format_exc())
        return f"Hugging Face generation error: {e}"

def generate(prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE):
    prompt = (prompt or "").strip()
    if not prompt:
        return "કૃપયા પ્રશ્ન લખો (Please provide a prompt)."
    # Prefer local if available
    if LLAMA_AVAILABLE and llm:
        return local_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    elif HF_AVAILABLE and hf_client:
        return hf_generate(prompt, max_tokens=max_tokens, temperature=temperature)
    else:
        return (
            "No model runtime is available.\n\n"
            "Options:\n"
            "1) Upload a .gguf file into the Space and set GGUF_PATH environment variable to its path,\n"
            "2) Set HUGGINGFACE_HUB_TOKEN (secret) and HF_INFERENCE_MODEL to a hosted model id to use HF Inference API.\n"
        )

# -------------------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------------------
title_text = "💎 Gemma3 — Desi Chatbot (GGUF / HF fallback)"
description_text = """
**Gemma3 (quantized GGUF)** — Local inference if available, otherwise fallback to Hugging Face Inference API.
"""

with gr.Blocks(title=title_text) as demo:
    gr.Markdown(f"# {title_text}")
    gr.Markdown(description_text)

    with gr.Row():
        with gr.Column(scale=3):
            prompt_input = gr.Textbox(lines=5, label="તમારો પ્રશ્ન / Prompt", placeholder="અહીં લખો... (Gujarati/English)")
            with gr.Row():
                max_tokens = gr.Slider(label="Max tokens", minimum=16, maximum=1024, step=16, value=DEFAULT_MAX_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, step=0.05, value=DEFAULT_TEMPERATURE)
            submit_btn = gr.Button("જવાબ આપો / Answer")
        with gr.Column(scale=2):
            status_md = gr.Markdown(
                f"**Runtime:** {'local llama-cpp' if (LLAMA_AVAILABLE and llm) else ('HuggingFace Inference' if HF_AVAILABLE else 'No runtime available')}\n\n"
                f"- MODEL_REPO: `{MODEL_REPO}`\n"
                f"- HF model (inference): `{HF_INFERENCE_MODEL or '<not set>'}`\n"
            )
            tips = gr.Markdown("**Tips:** Reduce max tokens if you see OOM. Upload a smaller Q4 quantized GGUF for Spaces.")

    output_box = gr.Textbox(lines=10, label="જવાબ (Response)")
    submit_btn.click(fn=generate, inputs=[prompt_input, max_tokens, temperature], outputs=[output_box])

if __name__ == "__main__":
    print("LLAMA_AVAILABLE:", LLAMA_AVAILABLE)
    print("HF_AVAILABLE:", HF_AVAILABLE)
    print("MODEL_REPO:", MODEL_REPO)
    print("GGUF_PATH:", GGUF_PATH)
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))