Spaces:

manthilaffs
/

Gamunu-Inference

Sleeping

App Files Files Community

manthilaffs committed on Nov 2, 2025

Commit

9b9ba13

verified ·

1 Parent(s): 726fa01

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -28

app.py CHANGED Viewed

@@ -1,15 +1,14 @@
 import gradio as gr
 import torch
 import spaces
-from transformers import AutoTokenizer
 # ----------------------------------------------------
-# Globals (lazy-loaded later)
 # ----------------------------------------------------
 model = None
 tokenizer = None
-# Sinhala Alpaca-style prompt template
 alpaca_prompt = """පහත දැක්වෙන්නේ යම් කාර්යයක් පිළිබඳ විස්තර කරන උපදෙසක් සහ එයට අදාළ තොරතුරු ඇතුළත් ආදානයකි. ඉල්ලූ කාර්යය නිවැරදිව සම්පූර්ණ කළ හැකි ප්‍රතිචාරයක් සපයන්න.
 ### උපදෙස:
@@ -22,23 +21,23 @@ alpaca_prompt = """පහත දැක්වෙන්නේ යම් කාර
 {}"""
 # ----------------------------------------------------
-# GPU inference — executed only when ZeroGPU allocates GPU
 # ----------------------------------------------------
 @spaces.GPU
 def infer(instruction, input_text=""):
-    """Run Gamunu inference on GPU (ZeroGPU burst)."""
     global model, tokenizer
-    # ✅ Lazy import Unsloth *after* GPU is available
-    from unsloth import FastLanguageModel
     if model is None:
-        model, tokenizer = FastLanguageModel.from_pretrained(
             "manthilaffs/Gamunu-4B-Instruct-Alpha"
         )
-        FastLanguageModel.for_inference(model)
-    # Build Alpaca-style prompt
     prompt = alpaca_prompt.format(
         "ඔබ ගැමුණු නම් AI සහායකයායි. ඔබව නිර්මාණය කර ඇත්තේ මන්තිල විසිනි. "
         "ඔබේ කාර්යය වන්නේ පරිශීලකයන්ගේ ප්‍රශ්නවලට නිවැරදිව පිළිතුරු සපයමින් ඔවුන්ට සහය වීමයි.",
@@ -46,32 +45,26 @@ def infer(instruction, input_text=""):
         input_text.strip(),
     )
-    inputs = tokenizer(text=prompt, return_tensors="pt").to(model.device)
-    # ✅ Disable TorchDynamo (fix Gemma3 compile bug on Torch 2.8)
-    import torch._dynamo
-    torch._dynamo.config.suppress_errors = True
-    torch._dynamo.disable()
-    with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=256,
             temperature=0.4,
             top_k=64,
             top_p=0.95,
-            min_p=0.75,
         )
     text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     if "### ප්‍රතිචාරය:" in text:
         text = text.split("### ප්‍රතිචාරය:")[-1].strip()
     return text
 # ----------------------------------------------------
-# Gradio Interface — ZeroGPU ready
 # ----------------------------------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
@@ -79,8 +72,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         # 🧠 Gamunu 4B Instruct Alpha
         *Sinhala Instruct LLM — ZeroGPU Demo*
-        ⚙️ Built with Unsloth FastLanguageModel
-        💠 Temporary GPU acceleration via `@spaces.GPU`
         """
     )
@@ -94,16 +86,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         input_text = gr.Textbox(
             label="📥 Additional Context (Optional)",
-            placeholder="අමතර තොරතුරු ඇතුළත් කරන්න (උදා: කාල පරිච්ඡේදය හෝ තත්ත්වය)",
             lines=2,
         )
     output = gr.Markdown(label="🧩 Gamunu Response")
     run_btn = gr.Button("🔮 Generate Response")
     run_btn.click(infer, inputs=[instruction, input_text], outputs=output)
-    # 🪄 Example questions for visitors
     gr.Examples(
         examples=[
             ["ඉන්දියානු මහා සමුද්‍රය යනු කොහෙද?", ""],
@@ -118,8 +108,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         """
         ---
         🪶 **Model:** `manthilaffs/Gamunu-4B-Instruct-Alpha`
-        🧰 **Stack:** Unsloth + Transformers + Gradio + ZeroGPU
-        © 2025 Gamunu Project | Experimental Research Release
         """
     )

 import gradio as gr
 import torch
 import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer
 # ----------------------------------------------------
+# Globals
 # ----------------------------------------------------
 model = None
 tokenizer = None
 alpaca_prompt = """පහත දැක්වෙන්නේ යම් කාර්යයක් පිළිබඳ විස්තර කරන උපදෙසක් සහ එයට අදාළ තොරතුරු ඇතුළත් ආදානයකි. ඉල්ලූ කාර්යය නිවැරදිව සම්පූර්ණ කළ හැකි ප්‍රතිචාරයක් සපයන්න.
 ### උපදෙස:
 {}"""
 # ----------------------------------------------------
+# GPU inference — official ZeroGPU style
 # ----------------------------------------------------
 @spaces.GPU
 def infer(instruction, input_text=""):
     global model, tokenizer
     if model is None:
+        tokenizer = AutoTokenizer.from_pretrained(
             "manthilaffs/Gamunu-4B-Instruct-Alpha"
         )
+        model = AutoModelForCausalLM.from_pretrained(
+            "manthilaffs/Gamunu-4B-Instruct-Alpha",
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto",
+        )
+        model.eval()
     prompt = alpaca_prompt.format(
         "ඔබ ගැමුණු නම් AI සහායකයායි. ඔබව නිර්මාණය කර ඇත්තේ මන්තිල විසිනි. "
         "ඔබේ කාර්යය වන්නේ පරිශීලකයන්ගේ ප්‍රශ්නවලට නිවැරදිව පිළිතුරු සපයමින් ඔවුන්ට සහය වීමයි.",
         input_text.strip(),
     )
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.inference_mode():
         outputs = model.generate(
             **inputs,
             max_new_tokens=256,
             temperature=0.4,
             top_k=64,
             top_p=0.95,
+            repetition_penalty=1.05,
         )
     text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     if "### ප්‍රතිචාරය:" in text:
         text = text.split("### ප්‍රතිචාරය:")[-1].strip()
     return text
 # ----------------------------------------------------
+# Gradio UI
 # ----------------------------------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         # 🧠 Gamunu 4B Instruct Alpha
         *Sinhala Instruct LLM — ZeroGPU Demo*
+        ⚙️ Pure Transformers Inference | 💠 ZeroGPU GPU Burst
         """
     )
     with gr.Row():
         input_text = gr.Textbox(
             label="📥 Additional Context (Optional)",
+            placeholder="අමතර තොරතුරු (ඇත්නම්) එහි සටහන් කරන්න",
             lines=2,
         )
     output = gr.Markdown(label="🧩 Gamunu Response")
     run_btn = gr.Button("🔮 Generate Response")
     run_btn.click(infer, inputs=[instruction, input_text], outputs=output)
     gr.Examples(
         examples=[
             ["ඉන්දියානු මහා සමුද්‍රය යනු කොහෙද?", ""],
         """
         ---
         🪶 **Model:** `manthilaffs/Gamunu-4B-Instruct-Alpha`
+        🧰 **Stack:** Transformers + Torch + Gradio + ZeroGPU
+        © 2025 Gamunu Project | Experimental Release
         """
     )