jeyanthangj2004 committed
Commit f130a02 · verified · 1 Parent(s): 2c20d66

Update app.py

Files changed (1)
app.py +51 -37
app.py CHANGED
@@ -1,57 +1,71 @@
+import torch
 import gradio as gr
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# =========================
-# MODEL AUTO-DOWNLOAD
-# =========================
-MODEL_PATH = hf_hub_download(
-    repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
-    filename="qwen2.5-1.5b-instruct-q4_k_m.gguf"
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+# -----------------------------
+# Load tokenizer
+# -----------------------------
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_ID,
+    use_fast=True
 )
 
-# =========================
-# LOAD MODEL (CPU OPTIMIZED)
-# =========================
-llm = Llama(
-    model_path=MODEL_PATH,
-    n_ctx=4096,      # RAG-friendly
-    n_threads=2,     # HF free CPU
-    n_batch=256,
-    verbose=False
+# -----------------------------
+# Load model (CPU, non-quantized)
+# -----------------------------
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float32,
+    device_map="cpu"
 )
 
-# =========================
-# GENERATION FUNCTION
-# =========================
+model.eval()
+
+# -----------------------------
+# Generation function
+# -----------------------------
 def generate(
     prompt,
-    max_new_tokens=1024,
-    temperature=0.2,
+    max_new_tokens=512,
+    temperature=0.7,
     top_p=0.9
 ):
-    output = llm(
+    inputs = tokenizer(
         prompt,
-        max_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        stop=["<|endoftext|>"]
+        return_tensors="pt",
+        truncation=True,
+        max_length=2048
+    )
+
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True
+        )
+
+    return tokenizer.decode(
+        outputs[0],
+        skip_special_tokens=True
     )
-    return output["choices"][0]["text"]
 
-# =========================
-# GRADIO INTERFACE (API ENABLED)
-# =========================
+# -----------------------------
+# Gradio Interface (API enabled)
+# -----------------------------
 demo = gr.Interface(
     fn=generate,
     inputs=[
-        gr.Textbox(label="Prompt / Context", lines=10),
-        gr.Slider(256, 2048, value=1024, step=128, label="Max New Tokens"),
-        gr.Slider(0.1, 1.0, value=0.2, step=0.05, label="Temperature"),
+        gr.Textbox(label="Prompt", lines=6),
+        gr.Slider(64, 1024, value=512, step=64, label="Max New Tokens"),
+        gr.Slider(0.1, 1.0, value=0.7, step=0.05, label="Temperature"),
         gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
     ],
-    outputs=gr.Textbox(label="Response", lines=15),
-    title="Qwen2.5-1.5B-Instruct (GGUF Q4 • FAST CPU • No Rate Limits)",
+    outputs=gr.Textbox(label="Response", lines=10),
+    title="TinyLlama-1.1B-Chat (Non-Quantized, CPU)"
 )
 
 demo.launch()
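
Since the updated gr.Interface keeps the Gradio API enabled, the Space can still be queried programmatically. Below is a minimal client-side sketch (not part of the commit); the Space id is a hypothetical placeholder, the argument order follows the inputs list in the diff, and gr.Interface exposes its endpoint as "/predict" by default:

from gradio_client import Client

client = Client("jeyanthangj2004/my-space")  # hypothetical Space id

result = client.predict(
    "Explain what a tokenizer does.",  # Prompt
    512,   # Max New Tokens
    0.7,   # Temperature
    0.9,   # Top-p
    api_name="/predict",
)
print(result)

Note that the new generate() decodes outputs[0] in full, so the returned text contains the prompt followed by the completion; a caller that wants only the completion would need to strip the prompt, e.g. by decoding only the tokens after inputs["input_ids"].shape[1].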