Yash030 committed
Commit cb45af6 · Parent: de0862d

Clean ZeroGPU Deploy

Files changed (4)
  1. .gitattributes +1 -1
  2. README.md +9 -8
  3. app.py +49 -40
  4. requirements.txt +4 -1
.gitattributes CHANGED
@@ -35,4 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  *.gguf filter=lfs diff=lfs merge=lfs -text
  Qwen_Base_Model_1.7b_GGUF/Qwen3-1.7B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
- Qwen_Base_Model_1.7b_GGUF/Qwen3-1.7B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen_Base_Model_1.7b_GGUF/Qwen3-1.7B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,13 @@
- ---
- title: Qwen Base Model 1.7b GGUF
- emoji: 🦀
- colorFrom: purple
- colorTo: pink
- sdk: docker
- sdk_version: 6.2.0
+ title: Llama 3.2 1B Chat
+ emoji: 🦙
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 5.0.0
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Llama-3.2-1B Chat (ZeroGPU)
+
+ This Space runs Llama-3.2-1B-Instruct using Hugging Face ZeroGPU for fast inference.
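The new README describes the Space as running on ZeroGPU. For context (not part of this commit), a minimal sketch of the ZeroGPU pattern that app.py below relies on, assuming the Space runs on ZeroGPU hardware with the `spaces` package available; the function name is illustrative:

import spaces
import torch

@spaces.GPU  # ZeroGPU attaches a GPU only for the duration of each decorated call
def check_gpu(_prompt: str) -> str:
    # Illustrative helper, not from the commit: confirms CUDA is visible inside the call
    return f"CUDA available: {torch.cuda.is_available()}"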
app.py CHANGED
@@ -1,63 +1,72 @@
  import gradio as gr
- from llama_cpp import Llama
- from huggingface_hub import hf_hub_download
+ import spaces
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from threading import Thread
+ import torch
+ import os

- # Configuration: Llama-3.2-1B (Fast, Smart, Supported)
- REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
- FILENAME = "Llama-3.2-1B-Instruct-Q8_0.gguf"
+ # Llama 3.2 1B (Requires HF_TOKEN in Space Settings)
+ MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
+
+ print(f"Loading {MODEL_ID}...")
+
+ # Check for token (optional but helpful warning)
+ if not os.environ.get("HF_TOKEN"):
+     print("WARNING: HF_TOKEN not found. Llama 3.2 is a gated model. This might fail 401.")

- print(f"Downloading {FILENAME} from {REPO_ID}...")
  try:
-     model_path = hf_hub_download(
-         repo_id=REPO_ID,
-         filename=FILENAME
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         torch_dtype=torch.float16,
+         device_map="auto"
      )
+     print("Model loaded successfully.")
  except Exception as e:
-     print(f"Error downloading {FILENAME}: {e}")
-     # Fallback to Q4_K_M (smaller, faster)
-     print("Trying fallback to Q4_K_M...")
-     FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
-     model_path = hf_hub_download(
-         repo_id=REPO_ID,
-         filename=FILENAME
-     )
-
- print(f"Loading model from {model_path}...")
- llm = Llama(
-     model_path=model_path,
-     n_ctx=4096,
-     n_threads=2,
-     chat_format="llama-3"
- )
+     print(f"Error loading model: {e}")
+     print("Did you accept the license at https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct and set HF_TOKEN?")
+     raise e

+ @spaces.GPU
  def predict(message, history):
      messages = []
      for human_msg, ai_msg in history:
          messages.append({"role": "user", "content": human_msg})
          messages.append({"role": "assistant", "content": ai_msg})
-
      messages.append({"role": "user", "content": message})
-
-     response = llm.create_chat_completion(
-         messages=messages,
-         stream=True,
-         max_tokens=512,
+
+     # Llama 3.2 uses standard chat template
+     text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=512,
+         do_sample=True,
          temperature=0.7,
-         top_p=0.95
+         top_p=0.9
      )
+
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()

      partial_message = ""
-     for chunk in response:
-         delta = chunk['choices'][0]['delta']
-         if 'content' in delta:
-             partial_message += delta['content']
-             yield partial_message
+     for new_token in streamer:
+         partial_message += new_token
+         yield partial_message

  demo = gr.ChatInterface(
      fn=predict,
-     title="Llama 3.2 1B Chat",
-     description=f"Chat with Llama-3.2-1B (GGUF). Fast and smart. Model: {FILENAME}",
-     examples=["Hello, how are you?", "Write a Python script.", "Explain quantum computing."],
+     title="Llama 3.2 1B (ZeroGPU)",
+     description="Running on standard Hugging Face GPU hardware.",
+     examples=["Hello!", "Explain quantum physics.", "Write code for snake game."],
  )

  if __name__ == "__main__":
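The new predict() above streams tokens by running model.generate in a background thread and consuming a TextIteratorStreamer. A self-contained sketch of that pattern, using a tiny ungated placeholder model so it runs without an HF_TOKEN (the model id and helper name are assumptions for illustration; the commit itself uses the gated meta-llama/Llama-3.2-1B-Instruct):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_ID = "sshleifer/tiny-gpt2"  # placeholder model for illustration, not the one in this commit
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

def stream_reply(prompt: str):
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    # skip_prompt=True so only newly generated text is yielded, not the prompt itself
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # model.generate blocks, so it runs in a worker thread while this function drains the streamer
    thread = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=32))
    thread.start()
    partial = ""
    for token_text in streamer:
        partial += token_text
        yield partial
    thread.join()

for text in stream_reply("Hello"):
    print(text)

The diff follows the same structure: build generate_kwargs from the tokenized inputs, start the thread, then yield the growing partial_message so gr.ChatInterface can render it incrementally.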
requirements.txt CHANGED
@@ -1,2 +1,5 @@
  gradio
- huggingface-hub
+ spaces
+ torch
+ transformers
+ accelerate
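A side note on the requirements change: huggingface-hub is dropped from requirements.txt, but it remains installed transitively as a dependency of transformers, which is how the HF_TOKEN secret referenced in app.py can still be used for the gated meta-llama download. For a local run, a minimal sketch of registering the token explicitly (an assumption about workflow, not something this commit does):

import os
from huggingface_hub import login

# HF_TOKEN would be exported in the shell or set as a Space secret; login() registers it
# so later from_pretrained() calls can access the gated meta-llama repository.
token = os.environ.get("HF_TOKEN")
if token:
    login(token=token)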