aakashrajput committed on
Commit
1d57e13
·
verified ·
1 Parent(s): 50d0fcb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -0
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from huggingface_hub import hf_hub_download
4
+ from llama_cpp import Llama
5
+
6
# Model artifact and runtime settings, gathered in one place.
# Q4_K_M is the 4-bit quantization, chosen as a balance of speed and quality.
MODEL_REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
MODEL_FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
# A 2048-token context keeps conversations usable without lagging the CPU.
CONTEXT_TOKENS = 2048
# Two threads, matching the 2-core limit of the HF Free Tier.
CPU_THREADS = 2

# Access token supplied through Hugging Face Secrets (None when unset).
hf_token = os.environ.get("HF_TOKEN")

# 1. Download the quantized model file and keep the path it was saved to.
model_path = hf_hub_download(
    repo_id=MODEL_REPO_ID,
    filename=MODEL_FILENAME,
    token=hf_token,
)

# 2. Load the model once at import time; the chat handler reuses `llm`.
llm = Llama(
    model_path=model_path,
    n_ctx=CONTEXT_TOKENS,
    n_threads=CPU_THREADS,
    verbose=False,
)
26
+
27
def _content_to_text(content):
    """Flatten one message's content into plain text for the prompt."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # Structured part such as {"type": "text", "text": "..."}.
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        # Sequence of structured parts: keep only the textual ones.
        return "".join(
            _content_to_text(part) for part in content if isinstance(part, dict)
        )
    return str(content)


def _iter_turns(history):
    """Yield ``(role, text)`` for every past message, oldest first.

    Accepts both history layouts a chat UI may hand over: a sequence of
    ``(user_msg, assistant_msg)`` pairs, or a sequence of dicts carrying
    ``"role"`` and ``"content"`` keys.
    """
    for entry in history or ():
        if isinstance(entry, dict):
            role = entry.get("role")
            # Only user/assistant turns are replayed; the system turn is fixed.
            if role in ("user", "assistant"):
                yield role, _content_to_text(entry.get("content"))
            continue
        user_msg, assistant_msg = entry
        # A missing half of a pair is skipped instead of being rendered as
        # the literal text "None".
        if user_msg is not None:
            yield "user", _content_to_text(user_msg)
        if assistant_msg is not None:
            yield "assistant", _content_to_text(assistant_msg)


def _format_turn(role, text):
    """Wrap one message in the Llama 3.2 header / end-of-turn markers."""
    return f"<|start_header_id|>{role}<|end_header_id|>\n\n{text}<|eot_id|>"


def _build_prompt(message, history):
    """Assemble the full Llama 3.2 chat prompt for the next assistant reply."""
    # NOTE(review): llama-cpp-python may prepend its own BOS token when it
    # tokenizes the prompt; if so, this literal <|begin_of_text|> duplicates
    # it. Confirm against the library before changing the string.
    parts = [
        "<|begin_of_text|>",
        _format_turn("system", "You are a helpful assistant."),
    ]
    parts.extend(_format_turn(role, text) for role, text in _iter_turns(history))
    parts.append(_format_turn("user", _content_to_text(message)))
    # Open the assistant turn so the model continues from here.
    parts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(parts)


def generate_response(message, history):
    """Stream the assistant's reply to ``message`` given the prior ``history``.

    Args:
        message: The newest user message.
        history: Earlier conversation, either as ``(user, assistant)`` pairs
            or as dicts with ``"role"`` and ``"content"`` keys.

    Yields:
        The reply accumulated so far, once per streamed chunk, so the caller
        can re-render the growing text.
    """
    prompt = _build_prompt(message, history)

    # Streaming the response for a "fast" feel.
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|eot_id|>", "<|start_header_id|>"],
        stream=True,
    )

    response = ""
    for output in stream:
        response += output["choices"][0]["text"]
        yield response
49
+
50
# 3. Chat front end: a Gradio ChatInterface driven by the streaming handler.
APP_TITLE = "Llama 3.2 (3B) - Optimized CPU"
APP_DESCRIPTION = "Running with llama-cpp-python for maximum speed on free hardware."

demo = gr.ChatInterface(
    generate_response,
    title=APP_TITLE,
    description=APP_DESCRIPTION,
    theme="glass",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
60
+