arjunbroepic committed on
Commit
a36c25b
·
verified ·
1 Parent(s): ac03fd2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from huggingface_hub import hf_hub_download
4
+ from llama_cpp import Llama
5
+
6
# --- Step 1: fetch the quantized GGUF weights when the app starts ---------
# Hub repository and the file inside it that this app loads.
REPO_ID = "n0ctyx/wifuGPT-1.7B-GGUF"
FILENAME = "wifuGPT-1.7B-Q4_K_M.gguf"

print("Downloading GGUF model from Hugging Face Hub...")
model_path = hf_hub_download(
    filename=FILENAME,
    repo_id=REPO_ID,
)
print(f"Model successfully cached at: {model_path}")

# --- Step 2: load the model with llama.cpp --------------------------------
# Two threads are requested to match the Hugging Face free CPU tier
# allocation; the context window is 2048 tokens.
llm = Llama(
    n_threads=2,
    n_ctx=2048,
    model_path=model_path,
)
17
+
18
def _chatml_turn(role, content):
    """Render one conversation turn in ChatML form."""
    return f"<|im_start|>{role}\n{content}<|im_end|>\n"


def _count_tokens(text):
    """Return the number of tokens the loaded model produces for *text*."""
    # special=True asks the tokenizer to parse control-token text such as
    # <|im_start|> as special tokens; add_bos=False keeps per-turn counts
    # additive (the single BOS is accounted for by the caller).
    return len(llm.tokenize(text.encode("utf-8"), add_bos=False, special=True))


def predict(message, history):
    """Stream the model's reply to *message* given the earlier chat *history*.

    The prompt is assembled in ChatML form. When the full conversation would
    not leave room for a reply inside the model's context window, the oldest
    turns are dropped first; without this, llama.cpp raises ``ValueError``
    once the accumulated history reaches the context size.

    Args:
        message: Text of the newest user message.
        history: Earlier turns, each a dict with ``"role"`` and ``"content"``
            keys, oldest first.

    Yields:
        The reply text accumulated so far, growing with every streamed chunk.
    """
    max_new_tokens = 1024   # upper bound on generated tokens per reply
    min_reply_tokens = 256  # room that must remain free for the reply

    turns = [_chatml_turn(msg["role"], msg["content"]) for msg in history]

    # Newest user message, then prime the assistant response.
    # Note: the <think> tag is left open so that a reasoning model can
    # generate its thoughts and close the tag with </think> itself.
    tail = _chatml_turn("user", message) + "<|im_start|>assistant\n<think>\n"

    # Budget for the prompt: context size minus reply room minus one BOS token.
    budget = llm.n_ctx() - min_reply_tokens - 1
    costs = [_count_tokens(turn) for turn in turns]
    used = sum(costs) + _count_tokens(tail)

    # Drop the oldest turns until the prompt fits (the newest message is
    # always kept, even if it alone exceeds the budget).
    first = 0
    while first < len(turns) and used > budget:
        used -= costs[first]
        first += 1
    if first:
        # After trimming, do not start the prompt with an orphaned reply
        # whose user message was just removed.
        while first < len(turns) and history[first]["role"] != "user":
            first += 1

    prompt = "".join(turns[first:]) + tail

    # Generate the streaming response from the CPU.
    response_stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.8,
        stream=True,
        stop=["<|im_end|>", "<|im_start|>"],
    )

    # Stream the output chunk-by-chunk to the Gradio UI; each yield carries
    # the whole reply so far, not just the newest piece.
    partial_text = ""
    for chunk in response_stream:
        partial_text += chunk["choices"][0]["text"]
        yield partial_text
52
+
53
# --- Step 3: assemble the chat UI ------------------------------------------
# `predict` is the streaming handler; history is exchanged in the
# role/content "messages" format, and example prompts are not pre-computed.
demo = gr.ChatInterface(
    predict,
    type="messages",
    title="🌸 wifuGPT 1.7B Local Chat",
    description=(
        "Running entirely on a free Hugging Face CPU Space instance "
        "using optimized GGUF inference."
    ),
    examples=[
        "Hello! Introduce yourself.",
        "Write a short poem about coding in Python.",
    ],
    cache_examples=False,
)
62
+
63
# Start the web server only when this file is executed as a script:
# listen on all interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )
65
+