Spaces:

arjunbroepic
/

gpt

Sleeping

App Files Files Community

arjunbroepic committed 6 days ago

Commit

a36c25b

verified ·

1 Parent(s): ac03fd2

Create app.py

Browse files

Files changed (1) hide show

app.py +65 -0

app.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import os
+import gradio as gr
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+# 1. Download the specific GGUF model file at startup
+REPO_ID = "n0ctyx/wifuGPT-1.7B-GGUF"
+FILENAME = "wifuGPT-1.7B-Q4_K_M.gguf"
+print("Downloading GGUF model from Hugging Face Hub...")
+model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+print(f"Model successfully cached at: {model_path}")
+# 2. Initialize the llama.cpp instance on the CPU
+# We use 2 threads to match the Hugging Face Free CPU tier allocation
+llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
+def predict(message, history):
+    # Construct the prompt using your exact ChatML structure
+    prompt = ""
+    # Format past conversation history
+    for msg in history:
+        role = msg["role"]
+        content = msg["content"]
+        prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+    # Append the new user message
+    prompt += f"<|im_start|>user\n{message}<|im_end|>\n"
+    # Prime the assistant response.
+    # Note: We leave the <think> tag open so that if it's a reasoning model,
+    # it can dynamically generate its thoughts and close it with </think> itself.
+    prompt += "<|im_start|>assistant\n<think>\n"
+    # Generate the streaming response from the CPU
+    response_stream = llm(
+        prompt,
+        max_tokens=1024,
+        temperature=0.7,
+        top_p=0.8,
+        stream=True,
+        stop=["<|im_end|>", "<|im_start|>"]
+    )
+    # Stream the output token-by-token to the Gradio UI
+    partial_text = ""
+    for chunk in response_stream:
+        token = chunk["choices"][0]["text"]
+        partial_text += token
+        yield partial_text
+# 3. Build the Gradio UI Layout
+demo = gr.ChatInterface(
+    fn=predict,
+    type="messages",
+    title="🌸 wifuGPT 1.7B Local Chat",
+    description="Running entirely on a free Hugging Face CPU Space instance using optimized GGUF inference.",
+    examples=["Hello! Introduce yourself.", "Write a short poem about coding in Python."],
+    cache_examples=False,
+)
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)