Datangtang committed on
Commit 9aa1169 · verified · 1 Parent(s): fd571bb
Files changed (1)
  1. app.py +57 -57
app.py CHANGED
@@ -1,78 +1,78 @@
 
  import gradio as gr
- from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
- import os
 
- print("Downloading GGUF model from HuggingFace...")
 
- # Download model
  model_path = hf_hub_download(
-     repo_id="Datangtang/GGUF3B",
-     filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
-     local_dir="./model"
  )
 
- print(f"Model downloaded to: {model_path}")
- print("Loading GGUF model with optimized settings...")
-
- # Load with optimized settings
  llm = Llama(
      model_path=model_path,
-     n_ctx=1024,      # Reduced from 2048 (faster)
-     n_threads=6,     # Increased from 4 (use more CPU)
-     n_batch=512,     # Added: larger batch for faster processing
-     n_gpu_layers=0,
-     verbose=False,
-     use_mlock=True,  # Keep model in RAM
-     use_mmap=True,   # Use memory mapping
  )
 
- print("Model loaded successfully!")
-
- def chat(message, history):
-     llm_message = message["content"]
 
-     conversation = "System: You are a helpful assistant.\n"
 
-     for msg in history[-3:]:
-         if msg["role"] == "user":
-             conversation += f"User: {msg['content']}\n"
          else:
-             conversation += f"Assistant: {msg['content']}\n"
 
-     conversation += f"User: {llm_message}\nAssistant:"
 
-     response = llm(
-         conversation,
-         max_tokens=128,
          temperature=0.7,
-         top_p=0.9,
-         top_k=40,
-         repeat_penalty=1.1,
-         stop=["User:", "Assistant:"]
      )
 
-     reply = response["choices"][0]["text"].strip()
-
-     return {"role": "assistant", "content": reply}
-
-
- # Create interface WITHOUT example caching
- demo = gr.ChatInterface(
-     fn=chat,
-     title="Bit & Sugar/llama-3.2-3b-finetome-1000steps-gguf",
-     description=(
-         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
-         "Optimized with GGUF Q4_K_M quantization | "
-         "ID2223 Lab 2"
-     ),
-     examples=[
-         "What is machine learning?",
-         "Explain AI briefly",
-         "What is LoRA?",
-     ],
-     cache_examples=False,  # IMPORTANT: Disable caching
- )
 
  if __name__ == "__main__":
-     demo.launch()
 
+ import os
  import gradio as gr
  from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
 
+ # ============ Download the model ==============
+ # Read the HF token from an environment variable (set under Spaces → Settings → Secrets)
+ HF_TOKEN = os.environ.get("HF_Token")
+
+ # Model repo and file
+ REPO_ID = "Datangtang/GGUF3B"
+ FILE_NAME = "llama-3.2-3b-instruct.Q4_K_M.gguf"
 
  model_path = hf_hub_download(
+     repo_id=REPO_ID,
+     filename=FILE_NAME,
+     token=HF_TOKEN
  )
 
+ # ============ Load the model ==============
  llm = Llama(
      model_path=model_path,
+     n_ctx=4096,
+     n_threads=4,
+     chat_format="llama-3",
  )
 
+ # ============ Inference function ==============
+ def chat_fn(history, user_input):
+     """
+     history is the Gradio chat history, a list of (user, assistant) pairs
+     user_input is the current user message
+     """
+     messages = []
 
+     # Flatten the (user, assistant) pairs into llama_cpp chat messages
+     for user_msg, assistant_msg in history:
+         messages.append({"role": "user", "content": user_msg})
+         if assistant_msg is not None:
+             messages.append({"role": "assistant", "content": assistant_msg})
 
+     # Append the new input
+     messages.append({"role": "user", "content": user_input})
 
+     # Call the LLM
+     result = llm.create_chat_completion(
+         messages=messages,
+         max_tokens=512,
          temperature=0.7,
+         top_p=0.95
      )
 
+     output = result["choices"][0]["message"]["content"]
+
+     # Return the updated history and clear the textbox
+     history.append((user_input, output))
+     return history, ""
+
+
+ # ============ Gradio UI ==============
+ with gr.Blocks() as demo:
+     gr.Markdown("# 💬 Chat with Your Fine-tuned LLM")
+
+     chatbot = gr.Chatbot(height=500)
+     user_input = gr.Textbox(show_label=False, placeholder="Enter message...")
+     submit = gr.Button("Send")
+
+     submit.click(
+         fn=chat_fn,
+         inputs=[chatbot, user_input],
+         outputs=[chatbot, user_input]
+     )
 
  if __name__ == "__main__":
+     demo.launch()
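
A note on the new download path: `os.environ.get("HF_Token")` returns `None` when the secret is missing, and if the repo is private or gated, `hf_hub_download` will then fail at runtime with an authentication error. A minimal fail-fast guard, assuming the secret keeps the `HF_Token` name used in this commit:

```python
import os

# Assumes the Space secret is named "HF_Token", as in this commit.
HF_TOKEN = os.environ.get("HF_Token")
if HF_TOKEN is None:
    raise RuntimeError(
        "HF_Token is not set. Add it under Spaces → Settings → Secrets "
        "so hf_hub_download can authenticate against Datangtang/GGUF3B."
    )
```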
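To sanity-check the `chat_format="llama-3"` load and the `create_chat_completion` call without launching the UI, a short local smoke test along these lines should work; the model path here is hypothetical, so substitute the path that `hf_hub_download` returns:

```python
from llama_cpp import Llama

# Hypothetical local path; replace with the path returned by hf_hub_download.
llm = Llama(
    model_path="./llama-3.2-3b-instruct.Q4_K_M.gguf",
    n_ctx=4096,
    n_threads=4,
    chat_format="llama-3",
    verbose=False,
)

result = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Reply with one short sentence."}],
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
)
print(result["choices"][0]["message"]["content"])
```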
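One gap in the new UI wiring: only the Send button triggers `chat_fn`, so pressing Enter in the textbox does nothing. If that is unintended, a `.submit` handler mirroring the click binding would cover it; this fragment assumes it is placed inside the same `gr.Blocks()` context, after `submit.click(...)`:

```python
    # Inside `with gr.Blocks() as demo:`, after submit.click(...):
    user_input.submit(
        fn=chat_fn,
        inputs=[chatbot, user_input],
        outputs=[chatbot, user_input],
    )
```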