Datangtang committed
Commit 4061801 · verified · 1 Parent(s): 1adfee6

Update app.py

Files changed (1): app.py +72 -16
app.py CHANGED
@@ -1,24 +1,80 @@
  import gradio as gr
- from ctransformers import AutoModelForCausalLM

- model_id = "Datangtang/GGUF_New_1B"

- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     model_file="llama-3.2-1b-instruct.Q4_K_M.gguf",  # change this to your GGUF filename
-     model_type="llama",
-     gpu_layers=0
  )

- def chat_fn(message):
-     response = model(message, max_new_tokens=200)
-     return response

- demo = gr.Interface(
-     fn=chat_fn,
-     inputs="text",
-     outputs="text",
-     title="My GGUF Model"
  )

- demo.launch()
  import gradio as gr
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os

+ print("Downloading GGUF model from HuggingFace...")

+ # Download model
+ model_path = hf_hub_download(
+     repo_id="Datangtang/GGUF1B",
+     filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
+     local_dir="./model"
  )

+ print(f"Model downloaded to: {model_path}")
+ print("Loading GGUF model with optimized settings...")

+ # Load with optimized settings
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=1024,       # Reduced from 2048 (faster)
+     n_threads=6,      # Increased from 4 (use more CPU)
+     n_batch=512,      # Added: larger batch for faster processing
+     n_gpu_layers=0,
+     verbose=False,
+     use_mlock=True,   # Keep model in RAM
+     use_mmap=True,    # Use memory mapping
  )

+ print("Model loaded successfully!")
+
+ def chat(message, history):
+     """Handle chat interactions"""
+     # Build conversation (keep it short)
+     conversation = ""
+
+     # Only use last 3 turns of history to keep context short
+     recent_history = history[-3:] if len(history) > 3 else history
+
+     for human, assistant in recent_history:
+         conversation += f"User: {human}\n"
+         conversation += f"Assistant: {assistant}\n"
+
+     conversation += f"User: {message}\n"
+     conversation += "Assistant:"
+
+     # Generate with optimized settings
+     response = llm(
+         conversation,
+         max_tokens=128,        # Reduced from 256 (faster)
+         temperature=0.7,
+         top_p=0.9,
+         top_k=40,              # Added: limit sampling
+         repeat_penalty=1.1,
+         stop=["User:", "\n\n"],
+         echo=False,
+     )
+
+     return response['choices'][0]['text'].strip()
+
+ # Create interface WITHOUT example caching
+ demo = gr.ChatInterface(
+     fn=chat,
+     title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
+     description=(
+         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
+         "Optimized with GGUF Q4_K_M quantization | "
+         "ID2223 Lab 2"
+     ),
+     examples=[
+         "What is machine learning?",
+         "Explain AI briefly",
+         "What is LoRA?",
+     ],
+     cache_examples=False,  # IMPORTANT: Disable caching
+     theme="soft",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
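
Review note on the new chat(): the prompt is rebuilt as plain "User:/Assistant:" text on every turn, and history is truncated to the last three exchanges so the 1024-token context is not exhausted. Below is a minimal sketch of just that truncation logic; build_prompt is a hypothetical helper name, not part of this commit, and the stubbed history lets it run without the GGUF file or llama-cpp-python installed.

# Sketch only: build_prompt is hypothetical, mirroring the logic in chat().
def build_prompt(message, history):
    conversation = ""
    # history[-3:] keeps at most the last 3 (user, assistant) turns;
    # on shorter histories it simply returns the whole list.
    for human, assistant in history[-3:]:
        conversation += f"User: {human}\n"
        conversation += f"Assistant: {assistant}\n"
    conversation += f"User: {message}\n"
    conversation += "Assistant:"
    return conversation

history = [
    ("hi", "Hello!"),
    ("what is 2+2?", "4"),
    ("who are you?", "A demo bot."),
    ("thanks", "You're welcome!"),
]
print(build_prompt("What is LoRA?", history))
# The oldest turn ("hi") is dropped; the prompt ends with "Assistant:" so
# generation continues from the assistant's side.

Since history[-3:] already handles short lists, the len(history) > 3 guard in the committed code is harmless but redundant. One assumption worth flagging: the new imports imply the Space's requirements.txt needs gradio, llama-cpp-python, and huggingface_hub, but this commit only touches app.py.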