Datangtang committed
Commit 77cf31a · verified · 1 Parent(s): a3cfd53

Code from the first successful run

Files changed (1)
  1. app.py +67 -109
app.py CHANGED
@@ -3,119 +3,77 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 
-# ----------------------------------------
-# Global model cache
-# ----------------------------------------
-loaded_models = {}
-current_model_name = None
-
-MODEL_CONFIGS = {
-    "1B Model (Datangtang/GGUF1B)": {
-        "repo_id": "Datangtang/GFUF1B",
-        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
-    },
-    "3B Model (Datangtang/GGUF3B)": {
-        "repo_id": "Datangtang/GGUF3B",
-        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
-    }
-}
-
-
-# ----------------------------------------
-# Load model function
-# ----------------------------------------
-def load_model(model_choice):
-    if model_choice in loaded_models:
-        print(f"Reusing already loaded model: {model_choice}")
-        return loaded_models[model_choice]
-
-    cfg = MODEL_CONFIGS[model_choice]
-
-    print(f"Downloading model: {model_choice}")
-    model_path = hf_hub_download(
-        repo_id=cfg["repo_id"],
-        filename=cfg["filename"],
-        local_dir="./model",
-        token=os.environ["HF_TOKEN"]
-    )
-
-    print("Loading model into memory...")
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=1024,
-        n_threads=6,
-        n_batch=512,
-        n_gpu_layers=0,
-        use_mmap=True,
-        use_mlock=True,
-        verbose=False
-    )
-
-    loaded_models[model_choice] = llm
-    print("Model loaded successfully!")
-    return llm
-
-
-# ----------------------------------------
-# Chat function (HuggingFace-compatible)
-# ----------------------------------------
-def chat(message, history, model_choice):
-    llm = load_model(model_choice)
-
-    # Build conversation prompt
-    conversation = "System: You are a helpful assistant.\n"
-
-    for human, assistant in history[-3:]:
+print("Downloading GGUF model from HuggingFace...")
+
+# Download model
+model_path = hf_hub_download(
+    repo_id="Datangtang/GGUF3B",
+    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
+    local_dir="./model"
+)
+
+print(f"Model downloaded to: {model_path}")
+print("Loading GGUF model with optimized settings...")
+
+# Load with optimized settings
+llm = Llama(
+    model_path=model_path,
+    n_ctx=1024,      # Reduced from 2048 (faster)
+    n_threads=6,     # Increased from 4 (use more CPU)
+    n_batch=512,     # Added: larger batch for faster processing
+    n_gpu_layers=0,
+    verbose=False,
+    use_mlock=True,  # Keep model in RAM
+    use_mmap=True,   # Use memory mapping
+)
+
+print("Model loaded successfully!")
+
+def chat(message, history):
+    """Handle chat interactions"""
+    # Build conversation (keep it short)
+    conversation = ""
+
+    # Only use last 3 turns of history to keep context short
+    recent_history = history[-3:] if len(history) > 3 else history
+
+    for human, assistant in recent_history:
         conversation += f"User: {human}\n"
-        if assistant:
-            conversation += f"Assistant: {assistant}\n"
-
-    conversation += f"User: {message}\nAssistant:"
-
+        conversation += f"Assistant: {assistant}\n"
+
+    conversation += f"User: {message}\n"
+    conversation += "Assistant:"
+
+    # Generate with optimized settings
     response = llm(
         conversation,
-        max_tokens=128,
+        max_tokens=128,  # Reduced from 256 (faster)
         temperature=0.7,
         top_p=0.9,
-        top_k=40,
+        top_k=40,        # Added: limit sampling
         repeat_penalty=1.1,
-        stop=["User:", "Assistant:"]
-    )
-
-    return response["choices"][0]["text"].strip()
-
-
-# ----------------------------------------
-# Gradio UI
-# ----------------------------------------
-with gr.Blocks() as demo:
-
-    gr.Markdown("## 🦙 Datangtang GGUF Model Demo")
-
-    model_choice = gr.Dropdown(
-        label="Select Model",
-        choices=list(MODEL_CONFIGS.keys()),
-        value="1B Model (Datangtang/GGUF1B)"
+        stop=["User:", "\n\n"],
+        echo=False,
     )
-
-    chatbot = gr.Chatbot()
-    msg_box = gr.Textbox(label="Message")
-
-    # Add user message to history
-    def user_send(message, history):
-        history = history + [[message, None]]
-        return history, ""
-
-    # Generate bot response
-    def bot_reply(history, model_choice):
-        user_msg = history[-1][0]
-        bot_msg = chat(user_msg, history[:-1], model_choice)
-        history[-1][1] = bot_msg
-        return history
-
-    # Wire events
-    msg_box.submit(user_send, [msg_box, chatbot], [chatbot, msg_box]).then(
-        bot_reply, [chatbot, model_choice], chatbot
-    )
-
-demo.launch()
+
+    return response['choices'][0]['text'].strip()
+
+# Create interface WITHOUT example caching
+demo = gr.ChatInterface(
+    fn=chat,
+    title="Bit & Sugar/llama-3.2-3b-finetome-1000steps-gguf",
+    description=(
+        "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
+        "Optimized with GGUF Q4_K_M quantization | "
+        "ID2223 Lab 2"
+    ),
+    examples=[
+        "What is machine learning?",
+        "Explain AI briefly",
+        "What is LoRA?",
+    ],
+    cache_examples=False,  # IMPORTANT: Disable caching
+)
+
+if __name__ == "__main__":
+    demo.launch()
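
One behavioral change in the download step is easy to miss: the old load_model() passed token=os.environ["HF_TOKEN"] to hf_hub_download, while the new code downloads anonymously, which works only while Datangtang/GGUF3B stays public. A defensive variant, assuming the same HF_TOKEN environment variable the old code used:

import os
from huggingface_hub import hf_hub_download

# token=None falls back to anonymous access when HF_TOKEN is unset;
# a token is only needed if the GGUF repo is private or gated.
model_path = hf_hub_download(
    repo_id="Datangtang/GGUF3B",
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
    local_dir="./model",
    token=os.environ.get("HF_TOKEN"),
)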
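
The prompt format the new chat() sends to the model can be checked without loading any weights. Below is a minimal sketch of the same string-building logic (build_prompt is a hypothetical helper, not part of app.py, and it assumes Gradio's classic [user, assistant] history pairs):

# Sketch of the prompt construction in the new chat() above.
def build_prompt(message, history):
    conversation = ""
    # history[-3:] already handles lists shorter than 3 turns,
    # so the len() guard in the commit is redundant but harmless.
    recent_history = history[-3:] if len(history) > 3 else history
    for human, assistant in recent_history:
        conversation += f"User: {human}\n"
        conversation += f"Assistant: {assistant}\n"
    conversation += f"User: {message}\n"
    conversation += "Assistant:"
    return conversation

print(build_prompt("What is LoRA?", [["Hi", "Hello! How can I help?"]]))
# User: Hi
# Assistant: Hello! How can I help?
# User: What is LoRA?
# Assistant: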
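
Once the Space is running, the gr.ChatInterface can also be queried programmatically. A sketch using gradio_client, assuming a Gradio version whose ChatInterface exposes its default /chat endpoint; SPACE_ID is a placeholder, since the commit does not name the Space:

from gradio_client import Client

client = Client("SPACE_ID")  # placeholder; the Space id is not in the commit
reply = client.predict("What is LoRA?", api_name="/chat")
print(reply)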