Datangtang committed on
Commit ac0916f · verified · 1 Parent(s): 6e74518

go back to 1b & 3b

Files changed (1): app.py +114 -67
app.py CHANGED
@@ -3,79 +3,126 @@ from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  import os

- print("Downloading GGUF model from HuggingFace...")
-
- # Download model
- model_path = hf_hub_download(
-     repo_id="Datangtang/GGUF1B",
-     filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
-     local_dir="./model",
-     token=os.environ["HF_TOKEN"]
- )
-
- print(f"Model downloaded to: {model_path}")
- print("Loading GGUF model with optimized settings...")
-
- # Load with optimized settings
- llm = Llama(
-     model_path=model_path,
-     n_ctx=1024,      # Reduced from 2048 (faster)
-     n_threads=6,     # Increased from 4 (use more CPU)
-     n_batch=512,     # Added: larger batch for faster processing
-     n_gpu_layers=0,
-     verbose=False,
-     use_mlock=True,  # Keep model in RAM
-     use_mmap=True,   # Use memory mapping
- )
-
- print("Model loaded successfully!")
-
- def chat(message, history):
-     """Handle chat interactions"""
-     # Build conversation (keep it short)
-     conversation = ""
-
-     # Only use last 3 turns of history to keep context short
-     recent_history = history[-3:] if len(history) > 3 else history
-
-     for human, assistant in recent_history:
-         conversation += f"User: {human}\n"
-         conversation += f"Assistant: {assistant}\n"
-
-     conversation += f"User: {message}\n"
-     conversation += "Assistant:"
-
-     # Generate with optimized settings
      response = llm(
          conversation,
-         max_tokens=128,  # Reduced from 256 (faster)
          temperature=0.7,
          top_p=0.9,
-         top_k=40,  # Added: limit sampling
          repeat_penalty=1.1,
-         stop=["User:", "\n\n"],
-         echo=False,
      )
-
-     return response['choices'][0]['text'].strip()
-
- # Create interface WITHOUT example caching
- demo = gr.ChatInterface(
-     fn=chat,
-     title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
-     description=(
-         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
-         "Optimized with GGUF Q4_K_M quantization | "
-         "ID2223 Lab 2"
-     ),
-     examples=[
-         "What is machine learning?",
-         "Explain AI briefly",
-         "What is LoRA?",
-     ],
-     cache_examples=False,  # IMPORTANT: Disable caching
-     theme="soft",
- )

  if __name__ == "__main__":
      demo.launch()
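A caveat about the prompt format in chat(), in both the old version above and the new one below: the prompt is assembled as a plain "User:/Assistant:" transcript, which is not the chat template the Llama 3.2 Instruct GGUFs were tuned on, so the model can drift into imitating the transcript instead of answering. llama-cpp-python can apply the template embedded in the GGUF itself via create_chat_completion. A minimal sketch, reusing llm, message, and history from the listings; this is an alternative formulation, not what the commit ships:

    # Sketch: let llama.cpp apply the model's own chat template.
    # Assumes `llm`, `message`, and `history` as in the listings.
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for human, assistant in history[-3:]:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    result = llm.create_chat_completion(
        messages=messages,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
    )
    reply = result["choices"][0]["message"]["content"].strip()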
 
  from huggingface_hub import hf_hub_download
  import os

+ # ----------------------------------------
+ # Global model cache
+ # ----------------------------------------
+ loaded_models = {}         # Cache loaded Llama models
+ current_model_name = None
+
+ MODEL_CONFIGS = {
+     "1B Model (Datangtang/GGUF1B)": {
+         "repo_id": "Datangtang/GGUF1B",
+         "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
+     },
+     "3B Model (Datangtang/GGUF3B)": {
+         # Assumed to match the label; the commit read "GGGF3B", which
+         # looks like a typo and would fail the 3B download.
+         "repo_id": "Datangtang/GGUF3B",
+         "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
+     }
+ }
+
+
+ # ----------------------------------------
+ # Load model function
+ # ----------------------------------------
+ def load_model(model_choice):
+     global loaded_models, current_model_name
+
+     if model_choice in loaded_models:
+         print(f"Reusing already loaded model: {model_choice}")
+         current_model_name = model_choice
+         return loaded_models[model_choice]
+
+     print(f"Downloading model: {model_choice}")
+
+     cfg = MODEL_CONFIGS[model_choice]
+
+     model_path = hf_hub_download(
+         repo_id=cfg["repo_id"],
+         filename=cfg["filename"],
+         local_dir="./model",
+         token=os.environ["HF_TOKEN"]
+     )
+
+     print(f"Model downloaded to: {model_path}")
+     print("Loading GGUF model into memory...")
+
+     llm = Llama(
+         model_path=model_path,
+         n_ctx=1024,
+         n_threads=6,
+         n_batch=512,
+         n_gpu_layers=0,
+         use_mmap=True,
+         use_mlock=True,
+         verbose=False,
+     )
+
+     loaded_models[model_choice] = llm
+     current_model_name = model_choice
+
+     print("Model loaded successfully!")
+     return llm
+
+
+ # ----------------------------------------
+ # Chat function
+ # ----------------------------------------
+ def chat(message, history, model_choice):
+     llm = load_model(model_choice)
+
+     # System prompt
+     conversation = "System: You are a helpful assistant.\n"
+
+     # Add the last 3 turns of history
+     for human, assistant in history[-3:]:
+         conversation += f"User: {human}\nAssistant: {assistant}\n"
+
+     conversation += f"User: {message}\nAssistant:"
+
      response = llm(
          conversation,
+         max_tokens=128,
          temperature=0.7,
          top_p=0.9,
+         top_k=40,
          repeat_penalty=1.1,
+         stop=["User:", "Assistant:"],
+         echo=False
      )
+
+     return response["choices"][0]["text"].strip()
+
+
+ # ----------------------------------------
+ # Gradio UI
+ # ----------------------------------------
+ with gr.Blocks() as demo:
+
+     gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
+     gr.Markdown("Switch between **1B** and **3B** GGUF models in real-time.")
+
+     model_choice = gr.Dropdown(
+         label="Select Model",
+         choices=list(MODEL_CONFIGS.keys()),
+         value="1B Model (Datangtang/GGUF1B)",
+     )
+
+     chat_iface = gr.ChatInterface(
+         fn=chat,
+         # Pass the dropdown through additional_inputs so chat() receives the
+         # live selection; a lambda reading model_choice.value would capture
+         # only the initial value.
+         additional_inputs=[model_choice],
+         # With additional_inputs set, each example lists the message plus the
+         # additional input values.
+         examples=[
+             ["Explain deep learning in one paragraph.", "1B Model (Datangtang/GGUF1B)"],
+             ["What is the difference between supervised and unsupervised learning?", "1B Model (Datangtang/GGUF1B)"],
+             ["Explain what a transformer model is.", "1B Model (Datangtang/GGUF1B)"],
+         ],
+         cache_examples=False,
+     )
+
+     model_choice.change(
+         # The handler's return value has nowhere to go with outputs=[];
+         # log the switch instead of returning a string.
+         fn=lambda x: print(f"🔄 Switched to: {x}"),
+         inputs=[model_choice],
+         outputs=[],
+     )
+

  if __name__ == "__main__":
      demo.launch()
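A note on the model-switch wiring: gr.ChatInterface resolves additional_inputs on every call, so chat() sees whatever the dropdown currently holds, whereas a model_choice.value read inside a lambda is fixed when the UI is built and never changes afterwards. A minimal self-contained sketch of the pattern; respond and choice are placeholder names, not from the commit:

    import gradio as gr

    def respond(message, history, model_name):
        # model_name is the dropdown's value at call time, not at build time
        return f"[{model_name}] echo: {message}"

    with gr.Blocks() as demo:
        choice = gr.Dropdown(choices=["1B", "3B"], value="1B", label="Select Model")
        gr.ChatInterface(fn=respond, additional_inputs=[choice])

    demo.launch()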