import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
| print("Downloading GGUF model from HuggingFace...") | |
| # Download model | |
| model_path = hf_hub_download( | |
| repo_id="Datangtang/GGUF1B", | |
| filename="llama-3.2-1b-instruct.Q4_K_M.gguf", | |
| local_dir="./model", | |
| token=os.environ["HF_TOKEN"] | |
| ) | |
| print(f"Model downloaded to: {model_path}") | |
| print("Loading GGUF model with optimized settings...") | |
| # Load with optimized settings | |
| llm = Llama( | |
| model_path=model_path, | |
| n_ctx=1024, # Reduced from 2048 (faster) | |
| n_threads=6, # Increased from 4 (use more CPU) | |
| n_batch=512, # Added: larger batch for faster processing | |
| n_gpu_layers=0, | |
| verbose=False, | |
| use_mlock=True, # Keep model in RAM | |
| use_mmap=True, # Use memory mapping | |
| ) | |
| print("Model loaded successfully!") | |

def chat(message, history):
    """Handle chat interactions.

    Assumes tuple-style history (a list of [user, assistant] pairs), the
    classic gr.ChatInterface format; Gradio's newer "messages" format
    (a list of role/content dicts) would need different unpacking.
    """
    # Keep only the last 3 turns so the prompt stays within the short context
    conversation = ""
    recent_history = history[-3:]  # slicing already handles shorter histories
    for human, assistant in recent_history:
        conversation += f"User: {human}\n"
        conversation += f"Assistant: {assistant}\n"
    conversation += f"User: {message}\n"
    conversation += "Assistant:"

    # Generate with optimized settings
    response = llm(
        conversation,
        max_tokens=128,          # Reduced from 256 (faster)
        temperature=0.7,
        top_p=0.9,
        top_k=40,                # Limit sampling to the 40 most likely tokens
        repeat_penalty=1.1,
        stop=["User:", "\n\n"],  # Stop before the model writes the next user turn
        echo=False,              # Return only the completion, not the prompt
    )
    return response["choices"][0]["text"].strip()
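
# A streaming variant (a sketch, not in the original Space): llama-cpp-python
# can stream tokens with stream=True, and gr.ChatInterface accepts a generator
# fn that yields progressively longer replies. To try it, pass fn=chat_stream
# to gr.ChatInterface below instead of fn=chat.
def chat_stream(message, history):
    conversation = ""
    for human, assistant in history[-3:]:
        conversation += f"User: {human}\nAssistant: {assistant}\n"
    conversation += f"User: {message}\nAssistant:"
    partial = ""
    for chunk in llm(
        conversation,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repeat_penalty=1.1,
        stop=["User:", "\n\n"],
        stream=True,  # yield completion chunks as they are generated
    ):
        partial += chunk["choices"][0]["text"]
        yield partial.strip()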

# Create the interface WITHOUT example caching; cache_examples=True would run
# every example through the model at startup, slowing down the Space launch
demo = gr.ChatInterface(
    fn=chat,
    title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
    description=(
        "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
        "Optimized with GGUF Q4_K_M quantization | "
        "ID2223 Lab 2"
    ),
    examples=[
        "What is machine learning?",
        "Explain AI briefly",
        "What is LoRA?",
    ],
    cache_examples=False,  # IMPORTANT: disable example caching
    theme="soft",
)
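
# Dependencies assumed in the Space's requirements.txt (inferred from the
# imports above): gradio, llama-cpp-python, huggingface_hub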

if __name__ == "__main__":
    demo.launch()