import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Configuration: Llama-3.2-1B-Instruct (GGUF Community Version)
# This usually bypasses the "Gated Repo" error because it's a quantized re-upload
REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q8_0.gguf"

print(f"Downloading {FILENAME} from {REPO_ID}...")
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME
    )
except Exception as e:
    print(f"Error downloading {FILENAME}: {e}")
    # Fallback to Q4_K_M (smaller)
    print("Trying fallback to Q4_K_M...")
    FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME
    )
print(f"Loading model from {model_path}...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,            # context window size in tokens
    n_threads=2,           # CPU threads used for inference
    chat_format="llama-3"  # use the built-in Llama 3 chat template
)
def predict(message, history):
    # Rebuild the conversation as OpenAI-style message dicts from Gradio's
    # (user, assistant) tuple history, then append the new user message
    messages = []
    for human_msg, ai_msg in history:
        messages.append({"role": "user", "content": human_msg})
        messages.append({"role": "assistant", "content": ai_msg})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95
    )

    # Stream the reply back to the UI as tokens arrive
    partial_message = ""
    for chunk in response:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            partial_message += delta['content']
            yield partial_message
demo = gr.ChatInterface(
    fn=predict,
    title="Llama 3.2 1B (Docker/GGUF)",
    description="Running GGUF model via Docker container.",
    examples=["Hello, how are you?", "Write a Python script.", "Explain quantum computing."],
)

if __name__ == "__main__":
    demo.launch()
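
# Runtime dependencies (a minimal sketch, assuming the standard PyPI package names
# for the libraries imported above; in a Docker Space these would typically be
# pip-installed from the Dockerfile, or from requirements.txt in an SDK Space):
#   gradio
#   llama-cpp-python
#   huggingface_hub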