import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Configuration: Llama-3.2-1B-Instruct (community GGUF conversion)
# Using this quantized re-upload usually bypasses the "Gated Repo" error,
# since the community copy does not require accepting Meta's license.
REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q8_0.gguf"
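# Q8_0 is the near-lossless quant; the same repo also hosts smaller files such
# as Q4_K_M, which the fallback below switches to if this download fails.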
print(f"Downloading {FILENAME} from {REPO_ID}...")
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
    )
except Exception as e:
    print(f"Error downloading {FILENAME}: {e}")
    # Fall back to the smaller Q4_K_M quant from the same repo
    print("Trying fallback to Q4_K_M...")
    FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
    )
print(f"Loading model from {model_path}...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,             # context window in tokens
    n_threads=2,            # CPU threads (matches the 2 vCPUs of a free Space)
    chat_format="llama-3",  # apply the Llama 3 chat template
)
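# Note: this assumes the default CPU-only llama-cpp-python build. With a CUDA
# build, offloading would look like (hypothetical for this Space):
#   llm = Llama(model_path=model_path, n_ctx=4096, n_gpu_layers=-1, chat_format="llama-3")
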
def predict(message, history):
    # Rebuild the chat history in the OpenAI-style message format
    messages = []
    for human_msg, ai_msg in history:
        messages.append({"role": "user", "content": human_msg})
        messages.append({"role": "assistant", "content": ai_msg})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )

    # Each yield replaces the displayed message, so accumulate the partial
    # text and re-yield the whole string as tokens stream in.
    partial_message = ""
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial_message += delta["content"]
            yield partial_message
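
# The loop in predict() assumes tuple-style history ([user, assistant] pairs),
# the default for gr.ChatInterface here. If the Space's Gradio version passes
# type="messages" history instead, each entry is already a role/content dict
# and the rebuild would shrink to something like:
#   messages = list(history) + [{"role": "user", "content": message}]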
demo = gr.ChatInterface(
    fn=predict,
    title="Llama 3.2 1B (Docker/GGUF)",
    description="Running a GGUF model via a Docker container.",
    examples=["Hello, how are you?", "Write a Python script.", "Explain quantum computing."],
)
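# Streaming generator functions need the Gradio queue; recent Gradio versions
# enable it by default, but older 3.x releases required demo.queue() before launch.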
if __name__ == "__main__":
    demo.launch()