import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Configuration: Llama-3.2-1B-Instruct (GGUF Community Version)
# This usually bypasses the "Gated Repo" error because it's a quantized re-upload
REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q8_0.gguf"

print(f"Downloading {FILENAME} from {REPO_ID}...")
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME
    )
except Exception as e:
    print(f"Error downloading {FILENAME}: {e}")
    # Fallback to Q4_K_M (smaller)
    print("Trying fallback to Q4_K_M...")
    FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME
    )
print(f"Loading model from {model_path}...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,            # context window size in tokens
    n_threads=2,           # CPU threads used for inference
    chat_format="llama-3"  # use the built-in Llama 3 chat template
)
def predict(message, history):
    # Rebuild the conversation as OpenAI-style message dicts from Gradio's
    # (user, assistant) tuple history, then append the new user message
    messages = []
    for human_msg, ai_msg in history:
        messages.append({"role": "user", "content": human_msg})
        messages.append({"role": "assistant", "content": ai_msg})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95
    )

    # Stream the reply back to the UI as tokens arrive
    partial_message = ""
    for chunk in response:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            partial_message += delta['content']
            yield partial_message
demo = gr.ChatInterface(
    fn=predict,
    title="Llama 3.2 1B (Docker/GGUF)",
    description="Running GGUF model via Docker container.",
    examples=["Hello, how are you?", "Write a Python script.", "Explain quantum computing."],
)

if __name__ == "__main__":
    demo.launch()
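
# Runtime dependencies (a minimal sketch, assuming the standard PyPI package names
# for the libraries imported above; in a Docker Space these would typically be
# pip-installed from the Dockerfile, or from requirements.txt in an SDK Space):
#   gradio
#   llama-cpp-python
#   huggingface_hub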