# Hugging Face Spaces page header (captured status: "Runtime error")
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# ─── Configuration ───────────────────────────────────────────────────
MODEL_REPO = "d-e-e-k-11/llama-2-7b-chat-ggml"
MODEL_FILE = "llama-2-7b-chat.ggmlv3.q2_K.bin"
LOCAL_PATH = "/tmp/llama-model.bin"

# ─── Load Model ──────────────────────────────────────────────────────
# Global model handle. Stays None when download/load fails so chat()
# can degrade gracefully instead of the Space crashing at import time.
llm = None

print("Checking for model...")
if not os.path.exists(LOCAL_PATH):
    print(f"Downloading model from {MODEL_REPO} ...")
    try:
        cached = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        # os.path.exists() is False for a DANGLING symlink, yet
        # os.symlink() would still raise FileExistsError on it —
        # remove any stale link before re-creating it.
        if os.path.islink(LOCAL_PATH):
            os.unlink(LOCAL_PATH)
        os.symlink(cached, LOCAL_PATH)
        print("Model downloaded via hf_hub_download.")
    except Exception as e:
        # Best-effort: log and fall through; llm stays None.
        print(f"Download failed: {e}")

if os.path.exists(LOCAL_PATH):
    print("Loading Llama-2 model into memory...")
    try:
        # NOTE(review): GGMLv3 files require llama-cpp-python <= 0.1.78;
        # newer builds only read GGUF — confirm the pinned version.
        llm = Llama(model_path=LOCAL_PATH, n_ctx=2048, n_threads=4, verbose=False)
        print("Model ready!")
    except Exception as e:
        print(f"Failed to load model: {e}")
else:
    print("Model file not found. Chatbot will return placeholder responses.")
| # βββ Chat Function βββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# ─── Chat Function ───────────────────────────────────────────────────
def chat(message, history):
    """Generate a reply from the local Llama-2 model.

    Parameters
    ----------
    message : str
        The user's latest message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns as supplied by gr.ChatInterface.

    Returns
    -------
    str
        The model's reply, or a placeholder string when the model
        is not loaded (llm is None).
    """
    if llm is None:
        return (
            "Model is still loading or unavailable. "
            "Please wait a moment and try again, or check the Space logs."
        )

    # Llama-2 chat format requires the <<SYS>> block INSIDE the first
    # [INST] ... [/INST] pair, together with that turn's user message.
    # (The original emitted an orphan "[INST] <<SYS>>...<</SYS>>" that
    # was never closed, corrupting the prompt whenever history existed.)
    sys_block = "<<SYS>>\nYou are a helpful, respectful AI assistant.\n<</SYS>>\n\n"

    # Build context from the last 5 turns (keeps prompt within n_ctx=2048).
    parts = []
    first = True
    for user_msg, bot_msg in history[-5:]:
        prefix = sys_block if first else ""
        parts.append(f"[INST] {prefix}{user_msg} [/INST] {bot_msg} </s>")
        first = False

    prefix = sys_block if first else ""
    prompt = "".join(parts) + f"[INST] {prefix}{message} [/INST]"

    output = llm(
        prompt,
        max_tokens=512,
        stop=["[/INST]", "</s>", "User:"],
        echo=False,
    )
    return output["choices"][0]["text"].strip()
# ─── Gradio UI ───────────────────────────────────────────────────────
# NOTE(review): the retry_btn / undo_btn / clear_btn keyword arguments
# were removed from gr.ChatInterface in Gradio 5; passing them raises
# TypeError at import, which shows up on Spaces as "Runtime error".
# They are dropped here (Gradio 5 provides those controls built-in) —
# confirm against the Gradio version pinned in requirements.txt.
demo = gr.ChatInterface(
    fn=chat,
    title="Llama-2-7B Chatbot",
    description=(
        "**Offline AI chatbot** powered by Llama-2-7B (GGMLv3 Q2_K quantized).\n\n"
        "Model is downloaded automatically from Hugging Face on startup (~2.7 GB). "
        "First load may take a few minutes."
    ),
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
    ),
    examples=[
        "What is machine learning?",
        "Write a Python function to reverse a string.",
        "Explain quantum computing in simple terms.",
        "What are the planets in the solar system?",
    ],
)

if __name__ == "__main__":
    demo.launch()