"""Offline Llama-2-7B chatbot Space: downloads a quantized GGML model on
startup and serves it through a Gradio ChatInterface."""

import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "d-e-e-k-11/llama-2-7b-chat-ggml"
MODEL_FILE = "llama-2-7b-chat.ggmlv3.q2_K.bin"
LOCAL_PATH = "/tmp/llama-model.bin"

# System prompt injected into the FIRST [INST] turn only, per the
# official Llama-2 chat template.
SYSTEM_PROMPT = "You are a helpful, respectful AI assistant."

# ─── Load Model ──────────────────────────────────────────────────────
llm = None
print("Checking for model...")
if not os.path.exists(LOCAL_PATH):
    print(f"Downloading model from {MODEL_REPO} ...")
    try:
        # hf_hub_download caches the file; symlink it to the stable path
        # the rest of the script expects.
        cached = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(cached, LOCAL_PATH)
        print("Model downloaded via hf_hub_download.")
    except Exception as e:
        # Best-effort: the UI still starts and serves placeholder replies.
        print(f"Download failed: {e}")

if os.path.exists(LOCAL_PATH):
    print("Loading Llama-2 model into memory...")
    try:
        llm = Llama(
            model_path=LOCAL_PATH,
            n_ctx=2048,
            # Use every available core instead of a hard-coded 4
            # (falls back to 4 if the core count is undetectable).
            n_threads=os.cpu_count() or 4,
            verbose=False,
        )
        print("Model ready!")
    except Exception as e:
        print(f"Failed to load model: {e}")
else:
    print("Model file not found. Chatbot will return placeholder responses.")


# ─── Chat Function ───────────────────────────────────────────────────
def _build_prompt(message, history):
    """Build a Llama-2 chat prompt from *message* plus the last 5 turns.

    Follows the official template: the system prompt is wrapped in
    ``<<SYS>>`` / ``<</SYS>>`` markers inside the first ``[INST]`` block
    only, and every completed turn is closed with ``</s>``.  (The
    previous version emitted bare ``<>`` markers and nested an unclosed
    ``[INST]`` around the history's own ``[INST]`` blocks, which is
    malformed input for the model.)
    """
    turns = list(history[-5:]) + [(message, None)]
    parts = []
    for i, (user_msg, bot_msg) in enumerate(turns):
        if i == 0:
            user_block = f"<<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n\n{user_msg}"
        else:
            user_block = user_msg
        if bot_msg is None:
            # Current message: leave the turn open for the model to answer.
            parts.append(f"[INST] {user_block} [/INST]")
        else:
            parts.append(f"[INST] {user_block} [/INST] {bot_msg} </s>")
    return "".join(parts)


def chat(message, history):
    """Gradio chat handler: return the model's reply to *message*.

    *history* is a list of ``(user, bot)`` pairs supplied by
    ``gr.ChatInterface``; only the last 5 turns are used as context.
    Returns a placeholder string while the model is unavailable.
    """
    if llm is None:
        return (
            "Model is still loading or unavailable. "
            "Please wait a moment and try again, or check the Space logs."
        )

    prompt = _build_prompt(message, history)
    output = llm(
        prompt,
        max_tokens=512,
        # "[INST]" stops a hallucinated next user turn before any of it
        # leaks into the reply; "</s>" replaces the empty-string stop
        # token of the previous version (an empty stop is meaningless).
        # "[/INST]" is kept for backward compatibility.
        stop=["[INST]", "[/INST]", "</s>", "User:"],
        echo=False,
    )
    return output["choices"][0]["text"].strip()


# ─── Gradio UI ───────────────────────────────────────────────────────
demo = gr.ChatInterface(
    fn=chat,
    title="Llama-2-7B Chatbot",
    description=(
        "**Offline AI chatbot** powered by Llama-2-7B (GGMLv3 Q2_K quantized).\n\n"
        "Model is downloaded automatically from Hugging Face on startup (~2.7 GB). "
        "First load may take a few minutes."
    ),
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
    ),
    examples=[
        "What is machine learning?",
        "Write a Python function to reverse a string.",
        "Explain quantum computing in simple terms.",
        "What are the planets in the solar system?",
    ],
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
)

if __name__ == "__main__":
    demo.launch()