File size: 3,043 Bytes
88d8657
 
c55e854
88d8657
 
c55e854
 
 
88d8657
c55e854
88d8657
c55e854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88d8657
c55e854
88d8657
c55e854
88d8657
 
c55e854
 
 
 
 
 
88d8657
c55e854
88d8657
c55e854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88d8657
 
 
 
c55e854
 
 
88d8657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

MODEL_REPO  = "d-e-e-k-11/llama-2-7b-chat-ggml"
MODEL_FILE  = "llama-2-7b-chat.ggmlv3.q2_K.bin"
LOCAL_PATH  = "/tmp/llama-model.bin"

# ─── Load Model ──────────────────────────────────────────────────────
# Fetch the GGML weights from the Hub on first start, then load them with
# llama-cpp-python.  ``llm`` stays None when either step fails so chat()
# can degrade to a placeholder reply instead of crashing the Space.
llm = None
print("Checking for model...")

if not os.path.exists(LOCAL_PATH):
    print(f"Downloading model from {MODEL_REPO} ...")
    try:
        cached = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        # os.path.exists() follows symlinks, so a *broken* leftover link at
        # LOCAL_PATH reports False above, yet os.symlink() would still raise
        # FileExistsError on every restart.  Remove any stale link first.
        if os.path.islink(LOCAL_PATH):
            os.remove(LOCAL_PATH)
        os.symlink(cached, LOCAL_PATH)
        print("Model downloaded via hf_hub_download.")
    except Exception as e:
        # Best-effort: keep the app alive; chat() reports the missing model.
        print(f"Download failed: {e}")

if os.path.exists(LOCAL_PATH):
    print("Loading Llama-2 model into memory...")
    try:
        # n_ctx=2048 matches Llama-2's ggmlv3 context; 4 threads suits the
        # small CPU Spaces tier.  verbose=False keeps startup logs readable.
        llm = Llama(model_path=LOCAL_PATH, n_ctx=2048, n_threads=4, verbose=False)
        print("Model ready!")
    except Exception as e:
        print(f"Failed to load model: {e}")
else:
    print("Model file not found. Chatbot will return placeholder responses.")

# ─── Chat Function ───────────────────────────────────────────────────
def chat(message, history):
    """Generate a reply with the local Llama-2 model.

    Parameters
    ----------
    message : str
        The user's newest message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns as supplied by ``gr.ChatInterface``;
        only the last 5 turns are replayed to stay inside the 2048-token
        context window.

    Returns
    -------
    str
        The model's reply, or a placeholder string while the model is
        still downloading/loading (``llm`` is None).
    """
    if llm is None:
        return (
            "Model is still loading or unavailable. "
            "Please wait a moment and try again, or check the Space logs."
        )

    system = "<<SYS>>\nYou are a helpful, respectful AI assistant.\n<</SYS>>\n\n"

    # Llama-2 chat format puts the system block INSIDE the first [INST] turn:
    #   [INST] <<SYS>>...<</SYS>>\n\nuser [/INST] answer </s>[INST] ... [/INST]
    # The previous template opened an [INST] for the system prompt and never
    # closed it, yielding unbalanced/nested tags the model was not trained on.
    turns = list(history[-5:])
    prompt = ""
    for i, (user_msg, bot_msg) in enumerate(turns):
        prefix = system if i == 0 else ""
        # bot_msg can be None for an interrupted turn; render it as empty.
        prompt += f"[INST] {prefix}{user_msg} [/INST] {bot_msg or ''} </s>"
    prompt += f"[INST] {system if not turns else ''}{message} [/INST]"

    output = llm(
        prompt,
        max_tokens=512,
        stop=["[/INST]", "</s>", "User:"],
        echo=False,
    )
    return output["choices"][0]["text"].strip()

# ─── Gradio UI ───────────────────────────────────────────────────────
_DESCRIPTION = (
    "**Offline AI chatbot** powered by Llama-2-7B (GGMLv3 Q2_K quantized).\n\n"
    "Model is downloaded automatically from Hugging Face on startup (~2.7 GB). "
    "First load may take a few minutes."
)

_EXAMPLES = [
    "What is machine learning?",
    "Write a Python function to reverse a string.",
    "Explain quantum computing in simple terms.",
    "What are the planets in the solar system?",
]

# Chat front-end wired to chat(): Soft theme with blue/slate accents and
# explicit labels on the retry/undo/clear controls.
# NOTE(review): retry_btn/undo_btn/clear_btn are Gradio 3.x ChatInterface
# kwargs — confirm the pinned gradio version before upgrading past 4.x.
demo = gr.ChatInterface(
    chat,
    title="Llama-2-7B Chatbot",
    description=_DESCRIPTION,
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate"),
    examples=_EXAMPLES,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
)

if __name__ == "__main__":
    demo.launch()