File size: 4,279 Bytes
01be4af
 
f3adf75
 
691a18b
 
 
 
1a7c573
 
 
 
 
 
 
7b8ee94
 
1a7c573
 
 
 
7b8ee94
691a18b
 
 
633446c
 
 
f3adf75
7b8ee94
f3adf75
1a7c573
f3adf75
691a18b
 
f3adf75
7b8ee94
f3adf75
7b8ee94
f3adf75
 
1a7c573
691a18b
 
1a7c573
f3adf75
 
7b8ee94
f3adf75
 
691a18b
 
d9d9611
691a18b
f3adf75
691a18b
 
 
 
d9d9611
691a18b
 
f3adf75
 
 
7b8ee94
691a18b
f3adf75
d9d9611
1a7c573
d401a5e
 
f3adf75
7b8ee94
f3adf75
9e8c9e9
691a18b
 
 
 
 
 
 
 
 
 
f3adf75
 
691a18b
7b8ee94
d401a5e
 
691a18b
d401a5e
7b8ee94
691a18b
d401a5e
 
f3adf75
 
4753747
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import subprocess, sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio==4.44.0", "--force-reinstall", "--no-deps"])
import gradio as gr

try:
    from llama_cpp import Llama
    print("llama-cpp-python already installed.")
except ImportError:
    print("Installing llama-cpp-python (fast CPU wheel)...")
    try:
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--no-cache-dir",
            "https://github.com/yownas/llama-cpp-python-wheels/releases/download/v0.3.16/llama_cpp_python-0.3.16+cpuavx-cp310-cp310-linux_x86_64.whl"
        ])
        print("llama-cpp-python installed from wheel.")
    except Exception as e:
        print("Wheel failed β†’ falling back to PyPI...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--no-cache-dir",
            "llama-cpp-python==0.3.16", "--force-reinstall"
        ])
    from llama_cpp import Llama  # ← INDENTED CORRECTLY

from huggingface_hub import hf_hub_download

# === BULLETPROOF WORKING MODEL (hugging-quants repo) ===
MODEL_REPO = "hugging-quants/Llama-3.2-3B-Instruct-Q4_K_M-GGUF"
MODEL_FILE = "llama-3.2-3b-instruct-q4_k_m.gguf"

print("Downloading Llama 3.2 3B Instruct (Q4_K_M)...")
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="./models",
    local_dir_use_symlinks=False
)
print(f"Model downloaded: {model_path}")

print("Loading model into memory (20–40 sec)...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_threads=8,
    n_batch=512,
    n_gpu_layers=0,
    verbose=False
)
print("Model loaded β€” ready to chat!")

def chat(message, history):
    if not message.strip():
        return history, ""

    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        stream=False
    )

    bot_response = response["choices"][0]["message"]["content"].strip()
    history.append((message, bot_response))
    return history, ""

# === CSS & UI (perfect) ===
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&family=Source+Code+Pro:wght@400;600&display=swap');
body, .gradio-container { background: #0c0c0c !important; font-family: 'JetBrains Mono', monospace !important; }
.gradio-container { max-width: 1400px !important; border: 1px solid #00ff00 !important; box-shadow: 0 0 10px rgba(0,255,0,0.3) !important; }
*, h1, h2, h3, label, p { color: #00ff00 !important; }
.message { background: #1a1a1a !important; border-left: 3px solid #00ff00 !important; padding: 12px !important; }
.user { border-left-color: #00cc00 !important; }
input, textarea { background: #1a1a1a !important; border: 1px solid #00ff00 !important; color: #00ff00 !important; }
button { background: #1a1a1a !important; border: 1px solid #00ff00 !important; color: #00ff00 !important; }
button:hover { background: #00ff00 !important; color: #000 !important; }
.primary { background: #00ff00 !important; color: #000 !important; }
footer { display: none !important; }
"""

with gr.Blocks(theme=gr.themes.Base(primary_hue="green"), css=custom_css, title="$ LLAMA TERMINAL") as demo:
    gr.Markdown("# $ LLAMA TERMINAL\n```\n> System Online | Llama 3.2 3B Ready\n> Type your query below...\n```")
    chatbot = gr.Chatbot(height=600)
    with gr.Row():
        msg = gr.Textbox(placeholder="$ Enter command...", show_label=False, scale=8, container=False)
        submit = gr.Button("SEND", scale=1, variant="primary")
    gr.Examples(["What is the capital of France?", "Write a haiku about AI"], inputs=msg)
    gr.ClearButton([msg, chatbot], value="CLEAR")
    submit.click(chat, [msg, chatbot], [chatbot, msg])
    msg.submit(chat, [msg, chatbot], [chatbot, msg])

if __name__ == "__main__":
    demo.queue().launch(share=True, server_name="0.0.0.0", server_port=7860)