import subprocess, sys

# Pin Gradio 4.x (the Chatbot below uses tuple-style history, the 4.x default)
subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio==4.44.0", "--force-reinstall", "--no-deps"])

import gradio as gr

try:
    from llama_cpp import Llama
    print("llama-cpp-python already installed.")
except ImportError:
    print("Installing llama-cpp-python (fast CPU wheel)...")
    try:
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--no-cache-dir",
            "https://github.com/yownas/llama-cpp-python-wheels/releases/download/v0.3.16/llama_cpp_python-0.3.16+cpuavx-cp310-cp310-linux_x86_64.whl"
        ])
        print("llama-cpp-python installed from wheel.")
    except Exception as e:
        print(f"Wheel install failed ({e}); falling back to PyPI build...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--no-cache-dir",
            "llama-cpp-python==0.3.16", "--force-reinstall"
        ])
    from llama_cpp import Llama  # import now that the package is installed

from huggingface_hub import hf_hub_download

# === Model: Llama 3.2 3B Instruct, Q4_K_M GGUF (hugging-quants repo) ===
MODEL_REPO = "hugging-quants/Llama-3.2-3B-Instruct-Q4_K_M-GGUF"
MODEL_FILE = "llama-3.2-3b-instruct-q4_k_m.gguf"

print("Downloading Llama 3.2 3B Instruct (Q4_K_M)...")
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="./models",
    local_dir_use_symlinks=False  # deprecated (a no-op on recent huggingface_hub)
)
print(f"Model downloaded: {model_path}")

print("Loading model into memory (20-40 sec)...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,      # context window in tokens
    n_threads=8,     # CPU threads
    n_batch=512,     # prompt-processing batch size
    n_gpu_layers=0,  # CPU only
    verbose=False
)
print("Model loaded, ready to chat!")

def chat(message, history):
    """Append one user turn, run the model, and return the updated history."""
    if not message.strip():
        return history, ""

    # Rebuild the whole conversation in chat-completion message format
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        stream=False
    )
    bot_response = response["choices"][0]["message"]["content"].strip()
    history.append((message, bot_response))
    return history, ""
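# --- Optional streaming variant (a sketch, not part of the original script) ---
# Gradio runs a generator event handler as a streaming response, and
# llama-cpp-python's create_chat_completion yields OpenAI-style chunks when
# stream=True. This assumes the same `llm`, tuple-style history, and sampling
# settings as `chat`; to try it, wire chat_stream into the click/submit events
# below in place of chat.
def chat_stream(message, history):
    if not message.strip():
        yield history, ""
        return
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    partial = ""
    history = history + [(message, partial)]
    for chunk in llm.create_chat_completion(
        messages=messages, max_tokens=512, temperature=0.7, top_p=0.9,
        stop=["<|eot_id|>", "<|end_of_text|>"], stream=True
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:  # first/last chunks may carry no content
            partial += delta["content"]
            history[-1] = (message, partial)
            yield history, ""
    yield history, ""  # finalize even if the stream produced no content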
# === CSS & UI ===
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&family=Source+Code+Pro:wght@400;600&display=swap');
body, .gradio-container { background: #0c0c0c !important; font-family: 'JetBrains Mono', monospace !important; }
.gradio-container { max-width: 1400px !important; border: 1px solid #00ff00 !important; box-shadow: 0 0 10px rgba(0,255,0,0.3) !important; }
*, h1, h2, h3, label, p { color: #00ff00 !important; }
.message { background: #1a1a1a !important; border-left: 3px solid #00ff00 !important; padding: 12px !important; }
.user { border-left-color: #00cc00 !important; }
input, textarea { background: #1a1a1a !important; border: 1px solid #00ff00 !important; color: #00ff00 !important; }
button { background: #1a1a1a !important; border: 1px solid #00ff00 !important; color: #00ff00 !important; }
button:hover { background: #00ff00 !important; color: #000 !important; }
.primary { background: #00ff00 !important; color: #000 !important; }
footer { display: none !important; }
"""

with gr.Blocks(theme=gr.themes.Base(primary_hue="green"), css=custom_css, title="$ LLAMA TERMINAL") as demo:
    gr.Markdown("# $ LLAMA TERMINAL\n```\n> System Online | Llama 3.2 3B Ready\n> Type your query below...\n```")
    chatbot = gr.Chatbot(height=600)
    with gr.Row():
        msg = gr.Textbox(placeholder="$ Enter command...", show_label=False, scale=8, container=False)
        submit = gr.Button("SEND", scale=1, variant="primary")
    gr.Examples(["What is the capital of France?", "Write a haiku about AI"], inputs=msg)
    gr.ClearButton([msg, chatbot], value="CLEAR")

    submit.click(chat, [msg, chatbot], [chatbot, msg])
    msg.submit(chat, [msg, chatbot], [chatbot, msg])

if __name__ == "__main__":
    demo.queue().launch(share=True, server_name="0.0.0.0", server_port=7860)
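# Sanity check without the UI (a sketch; run it instead of launch(), which
# blocks, to confirm the model loads and responds):
#
#   out = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Reply with one word: pong"}],
#       max_tokens=8,
#   )
#   print(out["choices"][0]["message"]["content"])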