import os
import time
import traceback

import gradio as gr
from huggingface_hub import snapshot_download

# Attempt to import llama_cpp; if it fails, keep the error so the UI can report it
# instead of crashing the whole Space at startup.
try:
    from llama_cpp import Llama
except Exception as e:
    Llama = None
    Llama_import_error = e

# ---------- Configuration Area ----------
# ★★★ Please change this to your model repository ★★★
MODEL_REPO = "Marcus719/Llama-3.2-3B-changedata-Lab2-GGUF"
# Download only the q4_k_m file to prevent running out of disk space.
GGUF_FILENAME = "unsloth.Q4_K_M.gguf"

DEFAULT_N_CTX = 2048      # Context length
DEFAULT_MAX_TOKENS = 256  # Default generation length
DEFAULT_N_THREADS = 2     # Recommended 2 for the free CPU tier
# ----------------------------------------


def log(msg: str) -> None:
    """Print a timestamped log line (flushed so it appears in Spaces logs immediately)."""
    print(f"[app] {time.strftime('%Y-%m-%d %H:%M:%S')} - {msg}", flush=True)


def load_model_from_hub(repo_id: str, filename: str,
                        n_ctx: int = DEFAULT_N_CTX,
                        n_threads: int = DEFAULT_N_THREADS):
    """Download a single GGUF file from the Hugging Face Hub and load it with llama.cpp.

    Args:
        repo_id: Hub repository id, e.g. ``"user/My-Model-GGUF"``.
        filename: Exact GGUF file name inside the repo.
        n_ctx: Context window length passed to ``Llama``.
        n_threads: CPU thread count passed to ``Llama``.

    Returns:
        ``(llm, gguf_path)`` — the loaded ``Llama`` instance and the local file path.

    Raises:
        RuntimeError: if llama-cpp-python failed to import.
        FileNotFoundError: if the requested file is missing after the download.
    """
    if Llama is None:
        raise RuntimeError(
            f"llama-cpp-python not installed or failed to load: {Llama_import_error}"
        )

    # BUGFIX: this message previously printed the literal text "(unknown)"
    # instead of interpolating the requested filename.
    log(f"Starting model download: {repo_id} / {filename} ...")

    # allow_patterns ensures only the single GGUF file is downloaded.
    local_dir = snapshot_download(
        repo_id=repo_id,
        allow_patterns=[filename],
        local_dir_use_symlinks=False,  # Disabling symlinks for stability in Spaces
    )

    # snapshot_download usually preserves directory structure; fall back to a
    # recursive search for robustness if the direct path does not exist.
    gguf_path = os.path.join(local_dir, filename)
    if not os.path.exists(gguf_path):
        for root, _dirs, files in os.walk(local_dir):
            if filename in files:
                gguf_path = os.path.join(root, filename)
                break

    if not os.path.exists(gguf_path):
        # BUGFIX: error message previously said "(unknown)" instead of the filename.
        raise FileNotFoundError(f"Could not find {filename} in {local_dir}")

    log(f"Model path: {gguf_path}. Loading into memory...")
    llm = Llama(model_path=gguf_path, n_ctx=n_ctx, n_threads=n_threads, verbose=False)
    log("Llama model loaded successfully!")
    return llm, gguf_path


def init_model(state):
    """Callback for the Load button: download and load the model into ``state``.

    Idempotent — a second click with a loaded model is a no-op. Errors are
    logged and recorded in ``state["status"]`` rather than raised, so the UI
    keeps working.
    """
    try:
        if state.get("llm") is not None:
            return state  # Already loaded; keep the existing instance.
        log("Received load request...")
        llm, gguf_path = load_model_from_hub(MODEL_REPO, GGUF_FILENAME)
        state["llm"] = llm
        state["gguf_path"] = gguf_path
        state["status"] = "Loaded"
        return state
    except Exception as exc:
        tb = traceback.format_exc()
        log(f"Initialization Error: {exc}\n{tb}")
        # Record the failure instead of silently returning unchanged state.
        state["status"] = f"Load failed: {exc}"
        return state


def generate_response(prompt: str, max_tokens: int, state):
    """Callback for the Generate button: run CPU inference on the loaded model.

    Args:
        prompt: User instruction text.
        max_tokens: Maximum number of tokens to generate.
        state: Shared session dict holding the ``Llama`` instance.

    Returns:
        ``(text, state)`` — the generated text (or an error message) and state.
    """
    try:
        if not prompt or prompt.strip() == "":
            return "Please enter an instruction.", state

        # Lazy loading: auto-load if Generate is clicked before explicit init.
        if state.get("llm") is None:
            try:
                log("Model not detected, attempting auto-load...")
                llm, gguf_path = load_model_from_hub(MODEL_REPO, GGUF_FILENAME)
                state["llm"] = llm
                state["gguf_path"] = gguf_path
            except Exception as e:
                return f"Model Load Failed: {e}", state

        llm = state.get("llm")
        log(f"Generating (Prompt Length={len(prompt)})...")

        # Construct a Llama 3 chat-format prompt by simple concatenation:
        # System + User, then an open assistant header. For strict formatting
        # use tokenizer.apply_chat_template; plain concatenation is kept here
        # for generality and Llama 3 usually understands it.
        system_prompt = "You are a helpful AI assistant."
        full_prompt = (
            f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
            f"<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        )

        # Inference.
        output = llm(
            full_prompt,
            max_tokens=max_tokens,
            stop=["<|eot_id|>"],  # Stop at the end-of-turn token.
            echo=False,
        )
        text = output['choices'][0]['text']
        log("Generation complete.")
        return text, state
    except Exception as exc:
        tb = traceback.format_exc()
        log(f"Generation Error: {exc}\n{tb}")
        return f"Runtime Error: {exc}", state


def soft_clear(current_state):
    """Clear button: only clears the prompt text, keeps the model loaded."""
    return "", current_state


# ---------------- Gradio UI Construction ----------------
# Theme settings
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="slate",
    neutral_hue="slate",
)

# Custom CSS
custom_css = """.footer-text { font-size: 0.8em; color: gray; text-align: center; }"""

# BUGFIX: theme and custom_css were defined but never passed to gr.Blocks,
# so the styling silently never applied.
with gr.Blocks(title="Llama 3.2 Lab2 Project", theme=theme, css=custom_css) as demo:
    # Header — BUGFIX: heading said "(1B)" but MODEL_REPO is a 3B model.
    with gr.Row():
        gr.Markdown("# Llama 3.2 (3B) Fine-Tuned Chatbot")
        gr.Markdown(
            f"""
            **ID2223 Lab 2 Project** | Fine-tuned on **UltraChat-200k-Filtered(only use 100k)**.
            Running on CPU (GGUF 4-bit) | Model: `{MODEL_REPO}`
            """
        )

    # Main layout
    with gr.Row():
        # Left: Input and Controls
        with gr.Column(scale=4):
            with gr.Group():
                prompt_in = gr.Textbox(
                    lines=5,
                    label="User Instruction (User Input)",
                    placeholder="e.g., Explain Quantum Mechanics...",
                    elem_id="prompt-input",
                )

            with gr.Accordion("Advanced Parameters", open=False):
                max_tokens = gr.Slider(
                    minimum=16,
                    maximum=1024,
                    step=16,
                    value=DEFAULT_MAX_TOKENS,
                    label="Max Generation Length (Max Tokens)",
                    info="Longer generations will take more CPU time.",
                )

            with gr.Row():
                init_btn = gr.Button("1. Load Model", variant="secondary")
                gen_btn = gr.Button("2. Generate Response", variant="primary")
                clear_btn = gr.Button("Clear Chat", variant="stop")

        # Right: Output Display
        with gr.Column(scale=6):
            output_txt = gr.Textbox(
                label="Model Response (Response)",
                lines=15,
            )

    # Footer
    with gr.Row():
        gr.Markdown(
            "*Note: Inference runs on a free CPU, so speed may be slow. "
            "The model (approx. 2GB) must be downloaded on first run, please be patient.*",
            elem_classes=["footer-text"],
        )

    # State storage (per-session): model handle, local path, and status text.
    state = gr.State({"llm": None, "gguf_path": None, "status": "Not initialized"})

    # Event binding
    init_btn.click(
        fn=init_model,
        inputs=state,
        outputs=[state],
        show_progress=True,
    )

    gen_btn.click(
        fn=generate_response,
        inputs=[prompt_in, max_tokens, state],
        outputs=[output_txt, state],
        show_progress=True,
    )

    # Two handlers on one click: clear the prompt (keeping the model) and
    # clear the output box.
    clear_btn.click(fn=soft_clear, inputs=[state], outputs=[prompt_in, state])
    clear_btn.click(lambda: "", outputs=[output_txt])

# Launch the application
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)