File size: 8,177 Bytes
5881ab0
 
 
 
5374b45
 
bc69f49
5881ab0
 
8665c7a
5881ab0
 
d9a1250
bc69f49
 
2b0c96c
bc69f49
8665c7a
bc69f49
 
 
8665c7a
d9a1250
5881ab0
 
d9a1250
8665c7a
5881ab0
bc69f49
8665c7a
bc69f49
8665c7a
bc69f49
 
8665c7a
 
 
bc69f49
8665c7a
 
bc69f49
 
8665c7a
 
bc69f49
8665c7a
 
 
 
 
bc69f49
 
 
 
8665c7a
bc69f49
8665c7a
bc69f49
5881ab0
5374b45
5881ab0
bc69f49
5881ab0
 
337b19e
8665c7a
bc69f49
 
8665c7a
 
bc69f49
5881ab0
 
8665c7a
337b19e
5881ab0
 
bc69f49
337b19e
d9a1250
5881ab0
bc69f49
5881ab0
 
337b19e
8665c7a
bc69f49
5881ab0
 
bc69f49
8665c7a
5881ab0
 
 
337b19e
bc69f49
5881ab0
8665c7a
bc69f49
8665c7a
bc69f49
8665c7a
bc69f49
 
 
8665c7a
bc69f49
 
8665c7a
 
 
bc69f49
8665c7a
 
 
 
bc69f49
337b19e
5881ab0
 
bc69f49
337b19e
8665c7a
 
bc69f49
337b19e
8665c7a
5881ab0
bc69f49
 
49dc795
 
 
bc69f49
49dc795
bc69f49
 
49dc795
654d90d
49dc795
bc69f49
d9a1250
49dc795
337b19e
49dc795
8665c7a
 
 
49dc795
 
 
337b19e
49dc795
bc69f49
5881ab0
bc69f49
49dc795
 
 
 
bc69f49
 
49dc795
 
bc69f49
337b19e
49dc795
 
 
 
 
bc69f49
 
49dc795
bc69f49
 
337b19e
 
bc69f49
337b19e
bc69f49
 
49dc795
 
bc69f49
49dc795
 
5881ab0
bc69f49
49dc795
 
337b19e
49dc795
 
5881ab0
bc69f49
49dc795
5881ab0
bc69f49
49dc795
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9a1250
bc69f49
d9a1250
8665c7a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import os
import traceback
import time
from huggingface_hub import snapshot_download
import gradio as gr

# Attempt to import llama_cpp, if failed, prompt in the UI
try:
    from llama_cpp import Llama
except Exception as e:
    # Keep the module importable even when llama-cpp-python is missing or
    # broken; load_model_from_hub() raises a RuntimeError with this error
    # later, so the failure surfaces in the UI instead of at import time.
    Llama = None
    Llama_import_error = e

# ---------- Configuration Area ----------
# ★★★ Please change this to your model repository ★★★
MODEL_REPO = "Marcus719/Llama-3.2-1B-Compare1-Lab2-GGUF"
# Specify to download only the q4_k_m file to prevent running out of disk space
GGUF_FILENAME = "unsloth.Q4_K_M.gguf" 
DEFAULT_N_CTX = 2048  # Context length (tokens) passed to llama.cpp
DEFAULT_MAX_TOKENS = 256 # Default generation length (slider default in the UI)
DEFAULT_N_THREADS = 2 # Recommended 2 for free CPU tier
# ------------------------------

def log(msg: str):
    """Print one timestamped app log line, flushed so it appears promptly in Space logs."""
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    print("[app] " + timestamp + " - " + msg, flush=True)

def load_model_from_hub(repo_id: str, filename: str, n_ctx=DEFAULT_N_CTX, n_threads=DEFAULT_N_THREADS):
    """Download a single GGUF file from the Hugging Face Hub and load it with llama.cpp.

    Args:
        repo_id: Hub repository id, e.g. "user/My-Model-GGUF".
        filename: Exact GGUF file name inside the repository.
        n_ctx: Context window size (tokens) passed to Llama.
        n_threads: CPU threads used for inference.

    Returns:
        Tuple of (llm, gguf_path): the loaded Llama instance and the local file path.

    Raises:
        RuntimeError: if llama-cpp-python failed to import at module load.
        FileNotFoundError: if the requested file is missing from the snapshot.
    """
    if Llama is None:
        raise RuntimeError(f"llama-cpp-python not installed or failed to load: {Llama_import_error}")
    
    # BUGFIX: the original logged the literal "(unknown)" instead of the filename.
    log(f"Starting model download: {repo_id} / {filename} ...")
    
    # Use snapshot_download to download a single file.
    # allow_patterns ensures only the GGUF file is downloaded, preventing the
    # whole repository from being fetched and exhausting disk space.
    local_dir = snapshot_download(
        repo_id=repo_id, 
        allow_patterns=[filename],
        local_dir_use_symlinks=False # Disabling symlinks for stability in Spaces (deprecated no-op in recent huggingface_hub)
    )
    
    # Construct full path.
    # snapshot_download usually preserves directory structure, otherwise we search.
    gguf_path = os.path.join(local_dir, filename)
    
    # Search for the file if direct path fails (for robustness)
    if not os.path.exists(gguf_path):
        for root, dirs, files in os.walk(local_dir):
            if filename in files:
                gguf_path = os.path.join(root, filename)
                break
        if not os.path.exists(gguf_path):
            # BUGFIX: the original error said "Could not find (unknown)".
            raise FileNotFoundError(f"Could not find {filename} in {local_dir}")
            
    log(f"Model path: {gguf_path}. Loading into memory...")
    
    # Initialize the model (verbose=False keeps llama.cpp banner noise out of logs)
    llm = Llama(model_path=gguf_path, n_ctx=n_ctx, n_threads=n_threads, verbose=False)
    log("Llama model loaded successfully!")
    return llm, gguf_path

def init_model(state):
    """Handler for the Load Model button: ensure a Llama instance lives in state."""
    try:
        # Idempotent: a second click on Load does nothing but report readiness.
        if state.get("llm") is not None:
            return " System Ready (Model Loaded)", state

        log("Received load request...")
        # Fetch the GGUF file and spin up llama.cpp.
        llm, gguf_path = load_model_from_hub(MODEL_REPO, GGUF_FILENAME)

        # Persist the loaded model in the Gradio session state.
        state.update(llm=llm, gguf_path=gguf_path)
        return " System Ready", state
    except Exception as exc:
        log(f"Initialization Error: {exc}\n{traceback.format_exc()}")
        return f" Initialization Failed: {exc}", state

def generate_response(prompt: str, max_tokens: int, state):
    """Handler for the Generate button: run one Llama-3 chat completion on CPU.

    Returns (response_text, status_message, state) for the three bound outputs.
    """
    try:
        # Guard clause: refuse blank input before touching the model.
        if not (prompt and prompt.strip()):
            return " Please enter an instruction.", " Idle", state

        # Lazy loading: auto-load if Generate is clicked without explicit initialization.
        if state.get("llm") is None:
            try:
                log("Model not detected, attempting auto-load...")
                llm, gguf_path = load_model_from_hub(MODEL_REPO, GGUF_FILENAME)
                state["llm"] = llm
                state["gguf_path"] = gguf_path
            except Exception as e:
                return f" Model Load Failed: {e}", " Error", state

        llm = state.get("llm")
        log(f"Generating (Prompt Length={len(prompt)})...")

        # Hand-rolled Llama 3 chat template: system + user turns, then an open
        # assistant header. tokenizer.apply_chat_template would be stricter,
        # but plain text concatenation is enough for Llama 3.
        system_prompt = "You are a helpful AI assistant."
        full_prompt = (
            "<|start_header_id|>system<|end_header_id|>\n\n"
            f"{system_prompt}<|eot_id|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{prompt}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )

        # Inference; stop at the end-of-turn token so the model yields one reply.
        completion = llm(
            full_prompt,
            max_tokens=max_tokens,
            stop=["<|eot_id|>"],
            echo=False,
        )

        log("Generation complete.")
        return completion['choices'][0]['text'], " Generation Complete", state
    except Exception as exc:
        log(f"Generation Error: {exc}\n{traceback.format_exc()}")
        return f"Runtime Error: {exc}", " Exception", state

def soft_clear(current_state):
    """Handler for the Clear button: blank the prompt box but keep the loaded model."""
    if current_state.get("llm"):
        status = " System Ready"
    else:
        status = " Not Initialized"
    return "", status, current_state

# ---------------- Gradio UI Construction ----------------
# Theme settings
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="slate",
    neutral_hue="slate")

# Custom CSS for the footer note (targets elem_classes=["footer-text"] below)
custom_css = """.footer-text { font-size: 0.8em; color: gray; text-align: center; }"""

# BUGFIX: theme and custom_css were defined but never passed to gr.Blocks,
# so neither the Soft theme nor the .footer-text styling took effect.
with gr.Blocks(title="Llama 3.2 Lab2 Project", theme=theme, css=custom_css) as demo:
    
    # Header: title/description on the left, status indicator on the right
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("#  Llama 3.2 (1B) Fine-Tuned Chatbot")
            gr.Markdown(
                f"""
                **ID2223 Lab 2 Project** | Fine-tuned on **FineTome-100k**.
                Running on CPU (GGUF 4-bit) | Model: `{MODEL_REPO}`
                """
            )
        with gr.Column(scale=0, min_width=150):
            status_label = gr.Label(value=" Not Initialized", label="System Status", show_label=False)

    # Main layout
    with gr.Row():
        # Left: Input and Controls
        with gr.Column(scale=4):
            with gr.Group():
                prompt_in = gr.Textbox(
                    lines=5, 
                    label="User Instruction (User Input)", 
                    placeholder="e.g., Explain Quantum Mechanics...",
                    elem_id="prompt-input"
                )
                                
                with gr.Accordion(" Advanced Parameters", open=False):
                    max_tokens = gr.Slider(
                        minimum=16, 
                        maximum=1024, 
                        step=16, 
                        value=DEFAULT_MAX_TOKENS, 
                        label="Max Generation Length (Max Tokens)",
                        info="Longer generations will take more CPU time."
                    )
                        
                with gr.Row():
                    init_btn = gr.Button(" 1. Load Model", variant="secondary")
                    gen_btn = gr.Button(" 2. Generate Response", variant="primary")
                        
                clear_btn = gr.Button(" Clear Chat", variant="stop")

        # Right: Output Display
        with gr.Column(scale=6):
            output_txt = gr.Textbox(
                label="Model Response (Response)", 
                lines=15, 
            )

    # Footer
    with gr.Row():
        gr.Markdown(
            " *Note: Inference runs on a free CPU, so speed may be slow. The model (approx. 2GB) must be downloaded on first run, please be patient.*",
            elem_classes=["footer-text"]
        )

    # Per-session state: the loaded Llama instance and its local path
    state = gr.State({"llm": None, "gguf_path": None, "status": "Not initialized"})

    # Event binding
    init_btn.click(
        fn=init_model, 
        inputs=state, 
        outputs=[status_label, state],
        show_progress=True
    )
    
    gen_btn.click(
        fn=generate_response, 
        inputs=[prompt_in, max_tokens, state], 
        outputs=[output_txt, status_label, state],
        show_progress=True
    )
    
    # Clear resets the prompt/status via soft_clear and blanks the output separately.
    clear_btn.click(fn=soft_clear, inputs=[state], outputs=[prompt_in, status_label, state])
    clear_btn.click(lambda: "", outputs=[output_txt])

# Launch the application (0.0.0.0:7860 is the standard Spaces binding)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)