import gradio as gr
import time
import os
from llama_cpp import Llama  # GGUF inference backend (llama-cpp-python)
import numpy as np

# --- CONFIGURATION ---
# Paths to the quantized GGUF model files (uploaded alongside this app).
GGUF_MODEL_PATH_1B = "./llama-3.2-1b-summary-q4_k_m.gguf"
GGUF_MODEL_PATH_3B = "./llama-3.2-3b-summary-q4_k_m.gguf"

# System prompt for the summarization task (simple instruction format).
SYSTEM_PROMPT = (
    "You are an expert summarization bot. Your task is to provide a comprehensive "
    "and concise summary of the user's document based on the requested length."
)


# ----------------------------------------------------
# 1. MODEL LOADING FUNCTION (Runs once on app startup)
# ----------------------------------------------------
def load_llm(model_path):
    """Load a GGUF model for CPU-only inference.

    Parameters
    ----------
    model_path : str
        Filesystem path to the ``.gguf`` model file.

    Returns
    -------
    Llama or callable
        A ``Llama`` instance on success. On failure, a callable with the
        same calling convention that returns an error payload, so the UI
        keeps working even when a model cannot be loaded.
    """
    print(f"Attempting to load GGUF model: {model_path}...")
    try:
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,  # force CPU-only execution
            n_ctx=2048,      # context window size
            verbose=True,    # show loading status in the logs
        )
        print(f"Successfully loaded model: {model_path}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_path}: {e}")
        # BUGFIX: build the message eagerly. Python 3 unbinds `e` when the
        # except block exits, so a lambda whose f-string references `e`
        # would raise NameError when it is eventually called.
        error_text = (
            f"Error: Model failed to load ({model_path}). Check logs. Error: {e}"
        )
        return lambda prompt, **kwargs: {"choices": [{"text": error_text}]}


# Load models globally so they are loaded only once at startup.
llm_1b = load_llm(GGUF_MODEL_PATH_1B)
llm_3b = load_llm(GGUF_MODEL_PATH_3B)


# ----------------------------------------------------
# 2. CORE PROCESSING FUNCTION (GGUF Inference)
# ----------------------------------------------------
def generate_summary_and_compare(long_document, selected_model, summary_length):
    """Summarize *long_document* with the selected model and report latency.

    Parameters
    ----------
    long_document : str
        The text to summarize.
    selected_model : str
        Radio label; must contain "1B" or "3B" to pick a model.
    summary_length : str
        Radio label; a "Detailed" substring raises the token budget.

    Returns
    -------
    tuple[str, str]
        ``(summary_text, speed_report)`` — the second element is empty on
        input/selection errors.
    """
    # Robustness: nothing to summarize.
    if not long_document or not long_document.strip():
        return "Error: Please paste a document to summarize.", ""

    # 1. Select the model and its display name.
    if "1B" in selected_model:
        selected_llm = llm_1b
        model_name_display = "Llama-3.2-1B (Faster)"
    elif "3B" in selected_model:
        selected_llm = llm_3b
        model_name_display = "Llama-3.2-3B (Higher Quality)"
    else:
        return "Error: Invalid model selection.", ""

    # 2. Build the instruction prompt using the Llama 3 chat template.
    instruction = f"Please summarize the following document and keep the summary {summary_length}. Document: \n\n{long_document}"
    full_prompt = (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )

    # 3. Run inference and measure wall-clock latency.
    start_time = time.time()
    # Heuristic token budget based on the requested summary length.
    max_tokens = 250 if "Detailed" in summary_length else 100
    try:
        output = selected_llm(
            full_prompt,
            max_tokens=max_tokens,
            stop=["<|eot_id|>"],  # Llama 3 end-of-turn marker
            temperature=0.7,
            echo=False,           # do not echo the prompt in the output
        )
        total_latency = time.time() - start_time
        summary_output = output["choices"][0]["text"].strip()
    except Exception as e:
        total_latency = time.time() - start_time
        summary_output = f"Inference Error on {model_name_display}. Error: {e}"

    # 4. Generate the performance report (Task 2 output).
    speed_report = (
        f"Model: {model_name_display}\nTotal Latency: {total_latency:.2f} seconds\n"
        f"(Used for A-grade speed/quality tradeoff analysis)"
    )
    return summary_output, speed_report
# ----------------------------------------------------
# 3. GRADIO INTERFACE DEFINITION (kept same as previous version)
# ----------------------------------------------------
with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
    gr.Markdown("# 📚 LLM Document Summarizer & Model Comparison (KTH Lab 2)")
    gr.Markdown(
        "This tool demonstrates the summarization capability of a fine-tuned LLM. "
        "Select a model and input a document. The speed comparison between 1B and 3B models on CPU fulfills the requirements for Task 2."
    )

    with gr.Row():
        # Left-hand column: document input plus the run controls.
        with gr.Column(scale=1):
            document_box = gr.Textbox(
                label="Paste Long Document or Report Content",
                placeholder="Paste the text you need summarized here...",
                lines=10,
            )
            length_choice = gr.Radio(
                choices=["Concise (under 50 words)", "Detailed (under 200 words)"],
                label="Select Summary Length Requirement",
                value="Concise (under 50 words)",
            )
            model_choice = gr.Radio(
                choices=["Llama-3.2-1B (Faster)", "Llama-3.2-3B (Higher Quality)"],
                label="Select Model for Comparison (Task 2)",
                value="Llama-3.2-1B (Faster)",
            )
            run_button = gr.Button(
                "Generate Summary & Compare Speed", variant="primary"
            )

        # Right-hand column: the generated summary and the latency report.
        with gr.Column(scale=2):
            summary_box = gr.Textbox(
                label="Generated Document Summary",
                lines=15,
                interactive=False,
            )
            latency_box = gr.Textbox(
                label="Performance and Latency Report",
                lines=3,
                interactive=False,
            )

    # Wire the button click to the inference function.
    run_button.click(
        fn=generate_summary_and_compare,
        inputs=[document_box, model_choice, length_choice],
        outputs=[summary_box, latency_box],
    )

demo.launch()