import gradio as gr
import time
import os
from llama_cpp import Llama  # GGUF inference backend (llama-cpp-python)
import numpy as np

# --- CONFIGURATION ---
# Paths to the quantized GGUF model files (uploaded alongside this app).
GGUF_MODEL_PATH_1B = "./llama-3.2-1b-summary-q4_k_m.gguf"
GGUF_MODEL_PATH_3B = "./llama-3.2-3b-summary-q4_k_m.gguf"

# System prompt for the summarization task (simple instruction format).
SYSTEM_PROMPT = (
    "You are an expert summarization bot. Your task is to provide a comprehensive "
    "and concise summary of the user's document based on the requested length."
)


# ----------------------------------------------------
# 1. MODEL LOADING FUNCTION (Runs once on app startup)
# ----------------------------------------------------
def load_llm(model_path):
    """Load a GGUF model for CPU-only inference.

    Parameters
    ----------
    model_path : str
        Filesystem path to the ``.gguf`` model file.

    Returns
    -------
    Llama or callable
        A ``Llama`` instance on success. On failure, a callable with the
        same calling convention that returns an error payload, so the UI
        keeps working even when a model cannot be loaded.
    """
    print(f"Attempting to load GGUF model: {model_path}...")
    try:
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,  # force CPU-only execution
            n_ctx=2048,      # context window size
            verbose=True,    # show loading status in the logs
        )
        print(f"Successfully loaded model: {model_path}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_path}: {e}")
        # BUGFIX: build the message eagerly. Python 3 unbinds `e` when the
        # except block exits, so a lambda whose f-string references `e`
        # would raise NameError when it is eventually called.
        error_text = (
            f"Error: Model failed to load ({model_path}). Check logs. Error: {e}"
        )
        return lambda prompt, **kwargs: {"choices": [{"text": error_text}]}


# Load models globally so they are loaded only once at startup.
llm_1b = load_llm(GGUF_MODEL_PATH_1B)
llm_3b = load_llm(GGUF_MODEL_PATH_3B)


# ----------------------------------------------------
# 2. CORE PROCESSING FUNCTION (GGUF Inference)
# ----------------------------------------------------
def generate_summary_and_compare(long_document, selected_model, summary_length):
    """Summarize *long_document* with the selected model and report latency.

    Parameters
    ----------
    long_document : str
        The text to summarize.
    selected_model : str
        Radio label; must contain "1B" or "3B" to pick a model.
    summary_length : str
        Radio label; a "Detailed" substring raises the token budget.

    Returns
    -------
    tuple[str, str]
        ``(summary_text, speed_report)`` — the second element is empty on
        input/selection errors.
    """
    # Robustness: nothing to summarize.
    if not long_document or not long_document.strip():
        return "Error: Please paste a document to summarize.", ""

    # 1. Select the model and its display name.
    if "1B" in selected_model:
        selected_llm = llm_1b
        model_name_display = "Llama-3.2-1B (Faster)"
    elif "3B" in selected_model:
        selected_llm = llm_3b
        model_name_display = "Llama-3.2-3B (Higher Quality)"
    else:
        return "Error: Invalid model selection.", ""

    # 2. Build the instruction prompt using the Llama 3 chat template.
    instruction = f"Please summarize the following document and keep the summary {summary_length}. Document: \n\n{long_document}"
    full_prompt = (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )

    # 3. Run inference and measure wall-clock latency.
    start_time = time.time()
    # Heuristic token budget based on the requested summary length.
    max_tokens = 250 if "Detailed" in summary_length else 100
    try:
        output = selected_llm(
            full_prompt,
            max_tokens=max_tokens,
            stop=["<|eot_id|>"],  # Llama 3 end-of-turn marker
            temperature=0.7,
            echo=False,           # do not echo the prompt in the output
        )
        total_latency = time.time() - start_time
        summary_output = output["choices"][0]["text"].strip()
    except Exception as e:
        total_latency = time.time() - start_time
        summary_output = f"Inference Error on {model_name_display}. Error: {e}"

    # 4. Generate the performance report (Task 2 output).
    speed_report = (
        f"Model: {model_name_display}\nTotal Latency: {total_latency:.2f} seconds\n"
        f"(Used for A-grade speed/quality tradeoff analysis)"
    )
    return summary_output, speed_report
# ----------------------------------------------------
# 3. GRADIO INTERFACE DEFINITION (kept same as previous version)
# ----------------------------------------------------
with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
    gr.Markdown("# 📚 LLM Document Summarizer & Model Comparison (KTH Lab 2)")
    gr.Markdown(
        "This tool demonstrates the summarization capability of a fine-tuned LLM. "
        "Select a model and input a document. The speed comparison between 1B and 3B models on CPU fulfills the requirements for Task 2."
    )

    with gr.Row():
        # Left-hand column: document input plus the run controls.
        with gr.Column(scale=1):
            document_box = gr.Textbox(
                label="Paste Long Document or Report Content",
                placeholder="Paste the text you need summarized here...",
                lines=10,
            )
            length_choice = gr.Radio(
                choices=["Concise (under 50 words)", "Detailed (under 200 words)"],
                label="Select Summary Length Requirement",
                value="Concise (under 50 words)",
            )
            model_choice = gr.Radio(
                choices=["Llama-3.2-1B (Faster)", "Llama-3.2-3B (Higher Quality)"],
                label="Select Model for Comparison (Task 2)",
                value="Llama-3.2-1B (Faster)",
            )
            run_button = gr.Button(
                "Generate Summary & Compare Speed", variant="primary"
            )

        # Right-hand column: the generated summary and the latency report.
        with gr.Column(scale=2):
            summary_box = gr.Textbox(
                label="Generated Document Summary",
                lines=15,
                interactive=False,
            )
            latency_box = gr.Textbox(
                label="Performance and Latency Report",
                lines=3,
                interactive=False,
            )

    # Wire the button click to the inference function.
    run_button.click(
        fn=generate_summary_and_compare,
        inputs=[document_box, model_choice, length_choice],
        outputs=[summary_box, latency_box],
    )

demo.launch()