Maoxt committed on
Commit
13bc5be
·
verified ·
1 Parent(s): c2b5297

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -28
app.py CHANGED
@@ -1,52 +1,105 @@
1
  import gradio as gr
2
  import time
3
  import os
4
- import sys
 
5
 
6
- # --- PLACEHOLDERS / CONSTANTS ---
7
- # TODO: Replace with your actual GGUF model paths after export
8
- GGUF_MODEL_PATH_1B = "llama-3.2-1b-summary-q4_k_m.gguf"
9
- GGUF_MODEL_PATH_3B = "llama-3.2-3b-summary-q4_k_m.gguf"
10
 
11
- # NOTE: In a real implementation, you would use a library like llama-cpp-python
12
- # to load these GGUF files and perform inference on the CPU.
 
 
 
13
 
14
  # ----------------------------------------------------
15
- # 1. CORE PROCESSING FUNCTION (Simulated for Frontend Setup)
16
  # ----------------------------------------------------
17
- def generate_summary_and_compare(long_document, selected_model, summary_length):
18
- start_time = time.time()
19
 
20
- # --- A-GRADE MODEL SELECTION AND INFERENCE LOGIC ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- # Simulation based on model selection (Task 2 Comparison)
23
  if "1B" in selected_model:
24
- # Simulate calling the 1B GGUF model inference function
25
- inference_time_sim = 1.0 # Simulating faster speed
26
- model_name_display = "Llama-3.2-1B (Optimized GGUF)"
27
- # Simulated summary output
28
- summary_output = f"[1B Summary] The key finding of this document is: {long_document[:50]}... (Requested length: {summary_length}). This model prioritizes speed."
29
  elif "3B" in selected_model:
30
- # Simulate calling the 3B GGUF model inference function
31
- inference_time_sim = 2.5 # Simulating slower speed
32
- model_name_display = "Llama-3.2-3B (High Quality GGUF)"
33
- summary_output = f"[3B Summary] This comprehensive report finds that the main conclusions are: {long_document[:70]}... (Requested length: {summary_length}). This model prioritizes quality."
34
  else:
35
- return "Error: Please select a model.", ""
 
 
 
36
 
37
- time.sleep(inference_time_sim) # Simulate inference latency (CPU bound)
 
 
 
 
38
 
39
- end_time = time.time()
40
- total_latency = end_time - start_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # Report to highlight the A-grade Task 2 comparison result
43
  speed_report = f"Model: {model_name_display}\nTotal Latency: {total_latency:.2f} seconds\n(Used for A-grade speed/quality tradeoff analysis)"
44
 
45
  return summary_output, speed_report
46
 
47
 
48
  # ----------------------------------------------------
49
- # 2. GRADIO INTERFACE DEFINITION (using Blocks for enhanced UI)
50
  # ----------------------------------------------------
51
  with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
52
  gr.Markdown(f"# 📚 LLM Document Summarizer & Model Comparison (KTH Lab 2)")
@@ -64,7 +117,6 @@ with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
64
  placeholder="Paste the text you need summarized here..."
65
  )
66
 
67
- # Control component specific to the summarization task
68
  summary_control = gr.Radio(
69
  ["Concise (under 50 words)", "Detailed (under 200 words)"],
70
  label="Select Summary Length Requirement",
 
1
  import gradio as gr
2
  import time
3
  import os
4
+ from llama_cpp import Llama # Import the necessary library
5
+ import numpy as np
6
 
7
+ # --- CONFIGURATION ---
8
+ # Define the paths to your uploaded GGUF files
9
+ GGUF_MODEL_PATH_1B = "./llama-3.2-1b-summary-q4_k_m.gguf"
10
+ GGUF_MODEL_PATH_3B = "./llama-3.2-3b-summary-q4_k_m.gguf"
11
 
12
+ # Define the Prompt template for summarization (using a simple instruction format)
13
+ SYSTEM_PROMPT = (
14
+ "You are an expert summarization bot. Your task is to provide a comprehensive "
15
+ "and concise summary of the user's document based on the requested length."
16
+ )
17
 
18
  # ----------------------------------------------------
19
+ # 1. MODEL LOADING FUNCTION (Runs once on app startup)
20
  # ----------------------------------------------------
21
def load_llm(model_path):
    """Load a GGUF model for CPU-only inference via llama-cpp-python.

    Parameters
    ----------
    model_path : str
        Filesystem path to the ``.gguf`` model file.

    Returns
    -------
    A ``Llama`` instance on success. On failure, a fallback callable with
    the same calling convention (``fallback(prompt, **kwargs)``) that
    returns a completion-shaped dict carrying the error message, so the
    caller's inference code works unchanged without special-casing.
    """
    print(f"Attempting to load GGUF model: {model_path}...")

    # n_gpu_layers=0 forces CPU execution; verbose=True surfaces
    # llama.cpp loading progress in the app logs.
    try:
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,  # Ensure it runs on CPU
            n_ctx=2048,      # Context window size
            verbose=True
        )
        print(f"Successfully loaded model: {model_path}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_path}: {e}")
        # BUG FIX: the original returned a lambda that closed over `e`,
        # but Python deletes the `except ... as e` binding when the
        # handler exits, so calling the fallback later raised NameError.
        # Capture the formatted message in a local first.
        error_text = f"Error: Model failed to load ({model_path}). Check logs. Error: {e}"
        return lambda prompt, **kwargs: {"choices": [{"text": error_text}]}
39
+
40
+
41
# Load both models once at app startup (module import time) so every
# Gradio request reuses the already-resident weights instead of
# re-reading the GGUF files on each call.
llm_1b = load_llm(GGUF_MODEL_PATH_1B)
llm_3b = load_llm(GGUF_MODEL_PATH_3B)
44
+
45
+
46
+ # ----------------------------------------------------
47
+ # 2. CORE PROCESSING FUNCTION (GGUF Inference)
48
+ # ----------------------------------------------------
49
def generate_summary_and_compare(long_document, selected_model, summary_length):
    """Summarize *long_document* with the selected GGUF model and time it.

    Parameters
    ----------
    long_document : str
        The text to be summarized.
    selected_model : str
        Radio-button label; must contain "1B" or "3B" to pick a model.
    summary_length : str
        Length requirement label; "Detailed" raises the token budget.

    Returns
    -------
    tuple[str, str]
        ``(summary_text, speed_report)``; on an unrecognized model choice,
        an error message and an empty report instead.
    """
    # Map the radio-button choice onto a loaded model handle.
    if "1B" in selected_model:
        llm, display_name = llm_1b, "Llama-3.2-1B (Faster)"
    elif "3B" in selected_model:
        llm, display_name = llm_3b, "Llama-3.2-3B (Higher Quality)"
    else:
        return "Error: Invalid model selection.", ""

    # Compose the Llama-3 chat-format prompt (system turn + user turn).
    user_turn = f"Please summarize the following document and keep the summary {summary_length}. Document: \n\n{long_document}"
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_turn}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )

    # Heuristic output budget: larger for the "Detailed" option.
    token_budget = 250 if "Detailed" in summary_length else 100

    # Run inference and measure wall-clock latency (used for the
    # speed/quality tradeoff comparison in the report below).
    started = time.time()
    try:
        result = llm(
            prompt,
            max_tokens=token_budget,
            stop=["<|eot_id|>"],  # Llama-3 end-of-turn marker
            temperature=0.7,
            echo=False,
        )
        elapsed = time.time() - started
        summary_text = result["choices"][0]["text"].strip()
    except Exception as e:
        elapsed = time.time() - started
        summary_text = f"Inference Error on {display_name}. Error: {e}"

    report = f"Model: {display_name}\nTotal Latency: {elapsed:.2f} seconds\n(Used for A-grade speed/quality tradeoff analysis)"
    return summary_text, report
99
 
100
 
101
  # ----------------------------------------------------
102
+ # 3. GRADIO INTERFACE DEFINITION (kept same as previous version)
103
  # ----------------------------------------------------
104
  with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
105
  gr.Markdown(f"# 📚 LLM Document Summarizer & Model Comparison (KTH Lab 2)")
 
117
  placeholder="Paste the text you need summarized here..."
118
  )
119
 
 
120
  summary_control = gr.Radio(
121
  ["Concise (under 50 words)", "Detailed (under 200 words)"],
122
  label="Select Summary Length Requirement",