import gradio as gr
import time
import os
from llama_cpp import Llama # Import the necessary library
import numpy as np
# --- CONFIGURATION ---
# Paths to the quantized (Q4_K_M) GGUF model files; expected to sit next to
# this script. Both are loaded at startup by load_llm() below.
GGUF_MODEL_PATH_1B = "./llama-3.2-1b-summary-q4_k_m.gguf"
GGUF_MODEL_PATH_3B = "./llama-3.2-3b-summary-q4_k_m.gguf"
# System prompt injected into the Llama 3 chat template for every request.
SYSTEM_PROMPT = (
"You are an expert summarization bot. Your task is to provide a comprehensive "
"and concise summary of the user's document based on the requested length."
)
# ----------------------------------------------------
# 1. MODEL LOADING FUNCTION (Runs once on app startup)
# ----------------------------------------------------
def load_llm(model_path):
    """Load a GGUF model from *model_path* for CPU-only inference.

    Returns the loaded ``Llama`` instance on success. On failure, returns a
    stand-in callable with the same call signature that yields an error
    message in the llama-cpp completion format (``{"choices": [{"text": ...}]}``),
    so downstream inference code can treat both cases uniformly.
    """
    print(f"Attempting to load GGUF model: {model_path}...")
    try:
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,  # Ensure it runs on CPU
            n_ctx=2048,      # Context window size
            verbose=True,    # Show loading status in the logs
        )
        print(f"Successfully loaded model: {model_path}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_path}: {e}")
        # BUG FIX: Python deletes the `except ... as e` name when the handler
        # exits, so a lambda that read `e` lazily (as the original did) raised
        # NameError when the placeholder was eventually called. Capture the
        # message eagerly instead.
        error_text = f"Error: Model failed to load ({model_path}). Check logs. Error: {e}"
        return lambda prompt, **kwargs: {"choices": [{"text": error_text}]}
# Load both models globally so the (slow) GGUF load happens exactly once, at
# app startup. On failure load_llm returns an error-reporting stub, so these
# names are always callable.
llm_1b = load_llm(GGUF_MODEL_PATH_1B)
llm_3b = load_llm(GGUF_MODEL_PATH_3B)
# ----------------------------------------------------
# 2. CORE PROCESSING FUNCTION (GGUF Inference)
# ----------------------------------------------------
def generate_summary_and_compare(long_document, selected_model, summary_length):
    """Summarize *long_document* with the chosen model and report latency.

    Returns a ``(summary_text, speed_report)`` pair feeding the two Gradio
    output textboxes. An invalid *selected_model* yields an error string and
    an empty report.
    """
    # Dispatch on the radio-button label to pick one of the preloaded models.
    if "1B" in selected_model:
        model_name_display, selected_llm = "Llama-3.2-1B (Faster)", llm_1b
    elif "3B" in selected_model:
        model_name_display, selected_llm = "Llama-3.2-3B (Higher Quality)", llm_3b
    else:
        return "Error: Invalid model selection.", ""

    # Assemble the Llama 3 instruct prompt (system turn + user turn).
    instruction = f"Please summarize the following document and keep the summary {summary_length}. Document: \n\n{long_document}"
    full_prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )

    started = time.time()
    # Heuristic token budget: larger cap for the "Detailed" option.
    token_budget = 250 if "Detailed" in summary_length else 100
    try:
        result = selected_llm(
            full_prompt,
            max_tokens=token_budget,
            stop=["<|eot_id|>"],  # Llama 3 end-of-turn marker
            temperature=0.7,
            echo=False,
        )
        elapsed = time.time() - started
        summary_text = result["choices"][0]["text"].strip()
    except Exception as e:
        elapsed = time.time() - started
        summary_text = f"Inference Error on {model_name_display}. Error: {e}"

    # Latency report shown alongside the summary (Task 2 comparison output).
    speed_report = f"Model: {model_name_display}\nTotal Latency: {elapsed:.2f} seconds\n(Used for A-grade speed/quality tradeoff analysis)"
    return summary_text, speed_report
# ----------------------------------------------------
# 3. GRADIO INTERFACE DEFINITION (kept same as previous version)
# ----------------------------------------------------
with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
    # Page header and usage notes.
    gr.Markdown("# 📚 LLM Document Summarizer & Model Comparison (KTH Lab 2)")
    gr.Markdown(
        "This tool demonstrates the summarization capability of a fine-tuned LLM. "
        "Select a model and input a document. The speed comparison between 1B and 3B models on CPU fulfills the requirements for Task 2."
    )
    with gr.Row():
        # Left column: the document input plus the two control radios.
        with gr.Column(scale=1):
            input_document = gr.Textbox(
                lines=10,
                label="Paste Long Document or Report Content",
                placeholder="Paste the text you need summarized here...",
            )
            summary_control = gr.Radio(
                ["Concise (under 50 words)", "Detailed (under 200 words)"],
                label="Select Summary Length Requirement",
                value="Concise (under 50 words)",
            )
            model_selector = gr.Radio(
                ["Llama-3.2-1B (Faster)", "Llama-3.2-3B (Higher Quality)"],
                label="Select Model for Comparison (Task 2)",
                value="Llama-3.2-1B (Faster)",
            )
            process_button = gr.Button("Generate Summary & Compare Speed", variant="primary")
        # Right column: the generated summary and the latency report.
        with gr.Column(scale=2):
            output_summary = gr.Textbox(label="Generated Document Summary", lines=15, interactive=False)
            performance_report = gr.Textbox(label="Performance and Latency Report", interactive=False, lines=3)
    # Wire the button to the inference function defined above.
    process_button.click(
        fn=generate_summary_and_compare,
        inputs=[input_document, model_selector, summary_control],
        outputs=[output_summary, performance_report],
    )
demo.launch()