Maoxt committed on
Commit
13bc5be
·
verified ·
1 Parent(s): c2b5297

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -28
app.py CHANGED
@@ -1,52 +1,105 @@
1
  import gradio as gr
2
  import time
3
  import os
4
- import sys
 
5
 
6
- # --- PLACEHOLDERS / CONSTANTS ---
7
- # TODO: Replace with your actual GGUF model paths after export
8
- GGUF_MODEL_PATH_1B = "llama-3.2-1b-summary-q4_k_m.gguf"
9
- GGUF_MODEL_PATH_3B = "llama-3.2-3b-summary-q4_k_m.gguf"
10
 
11
- # NOTE: In a real implementation, you would use a library like llama-cpp-python
12
- # to load these GGUF files and perform inference on the CPU.
 
 
 
13
 
14
  # ----------------------------------------------------
15
- # 1. CORE PROCESSING FUNCTION (Simulated for Frontend Setup)
16
  # ----------------------------------------------------
17
- def generate_summary_and_compare(long_document, selected_model, summary_length):
18
- start_time = time.time()
19
 
20
- # --- A-GRADE MODEL SELECTION AND INFERENCE LOGIC ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- # Simulation based on model selection (Task 2 Comparison)
23
  if "1B" in selected_model:
24
- # Simulate calling the 1B GGUF model inference function
25
- inference_time_sim = 1.0 # Simulating faster speed
26
- model_name_display = "Llama-3.2-1B (Optimized GGUF)"
27
- # Simulated summary output
28
- summary_output = f"[1B Summary] The key finding of this document is: {long_document[:50]}... (Requested length: {summary_length}). This model prioritizes speed."
29
  elif "3B" in selected_model:
30
- # Simulate calling the 3B GGUF model inference function
31
- inference_time_sim = 2.5 # Simulating slower speed
32
- model_name_display = "Llama-3.2-3B (High Quality GGUF)"
33
- summary_output = f"[3B Summary] This comprehensive report finds that the main conclusions are: {long_document[:70]}... (Requested length: {summary_length}). This model prioritizes quality."
34
  else:
35
- return "Error: Please select a model.", ""
 
 
 
36
 
37
- time.sleep(inference_time_sim) # Simulate inference latency (CPU bound)
 
 
 
 
38
 
39
- end_time = time.time()
40
- total_latency = end_time - start_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # Report to highlight the A-grade Task 2 comparison result
43
  speed_report = f"Model: {model_name_display}\nTotal Latency: {total_latency:.2f} seconds\n(Used for A-grade speed/quality tradeoff analysis)"
44
 
45
  return summary_output, speed_report
46
 
47
 
48
  # ----------------------------------------------------
49
- # 2. GRADIO INTERFACE DEFINITION (using Blocks for enhanced UI)
50
  # ----------------------------------------------------
51
  with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
52
  gr.Markdown(f"# 📚 LLM Document Summarizer & Model Comparison (KTH Lab 2)")
@@ -64,7 +117,6 @@ with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
64
  placeholder="Paste the text you need summarized here..."
65
  )
66
 
67
- # Control component specific to the summarization task
68
  summary_control = gr.Radio(
69
  ["Concise (under 50 words)", "Detailed (under 200 words)"],
70
  label="Select Summary Length Requirement",
 
1
  import gradio as gr
2
  import time
3
  import os
4
+ from llama_cpp import Llama # Import the necessary library
5
+ import numpy as np
6
 
7
+ # --- CONFIGURATION ---
8
+ # Define the paths to your uploaded GGUF files
9
+ GGUF_MODEL_PATH_1B = "./llama-3.2-1b-summary-q4_k_m.gguf"
10
+ GGUF_MODEL_PATH_3B = "./llama-3.2-3b-summary-q4_k_m.gguf"
11
 
12
+ # Define the Prompt template for summarization (using a simple instruction format)
13
+ SYSTEM_PROMPT = (
14
+ "You are an expert summarization bot. Your task is to provide a comprehensive "
15
+ "and concise summary of the user's document based on the requested length."
16
+ )
17
 
18
  # ----------------------------------------------------
19
+ # 1. MODEL LOADING FUNCTION (Runs once on app startup)
20
  # ----------------------------------------------------
21
def load_llm(model_path):
    """Load a GGUF model for CPU-only inference via llama-cpp-python.

    Parameters
    ----------
    model_path : str
        Filesystem path to the ``.gguf`` model file.

    Returns
    -------
    A ``Llama`` instance on success. On failure, a fallback callable with
    the same calling convention (``fallback(prompt, **kwargs)``) that
    returns a completion-shaped dict carrying the error message, so the
    caller's inference code works unchanged without special-casing.
    """
    print(f"Attempting to load GGUF model: {model_path}...")

    # n_gpu_layers=0 forces CPU execution; verbose=True surfaces
    # llama.cpp loading progress in the app logs.
    try:
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,  # Ensure it runs on CPU
            n_ctx=2048,      # Context window size
            verbose=True
        )
        print(f"Successfully loaded model: {model_path}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_path}: {e}")
        # BUG FIX: the original returned a lambda that closed over `e`,
        # but Python deletes the `except ... as e` binding when the
        # handler exits, so calling the fallback later raised NameError.
        # Capture the formatted message in a local first.
        error_text = f"Error: Model failed to load ({model_path}). Check logs. Error: {e}"
        return lambda prompt, **kwargs: {"choices": [{"text": error_text}]}
39
+
40
+
41
# Load both models once at app startup (module import time) so every
# Gradio request reuses the already-resident weights instead of
# re-reading the GGUF files on each call.
llm_1b = load_llm(GGUF_MODEL_PATH_1B)
llm_3b = load_llm(GGUF_MODEL_PATH_3B)
44
+
45
+
46
+ # ----------------------------------------------------
47
+ # 2. CORE PROCESSING FUNCTION (GGUF Inference)
48
+ # ----------------------------------------------------
49
def generate_summary_and_compare(long_document, selected_model, summary_length):
    """Summarize *long_document* with the selected GGUF model and time it.

    Parameters
    ----------
    long_document : str
        The text to be summarized.
    selected_model : str
        Radio-button label; must contain "1B" or "3B" to pick a model.
    summary_length : str
        Length requirement label; "Detailed" raises the token budget.

    Returns
    -------
    tuple[str, str]
        ``(summary_text, speed_report)``; on an unrecognized model choice,
        an error message and an empty report instead.
    """
    # Map the radio-button choice onto a loaded model handle.
    if "1B" in selected_model:
        llm, display_name = llm_1b, "Llama-3.2-1B (Faster)"
    elif "3B" in selected_model:
        llm, display_name = llm_3b, "Llama-3.2-3B (Higher Quality)"
    else:
        return "Error: Invalid model selection.", ""

    # Compose the Llama-3 chat-format prompt (system turn + user turn).
    user_turn = f"Please summarize the following document and keep the summary {summary_length}. Document: \n\n{long_document}"
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_turn}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )

    # Heuristic output budget: larger for the "Detailed" option.
    token_budget = 250 if "Detailed" in summary_length else 100

    # Run inference and measure wall-clock latency (used for the
    # speed/quality tradeoff comparison in the report below).
    started = time.time()
    try:
        result = llm(
            prompt,
            max_tokens=token_budget,
            stop=["<|eot_id|>"],  # Llama-3 end-of-turn marker
            temperature=0.7,
            echo=False,
        )
        elapsed = time.time() - started
        summary_text = result["choices"][0]["text"].strip()
    except Exception as e:
        elapsed = time.time() - started
        summary_text = f"Inference Error on {display_name}. Error: {e}"

    report = f"Model: {display_name}\nTotal Latency: {elapsed:.2f} seconds\n(Used for A-grade speed/quality tradeoff analysis)"
    return summary_text, report
99
 
100
 
101
  # ----------------------------------------------------
102
+ # 3. GRADIO INTERFACE DEFINITION (kept same as previous version)
103
  # ----------------------------------------------------
104
  with gr.Blocks(title="KTH ID2223 Lab 2: LLM Document Summarizer") as demo:
105
  gr.Markdown(f"# 📚 LLM Document Summarizer & Model Comparison (KTH Lab 2)")
 
117
  placeholder="Paste the text you need summarized here..."
118
  )
119
 
 
120
  summary_control = gr.Radio(
121
  ["Concise (under 50 words)", "Detailed (under 200 words)"],
122
  label="Select Summary Length Requirement",