Seth0330 committed on
Commit
bec67ce
·
verified ·
1 Parent(s): b3585ca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -392
app.py CHANGED
@@ -1,450 +1,209 @@
1
  import streamlit as st
2
- from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
3
  import io
4
  import requests
5
  import json
6
  import re
7
  import os
8
- from datetime import datetime
9
 
10
- # Configure Streamlit
11
- st.set_page_config(
12
- page_title="PDF Tools - Summarizer & Invoice Extractor",
13
- layout="wide",
14
- )
15
 
16
- # Model Configuration for Invoice Extractor
17
# Settings shared by both DeepSeek entries (same endpoint, same key variable).
_DEEPSEEK_COMMON = {
    "api_url": "https://api.deepseek.com/v1/chat/completions",
    "api_key_env": "DEEPSEEK_API_KEY",
}

# Registry of selectable models, keyed by the label shown in the UI.
# Per entry:
#   api_url         - endpoint the chat payload is POSTed to
#   model_name      - value sent as the request's "model" field
#   api_key_env     - environment variable holding the bearer token
#   response_format - sent as "response_format" when truthy; None leaves it out
#   extra_headers   - optional additional HTTP headers merged into the request
MODELS = {
    "DeepSeek v3": {
        **_DEEPSEEK_COMMON,
        "model_name": "deepseek-chat",
        "response_format": {"type": "json_object"},
    },
    "DeepSeek R1": {
        **_DEEPSEEK_COMMON,
        "model_name": "deepseek-reasoner",
        "response_format": None,
    },
    "Llama 4 Mavericks": {
        "api_url": "https://openrouter.ai/api/v1/chat/completions",
        "model_name": "meta-llama/llama-4-maverick:free",
        "api_key_env": "OPENROUTER_API_KEY",
        "response_format": {"type": "json_object"},
        "extra_headers": {
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    },
}
41
 
42
def get_api_key(model_choice):
    """Look up the API key for ``model_choice`` in the environment.

    The variable name comes from the model's ``"api_key_env"`` entry in
    ``MODELS``. When the variable is unset or empty, an error is shown and
    ``st.stop()`` is called.

    Args:
        model_choice: A key of ``MODELS``.

    Returns:
        The value of the environment variable.
    """
    env_name = MODELS[model_choice]["api_key_env"]
    secret = os.environ.get(env_name)
    if secret:
        return secret
    st.error(f"❌ `{env_name}` environment variable not set!")
    st.stop()
    return secret
50
 
51
def query_llm(model_choice, prompt):
    """Send ``prompt`` to the selected model and return the reply text.

    The request is a single-message chat completion built from the model's
    ``MODELS`` entry. On success the reply text and the raw response body are
    stored in ``st.session_state`` (``last_api_response`` and
    ``last_api_response_raw``).

    Args:
        model_choice: A key of ``MODELS``.
        prompt: Text sent as the only user message.

    Returns:
        The ``choices[0].message.content`` value of the response, or ``None``
        after showing an error when the request fails, the status is not 200,
        or the body does not have the expected shape.
    """
    config = MODELS[model_choice]
    headers = {
        "Authorization": f"Bearer {get_api_key(model_choice)}",
        "Content-Type": "application/json",
    }
    if "extra_headers" in config:
        headers.update(config["extra_headers"])

    payload = {
        "model": config["model_name"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000,
    }
    if config["response_format"]:
        payload["response_format"] = config["response_format"]

    # Only the network call sits in this try, so transport failures are the
    # only thing reported as a connection problem.
    try:
        with st.spinner(f"🔍 Analyzing with {model_choice}..."):
            response = requests.post(
                config["api_url"], headers=headers, json=payload, timeout=90
            )
    except requests.exceptions.RequestException as e:
        st.error(f"🌐 Connection Failed: {str(e)}")
        return None

    if response.status_code != 200:
        st.error(f"🚨 API Error {response.status_code}: {response.text}")
        return None

    # A 200 reply can still be non-JSON, lack "choices", or carry an empty
    # list; all of these are reported instead of crashing the script.
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        st.error(f"Unexpected response format: {e!r}\nFull response: {response.text}")
        return None

    st.session_state.last_api_response = content
    st.session_state.last_api_response_raw = response.text
    return content
92
 
93
def clean_json_response(text):
    """Parse a model reply into JSON, trying progressively looser strategies.

    Order of attempts:
      1. the whole reply as JSON;
      2. the contents of a fenced code block (optionally tagged ``json``);
      3. the span from the first ``{`` to the last ``}``;
      4. a best-effort re-assembly around the ``"invoice_header"`` and
         ``"line_items"`` keys.

    Args:
        text: Raw reply text; an empty or ``None`` value yields ``None``.

    Returns:
        The parsed value, or ``None`` when every strategy fails.
    """
    if not text:
        return None

    # 1. The reply is already valid JSON.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # 2. JSON wrapped in a markdown code fence.
    json_match = re.search(r'```(?:json)?\n({.*?})\n```', text, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group(1))
        except json.JSONDecodeError:
            pass

    # 3. Anything between the outermost braces. Only json.loads can fail
    #    here, so only its error is caught.
    start_idx = text.find('{')
    end_idx = text.rfind('}') + 1
    if start_idx != -1 and end_idx != 0:
        try:
            return json.loads(text[start_idx:end_idx])
        except json.JSONDecodeError:
            pass

    # 4. Last resort: stitch the text back together around "line_items".
    if '"invoice_header":' in text and '"line_items":' in text:
        pieces = text.split('"line_items":')
        header_part = pieces[0]
        line_items_part = pieces[1]

        if not header_part.strip().endswith('{'):
            header_part += '{'

        if not line_items_part.strip().endswith('}}'):
            line_items_part = line_items_part.split('}')[0] + ']}}'

        reconstructed = header_part + '"line_items":' + line_items_part
        try:
            return json.loads(reconstructed)
        except json.JSONDecodeError as e:
            st.warning(f"Could not fully reconstruct JSON: {str(e)}")
            return None

    return None
142
 
143
def get_extraction_prompt(model_choice, text):
    """Build the extraction prompt for ``model_choice``.

    "DeepSeek v3" and "DeepSeek R1" each have their own flat-schema
    instructions; every other model gets the nested ``invoice_header`` /
    ``line_items`` schema. The invoice text is appended directly after the
    closing "Invoice Text:" line.

    NOTE(review): the schema lines below are unindented because the reviewed
    source shows no indentation inside the prompt strings - confirm.

    Args:
        model_choice: Model label selected in the UI.
        text: Invoice text to append to the instructions.

    Returns:
        The complete prompt string.
    """
    deepseek_v3 = """Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
{
"invoice_number": "string",
"invoice_date": "YYYY-MM-DD",
"po_number": "string or null",
"invoice_value": "string with currency symbol",
"line_items": [
{
"description": "string",
"quantity": "number or string",
"unit_price": "string with currency",
"total_price": "string with currency"
}
]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Include all line items found in the invoice
4. For line items, quantity can be number or string, prices should include currency
5. Do not include any explanations or notes
Invoice Text:
"""

    deepseek_r1 = """Please extract the following information from the invoice text below and return ONLY the raw JSON without any markdown formatting or additional text:
{
"invoice_number": "string or null",
"invoice_date": "YYYY-MM-DD or null",
"po_number": "string or null",
"invoice_value": "string with currency or null",
"line_items": [
{
"description": "string",
"quantity": "number or string",
"unit_price": "string with currency",
"total_price": "string with currency"
}
]
}
Invoice Text:
"""

    # Used for "Llama 4 Mavericks" and any label not listed below.
    nested_schema = """Extract complete invoice information and return a VALID JSON object with these fields:
{
"invoice_header": {
"invoice_number": "string",
"invoice_date": "YYYY-MM-DD",
"po_number": "string or null",
"invoice_value": "string with currency",
"supplier_name": "string or null",
"customer_name": "string or null"
},
"line_items": [
{
"item_number": "string or null",
"description": "string",
"quantity": "number",
"unit_price": "string with currency",
"total_price": "string with currency"
}
]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Date format must be YYYY-MM-DD
4. All currency values must include currency symbol or code
5. Include all line items found in the invoice
6. For line items, quantity should be a number, prices as strings with currency
7. Do not include any explanations or notes
Invoice Text:
"""

    instructions = {
        "DeepSeek v3": deepseek_v3,
        "DeepSeek R1": deepseek_r1,
    }.get(model_choice, nested_schema)
    return instructions + text
220
-
221
def format_currency(value):
    """Format a currency value for display.

    Args:
        value: A number, a pre-formatted string, or ``None``.

    Returns:
        ``"N/A"`` for ``None`` or an empty string, ``"$1,234.50"`` style text
        for numbers (including zero), and any other value unchanged.
    """
    # Only genuinely missing values map to "N/A"; a numeric zero is a real
    # amount and must be shown as "$0.00".
    if value is None or value == "":
        return "N/A"
    if isinstance(value, (int, float)):
        return f"${value:,.2f}"
    return value
228
-
229
def display_line_items(line_items, model_choice="DeepSeek v3"):
    """Render invoice line items.

    "Llama 4 Mavericks" results are shown as a numbered table; every other
    model gets a four-column layout with one row of columns per item.

    Args:
        line_items: List of item dicts; an empty or missing list shows an
            informational message instead.
        model_choice: Model label that decides the layout.
    """
    if not line_items:
        st.info("No line items found in this invoice. This may be due to incomplete data from the API.")
        return

    st.subheader("📋 Line Items")

    if model_choice == "Llama 4 Mavericks":
        rows = [
            {
                "#": position,
                "Description": entry.get("description", "N/A"),
                "Quantity": entry.get("quantity", 0),
                "Unit Price": entry.get("unit_price", "N/A"),
                "Total Price": entry.get("total_price", "N/A"),
            }
            for position, entry in enumerate(line_items, 1)
        ]
        st.table(rows)
        return

    # Column layout for the DeepSeek models: a heading row, then one row of
    # columns per item.
    widths = [4, 2, 2, 2]
    heading_cols = st.columns(widths)
    with st.container():
        for column, title in zip(
            heading_cols, ("**Description**", "**Qty**", "**Unit Price**", "**Total**")
        ):
            column.write(title)

    for entry in line_items:
        desc_col, qty_col, unit_col, total_col = st.columns(widths)
        desc_col.write(entry.get("description", "N/A"))
        qty_col.write(entry.get("quantity", "N/A"))
        unit_col.write(format_currency(entry.get("unit_price", "N/A")))
        total_col.write(format_currency(entry.get("total_price", "N/A")))
        # NOTE(review): the flattened source does not show whether the divider
        # is drawn per item or once after the loop; per item assumed - confirm.
        st.divider()
265
-
266
def display_invoice_data(model_choice, invoice_data):
    """Render extracted invoice data in the layout matching ``model_choice``.

    "Llama 4 Mavericks" data is read from the nested ``invoice_header`` dict
    and shown in three columns, followed by the line items and, when the
    header has no ``invoice_value``, a total summed from the items' prices.
    All other models use the flat top-level fields in two columns.

    Args:
        model_choice: Model label that produced ``invoice_data``.
        invoice_data: Parsed invoice dict; nothing is rendered when falsy.
    """
    if not invoice_data:
        return

    if model_choice == "Llama 4 Mavericks":
        st.subheader("Invoice Summary")
        header = invoice_data.get("invoice_header", {})

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Invoice Number", header.get("invoice_number", "Not found"))
            st.metric("Supplier", header.get("supplier_name", "Not found"))
        with col2:
            st.metric("Invoice Date", header.get("invoice_date", "Not found"))
            st.metric("Customer", header.get("customer_name", "Not found"))
        with col3:
            st.metric("PO Number", header.get("po_number", "Not found"))
            st.metric("Total Value", header.get("invoice_value", "Not found"))

        display_line_items(invoice_data.get("line_items", []), model_choice)

        # Fall back to a total computed from the line items when the header
        # does not provide one.
        if not header.get("invoice_value"):
            try:
                # str() lets numeric prices through; the regex strips currency
                # symbols and thousands separators before conversion.
                total = sum(
                    float(re.sub(r'[^\d.]', '', str(item.get("total_price", "0"))))
                    for item in invoice_data.get("line_items", [])
                    if item.get("total_price")
                )
            except (AttributeError, TypeError, ValueError):
                # Best effort only: unparseable prices mean no computed total.
                total = None
            if total is not None:
                st.metric("Calculated Total", f"${total:,.2f}")

    else:
        st.success("Information extracted successfully!")

        col1, col2 = st.columns(2)
        with col1:
            st.metric("Invoice Number", invoice_data.get("invoice_number", "Not found"))
            st.metric("PO Number", invoice_data.get("po_number", "Not found"))
        with col2:
            st.metric("Invoice Date", invoice_data.get("invoice_date", "Not found"))
            st.metric("Invoice Value", format_currency(invoice_data.get("invoice_value")))

        display_line_items(invoice_data.get("line_items", []), model_choice)
313
 
314
def extract_invoice_info(model_choice, text):
    """Extract structured invoice data from ``text`` with the chosen model.

    The model reply is parsed and then normalised so callers can rely on the
    expected keys: a nested ``invoice_header`` plus ``line_items`` for
    "Llama 4 Mavericks", flat top-level fields plus ``line_items`` otherwise.
    Missing fields become ``None`` (``0`` for a missing ``quantity``).

    Args:
        model_choice: A key of ``MODELS``.
        text: Invoice text to analyse.

    Returns:
        The normalised dict, or ``None`` when the query or parsing fails.
    """
    prompt = get_extraction_prompt(model_choice, text)
    result = query_llm(model_choice, prompt)
    if not result:
        return None

    parsed_data = clean_json_response(result)
    # A reply that parses to something other than an object cannot be
    # normalised, so it is treated like a parse failure.
    if not parsed_data or not isinstance(parsed_data, dict):
        st.error("Failed to parse JSON. Raw response:")
        st.code(result)
        return None

    is_llama = model_choice == "Llama 4 Mavericks"
    if is_llama:
        # The model may return null (or another non-object) for the header.
        header = parsed_data.get("invoice_header")
        if not isinstance(header, dict):
            header = {}
        parsed_data["invoice_header"] = header
        for field in ("invoice_number", "invoice_date", "po_number",
                      "invoice_value", "supplier_name", "customer_name"):
            header.setdefault(field, None)
        item_fields = ("item_number", "description", "quantity", "unit_price", "total_price")
    else:
        for field in ("invoice_number", "invoice_date", "po_number", "invoice_value"):
            parsed_data.setdefault(field, None)
        item_fields = ("description", "quantity", "unit_price", "total_price")

    # Same defence for the items: tolerate null / non-list values and drop
    # entries that are not objects, since they carry none of the fields.
    raw_items = parsed_data.get("line_items")
    if not isinstance(raw_items, list):
        raw_items = []
    line_items = [item for item in raw_items if isinstance(item, dict)]
    parsed_data["line_items"] = line_items

    for item in line_items:
        for field in item_fields:
            if field not in item:
                item[field] = 0 if field == "quantity" else None
        # Only the Llama path coerces quantity to a number.
        if is_llama and not isinstance(item["quantity"], (int, float)):
            try:
                item["quantity"] = float(item["quantity"])
            except (ValueError, TypeError):
                item["quantity"] = 0

    return parsed_data
370
 
371
# Two-tab layout: extractive PDF summariser and LLM-based invoice extractor.
summary_tab, invoice_tab = st.tabs(["PDF Summarizer", "Invoice Extractor"])

# --- PDF Summarizer tab -----------------------------------------------------
with summary_tab:
    st.title("PDF to Bullet Point Summarizer 🗟 🔏")

    uploaded_file = st.file_uploader("Upload your PDF document", type="pdf", key="pdf_uploader")
    # Percentage of sentences to keep in the summary.
    summary_scale = st.slider(
        "Select the extent of summarization (%)",
        min_value=1,
        max_value=100,
        value=20,
        key="summary_scale",
    )
    submit_button = st.button("Generate Summary", key="summary_button")

    if submit_button and uploaded_file is not None:
        with st.spinner('Processing...'):
            pdf_text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
            phrases = extract_key_phrases(pdf_text)
            scored = score_sentences(pdf_text, phrases)

            # At least one bullet point, otherwise the chosen share of sentences.
            sentence_count = len(list(scored.keys()))
            bullet_count = max(1, sentence_count * summary_scale // 100)

            bullets = summarize_text(scored, num_points=bullet_count)

            st.subheader("Here's the summary: ")
            st.markdown(bullets)

# --- Invoice Extractor tab --------------------------------------------------
with invoice_tab:
    st.title("📋 Invoice Extractor from PDF")
    st.write("Upload an invoice PDF to extract key details")

    model_choice = st.selectbox(
        "Select AI Model",
        list(MODELS.keys()),
        index=0,
        help="Choose which AI model to use for extraction",
        key="model_choice",
    )
    invoice_pdf = st.file_uploader("Upload Invoice PDF", type="pdf", key="invoice_pdf_uploader")

    if st.button("Extract Invoice Information", key="invoice_button") and invoice_pdf is not None:
        with st.spinner('Reading PDF...'):
            invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))

        with st.status("Processing...", expanded=True) as status:
            st.write(f"🤖 Querying {model_choice} API...")
            invoice_data = extract_invoice_info(model_choice, invoice_text)

            if not invoice_data:
                status.update(label="❌ Extraction Failed", state="error")
                st.error("Failed to extract information. Try simplifying the text.")
            else:
                status.update(label="✅ Extraction Complete!", state="complete")
                display_invoice_data(model_choice, invoice_data)

        # Debug output is rendered outside the status container.
        if invoice_data and "last_api_response" in st.session_state:
            with st.expander("Debug Information"):
                st.write("API Response:")
                st.json(st.session_state.last_api_response)
                st.write("Raw API Response:")
                st.code(st.session_state.get("last_api_response_raw", "No response"))
 
1
  import streamlit as st
 
2
  import io
3
  import requests
4
  import json
5
  import re
6
  import os
 
7
 
8
+ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
9
+
10
+ st.set_page_config(page_title="PDF Tools", layout="wide")
 
 
11
 
 
12
def _model_entry(api_url, model, key_env, response_format, extra_headers=None):
    """Assemble one ``MODELS`` entry.

    ``extra_headers`` is stored only when supplied, so entries built without
    it have no ``"extra_headers"`` key at all.
    """
    entry = {
        "api_url": api_url,
        "model": model,
        "key_env": key_env,
        "response_format": response_format,
    }
    if extra_headers is not None:
        entry["extra_headers"] = extra_headers
    return entry


# Registry of selectable models, keyed by the label shown in the UI.
# Per entry: endpoint URL, the "model" value sent in the payload, the
# environment variable holding the bearer token, the optional
# "response_format" payload field (None leaves it out), and optional extra
# HTTP headers.
MODELS = {
    "DeepSeek v3": _model_entry(
        "https://api.deepseek.com/v1/chat/completions",
        "deepseek-chat",
        "DEEPSEEK_API_KEY",
        {"type": "json_object"},
    ),
    "DeepSeek R1": _model_entry(
        "https://api.deepseek.com/v1/chat/completions",
        "deepseek-reasoner",
        "DEEPSEEK_API_KEY",
        None,
    ),
    "Llama 4 Mavericks": _model_entry(
        "https://openrouter.ai/api/v1/chat/completions",
        "meta-llama/llama-4-maverick:free",
        "OPENROUTER_API_KEY",
        {"type": "json_object"},
        {
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    ),
    # NOTE(review): unlike the other entries this URL does not end in a
    # chat-completions path - confirm that POSTing the chat payload to it
    # works.
    "Mistral Small": _model_entry(
        "https://ezofisai.services.ai.azure.com/api/projects/firstProject",
        "mistral-small-2503",
        "AZUREMIST_API_KEY",
        {"type": "json_object"},
        {
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    ),
}
46
 
47
def get_api_key(model_choice):
    """Return the API key for ``model_choice`` from the environment.

    The variable name is the model's ``"key_env"`` entry in ``MODELS``. If it
    is unset or empty, an error is displayed and ``st.stop()`` is called.

    Args:
        model_choice: A key of ``MODELS``.

    Returns:
        The value of the environment variable.
    """
    env_var = MODELS[model_choice]["key_env"]
    token = os.environ.get(env_var)
    if token:
        return token
    st.error(f"❌ {env_var} not set")
    st.stop()
    return token
53
 
54
def query_llm(model_choice, prompt):
    """Send ``prompt`` to the selected model and return the reply text.

    The request is a single-message chat completion built from the model's
    ``MODELS`` entry. On success the reply text and the raw response body are
    kept in ``st.session_state`` as ``last_api`` and ``last_raw``.

    Args:
        model_choice: A key of ``MODELS``.
        prompt: Text sent as the only user message.

    Returns:
        The ``choices[0].message.content`` value of the response, or ``None``
        after showing an error when the request fails, the status is not 200,
        or the body does not have the expected shape.
    """
    cfg = MODELS[model_choice]
    headers = {
        "Authorization": f"Bearer {get_api_key(model_choice)}",
        "Content-Type": "application/json",
    }
    if cfg.get("extra_headers"):
        headers.update(cfg["extra_headers"])

    payload = {
        "model": cfg["model"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000,
    }
    if cfg.get("response_format"):
        payload["response_format"] = cfg["response_format"]

    # Transport failures only; a malformed reply is reported separately below
    # so it is not mislabelled as a connection problem.
    try:
        with st.spinner(f"🔍 Querying {model_choice}..."):
            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
    except requests.exceptions.RequestException as e:
        st.error(f"Connection error: {e}")
        return None

    if r.status_code != 200:
        st.error(f"🚨 API Error {r.status_code}: {r.text}")
        return None

    # A 200 reply may still be non-JSON or lack choices[0].message.content.
    try:
        content = r.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        st.error(f"Unexpected API response ({e!r}): {r.text}")
        return None

    st.session_state.last_api = content
    st.session_state.last_raw = r.text
    return content
83
 
84
def clean_json_response(text):
    """Extract and parse the JSON object contained in a model reply.

    The span from the first ``{`` to the last ``}`` is taken as the candidate,
    which also discards surrounding prose and markdown fences. It is parsed
    as-is first; repairs are attempted only if that fails, so valid JSON is
    never altered:

      1. drop trailing commas before ``}`` or ``]``;
      2. additionally insert commas missing between a value and the next key.

    Args:
        text: Raw reply text; an empty or ``None`` value yields ``None``.

    Returns:
        The parsed object, or ``None`` (after showing the error and the
        offending text) when no JSON can be located or parsed.
    """
    if not text:
        return None

    start, end = text.find('{'), text.rfind('}') + 1
    if start < 0 or end < 1:
        st.error("Couldn't locate JSON in response.")
        st.code(text)
        return None
    frag = text[start:end]

    # Candidates from least to most invasive.
    no_trailing_commas = re.sub(r',\s*([}\]])', r'\1', frag)
    with_missing_commas = re.sub(
        # A finished value (string, number, bracket, literal) directly
        # followed by something that looks like `"key":`.
        r'(["\d\]}]|true|false|null)(\s*)(?="[^"\n]+"\s*:)',
        r'\1,\2',
        no_trailing_commas,
    )

    first_error = None
    for candidate in (frag, no_trailing_commas, with_missing_commas):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            # Report the error for the unmodified fragment: it is the one
            # that describes what the model actually sent.
            if first_error is None:
                first_error = e

    st.error(f"JSON parse error: {first_error}")
    st.code(frag)
    return None
110
+
111
def fallback_supplier(text):
    """Return the first non-blank line of ``text``, stripped of whitespace.

    Used as a stand-in supplier name when the model does not return one.

    Args:
        text: Invoice text.

    Returns:
        The first line that is non-empty after stripping, or ``None`` when
        every line is blank.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return next((candidate for candidate in stripped_lines if candidate), None)
117
 
118
def get_extraction_prompt(model_choice, txt):
    """Build the extraction prompt for ``model_choice``.

    Models whose label starts with "DeepSeek" are asked for a flat object;
    all other models are asked for an ``invoice_header`` / ``line_items``
    object. The invoice text follows an "Invoice Text:" line at the end.

    Args:
        model_choice: Model label selected in the UI.
        txt: Invoice text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    if model_choice.startswith("DeepSeek"):
        intro = "Extract full invoice info and RETURN ONLY a single-line json object with fields:\n"
        schema = (
            '{"invoice_number":"string","invoice_date":"YYYY-MM-DD",'
            '"po_number":"string|null","invoice_value":"string with currency",'
            '"line_items":[{"description":"string","quantity":"number",'
            '"unit_price":"string with currency","total_price":"string with currency"}]}\n'
        )
        closing = "Use null for missing. NO extra text.\n\n"
    else:
        intro = "Extract invoice data and RETURN ONLY a compact, one-line json object exactly:\n"
        schema = (
            '{"invoice_header":{"invoice_number":"string","invoice_date":"YYYY-MM-DD",'
            '"po_number":"string|null","invoice_value":"string with currency",'
            '"supplier_name":"string|null","customer_name":"string|null"},'
            '"line_items":[{"item_number":"string|null","description":"string","quantity":number,'
            '"unit_price":"string with currency","total_price":"string with currency"}]}\n'
        )
        closing = "Use null for missing. NO extras.\n\n"

    return "".join([intro, schema, closing, "Invoice Text:\n", f"{txt}"])
 
 
 
 
139
 
140
def extract_invoice_info(model_choice, text):
    """Extract structured invoice data from ``text`` with the chosen model.

    The model reply is parsed and normalised so callers can index the
    expected keys directly: a nested ``invoice_header`` plus ``line_items``
    for "Llama 4 Mavericks" and "Mistral Small", flat top-level fields plus
    ``line_items`` for the other models. Missing fields are filled with
    ``None``. For the nested layout a missing supplier name falls back to
    the first non-blank line of the invoice text.

    Args:
        model_choice: A key of ``MODELS``.
        text: Invoice text to analyse.

    Returns:
        The normalised dict, or ``None`` when the query or parsing fails.
    """
    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None
    data = clean_json_response(raw)
    if not data:
        return None

    if model_choice in ("Llama 4 Mavericks", "Mistral Small"):
        # The prompt allows null values, so the header may come back as null
        # (or another non-object); setdefault would hand that value back.
        hdr = data.get("invoice_header")
        if not isinstance(hdr, dict):
            hdr = {}
        data["invoice_header"] = hdr
        for k in ("invoice_number", "invoice_date", "po_number",
                  "invoice_value", "supplier_name", "customer_name"):
            hdr.setdefault(k, None)
        if not hdr.get("supplier_name"):
            hdr["supplier_name"] = fallback_supplier(text)
        item_keys = ("item_number", "description", "quantity", "unit_price", "total_price")
    else:
        for k in ("invoice_number", "invoice_date", "po_number", "invoice_value"):
            data.setdefault(k, None)
        item_keys = ("description", "quantity", "unit_price", "total_price")

    # Tolerate null / non-list line_items and drop entries that are not
    # objects, since they cannot carry any of the expected fields.
    items = data.get("line_items")
    if not isinstance(items, list):
        items = []
    items = [itm for itm in items if isinstance(itm, dict)]
    data["line_items"] = items
    for itm in items:
        for k in item_keys:
            itm.setdefault(k, None)

    return data
169
+
170
# ---- UI ----
summary_tab, invoice_tab = st.tabs(["PDF Summarizer","Invoice Extractor"])

# Tab 1: extractive bullet-point summary of an uploaded PDF.
with summary_tab:
    st.title("PDF Bullet Points")
    summary_pdf = st.file_uploader("Upload PDF", type="pdf")
    percent = st.slider("Summarization %", 1, 100, 20)
    if st.button("Summarize") and summary_pdf:
        pdf_text = read_pdf(io.BytesIO(summary_pdf.getvalue()))
        phrases = extract_key_phrases(pdf_text)
        sentence_scores = score_sentences(pdf_text, phrases)
        # Keep the chosen share of sentences, but never fewer than one.
        bullet_count = max(1, len(sentence_scores) * percent // 100)
        st.markdown(summarize_text(sentence_scores, num_points=bullet_count))

# Tab 2: LLM-based invoice field extraction.
with invoice_tab:
    st.title("Invoice Extractor")
    selected_model = st.selectbox("Model", list(MODELS.keys()))
    invoice_file = st.file_uploader("Invoice PDF", type="pdf")
    if st.button("Extract") and invoice_file:
        invoice_text = read_pdf(io.BytesIO(invoice_file.getvalue()))
        extracted = extract_invoice_info(selected_model, invoice_text)
        if extracted:
            st.success("Extraction Complete")
            if selected_model in ("Llama 4 Mavericks","Mistral Small"):
                # Nested layout: fields live under "invoice_header".
                header = extracted["invoice_header"]
                left, middle, right = st.columns(3)
                left.metric("Invoice #", header["invoice_number"])
                left.metric("Supplier", header["supplier_name"])
                middle.metric("Date", header["invoice_date"])
                middle.metric("Customer", header["customer_name"])
                right.metric("PO #", header["po_number"])
                right.metric("Total", header["invoice_value"])
            else:
                # Flat layout: fields live at the top level.
                left, right = st.columns(2)
                left.metric("Invoice #", extracted["invoice_number"])
                left.metric("PO #", extracted["po_number"])
                right.metric("Date", extracted["invoice_date"])
                right.metric("Value", extracted["invoice_value"])
            # Both layouts finish with the same line-item table.
            st.subheader("Line Items")
            st.table(extracted["line_items"])

    # NOTE(review): the flattened source does not show how deeply this
    # section was nested; it is kept inside the invoice tab because the debug
    # output only concerns extraction - confirm.
    if "last_api" in st.session_state:
        with st.expander("Debug"):
            st.code(st.session_state.last_api)
            st.code(st.session_state.last_raw)