Seth0330 committed on
Commit
cdae312
·
verified ·
1 Parent(s): 0c39d40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +428 -25
app.py CHANGED
@@ -1,38 +1,441 @@
1
  import streamlit as st
2
  from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
3
  import io
 
 
 
 
 
4
 
5
- # Initialize your Streamlit app
6
- st.title("PDF to Bullet Point Summarizer 🗟 🔏")
 
 
 
7
 
8
- # File uploader for the PDF
9
- uploaded_file = st.file_uploader("Upload your PDF document", type="pdf")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- # Slider for users to select the summarization extent
12
- summary_scale = st.slider("Select the extent of summarization (%)", min_value=1, max_value=100, value=20)
13
 
14
- # Submit button
15
- submit_button = st.button("Generate Summary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- # Check if the submit button is pressed
18
- if submit_button and uploaded_file is not None:
19
- with st.spinner('Processing...'):
20
- # Read the PDF content
21
- text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- # Extract key phrases from the text
24
- key_phrases = extract_key_phrases(text)
 
 
 
 
 
 
 
 
 
25
 
26
- # Score sentences based on the key phrases
27
- sentence_scores = score_sentences(text, key_phrases)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- # Determine the number of bullet points based on the selected summarization scale
30
- total_sentences = len(list(sentence_scores.keys()))
31
- num_points = max(1, total_sentences * summary_scale // 100)
 
 
32
 
33
- # Generate the bullet-point summary
34
- summary = summarize_text(sentence_scores, num_points=num_points)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- # Display the summary as bullet points
37
- st.subheader("Here's the summary: ")
38
- st.markdown(summary)
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
3
  import io
4
+ import requests
5
+ import json
6
+ import re
7
+ import os
8
+ from datetime import datetime
9
 
10
# Page-level Streamlit configuration (browser tab title and wide layout).
st.set_page_config(
    page_title="PDF Tools - Summarizer & Invoice Extractor",
    layout="wide",
)

# Per-model settings for the Invoice Extractor, keyed by the label shown in
# the model selectbox. Each entry holds:
#   api_url         - chat-completions endpoint to POST to
#   model_name      - model identifier sent in the request payload
#   api_key_env     - name of the environment variable holding the API key
#   response_format - optional "response_format" payload value (None = omit)
#   extra_headers   - optional additional HTTP headers (OpenRouter entry only)
MODELS = {
    "DeepSeek v3": {
        "api_url": "https://api.deepseek.com/v1/chat/completions",
        "model_name": "deepseek-chat",
        "api_key_env": "DEEPSEEK_API_KEY",
        "response_format": {"type": "json_object"},
    },
    "DeepSeek R1": {
        "api_url": "https://api.deepseek.com/v1/chat/completions",
        "model_name": "deepseek-reasoner",
        "api_key_env": "DEEPSEEK_API_KEY",
        "response_format": None,
    },
    "Llama 4 Mavericks": {
        "api_url": "https://openrouter.ai/api/v1/chat/completions",
        "model_name": "meta-llama/llama-4-maverick:free",
        "api_key_env": "OPENROUTER_API_KEY",
        "response_format": {"type": "json_object"},
        "extra_headers": {
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    },
}
41
 
42
# One tab per tool; tab2 is populated by the Invoice Extractor section.
tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])

# PDF Summarizer tab: upload a PDF, pick a percentage, get a bullet summary.
with tab1:
    st.title("PDF to Bullet Point Summarizer 🗟 🔏")

    # Inputs: the PDF, the share of sentences to keep, and the trigger button.
    uploaded_file = st.file_uploader(
        "Upload your PDF document",
        type="pdf",
        key="pdf_uploader",
    )
    summary_scale = st.slider(
        "Select the extent of summarization (%)",
        min_value=1,
        max_value=100,
        value=20,
        key="summary_scale",
    )
    submit_button = st.button("Generate Summary", key="summary_button")

    # Only run the pipeline when the button was pressed and a file is present.
    if submit_button and uploaded_file is not None:
        with st.spinner('Processing...'):
            # Hand the uploaded bytes to the PDF reader as an in-memory stream.
            pdf_stream = io.BytesIO(uploaded_file.getvalue())
            text = read_pdf(pdf_stream)

            # Key phrases drive the per-sentence scoring.
            key_phrases = extract_key_phrases(text)
            sentence_scores = score_sentences(text, key_phrases)

            # Keep summary_scale percent of the scored sentences, at least one.
            total_sentences = len(list(sentence_scores.keys()))
            num_points = max(1, total_sentences * summary_scale // 100)

            summary = summarize_text(sentence_scores, num_points=num_points)

            # Render the result as markdown.
            st.subheader("Here's the summary: ")
            st.markdown(summary)
80
 
81
# Invoice Extractor tab: upload an invoice PDF and extract its fields via an LLM.
# NOTE(review): extract_invoice_info() and display_invoice_data() are defined
# further down in this file. The script is executed top to bottom, so the calls
# below raise NameError when the button is clicked unless those definitions
# are moved above this block (or this block is moved to the end of the file).
with tab2:
    st.title("📋 Invoice Extractor from PDF")
    st.write("Upload an invoice PDF to extract key details")

    # Which entry of MODELS to use for the extraction.
    model_choice = st.selectbox(
        "Select AI Model",
        list(MODELS.keys()),
        index=0,
        help="Choose which AI model to use for extraction",
        key="model_choice",
    )

    invoice_pdf = st.file_uploader(
        "Upload Invoice PDF",
        type="pdf",
        key="invoice_pdf_uploader",
    )

    # The button is always rendered; extraction needs both a click and a file.
    extract_clicked = st.button("Extract Invoice Information", key="invoice_button")
    if extract_clicked and invoice_pdf is not None:
        with st.spinner('Reading PDF...'):
            invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))

        # Progress and results are shown inside an expandable status container.
        with st.status("Processing...", expanded=True) as status:
            st.write(f"🤖 Querying {model_choice} API...")
            invoice_data = extract_invoice_info(model_choice, invoice_text)

            if not invoice_data:
                status.update(label="❌ Extraction Failed", state="error")
                st.error("Failed to extract information. Try simplifying the text.")
            else:
                status.update(label="✅ Extraction Complete!", state="complete")
                display_invoice_data(model_choice, invoice_data)

        # Debug information is rendered outside the status container.
        # NOTE(review): indentation was lost in this view; this is assumed to
        # sit inside the button branch, where invoice_data is always bound.
        if invoice_data and "last_api_response" in st.session_state:
            with st.expander("Debug Information"):
                st.write("API Response:")
                st.json(st.session_state.last_api_response)
                st.write("Raw API Response:")
                st.code(st.session_state.get("last_api_response_raw", "No response"))
122
+
123
+ # Invoice Extractor Functions
124
def get_api_key(model_choice):
    """Return the API key configured for ``model_choice``.

    The name of the environment variable is taken from
    ``MODELS[model_choice]["api_key_env"]``. When that variable is unset or
    empty, an error is shown in the UI and ``st.stop()`` is called.
    """
    env_name = MODELS[model_choice]["api_key_env"]
    key = os.environ.get(env_name)
    if key:
        return key

    # Missing or empty key: report which variable is needed, then stop.
    st.error(f"❌ `{env_name}` environment variable not set!")
    st.stop()
    return key
132
+
133
def clean_json_response(text):
    """Extract a JSON object from an LLM response.

    Candidates are tried in this order, and the first one that parses to a
    ``dict`` wins:

    1. the whole text,
    2. the contents of a fenced code block (three backticks, with an
       optional ``json`` tag),
    3. the span from the first ``{`` to the last ``}``.

    If none of them yields a dict, the text is parsed line by line as
    ``key: value`` pairs (keys lower-cased, spaces replaced by underscores,
    surrounding double quotes stripped).

    Args:
        text: Raw model output.

    Returns:
        A dict with the extracted data, or ``None`` when nothing usable was
        found. Valid JSON that is not an object (a list, number, string, ...)
        is never returned, because callers index the result by field name.
    """
    if not text:
        return None

    candidates = [text]

    # Fenced block; \s* tolerates CRLF line endings and a missing newline.
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))

    # Greedy match: outermost braces anywhere in the text.
    braces = re.search(r'\{.*\}', text, re.DOTALL)
    if braces:
        candidates.append(braces.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # Last resort: treat every "key: value" line as one field.
    data = {}
    for line in text.split('\n'):
        key, sep, val = line.partition(':')
        if sep:
            key = key.strip().strip('"').lower().replace(' ', '_')
            data[key] = val.strip().strip('"')
    return data or None
170
+
171
def query_llm(model_choice, prompt):
    """Send ``prompt`` to the chat-completions API configured for ``model_choice``.

    The request is built from ``MODELS[model_choice]``: endpoint, model name,
    optional ``response_format`` and optional extra headers. On success the
    message content and the raw response body are stored in
    ``st.session_state`` (``last_api_response`` / ``last_api_response_raw``)
    for the debug panel.

    Args:
        model_choice: Key into ``MODELS``.
        prompt: Full user prompt to send.

    Returns:
        The content of the first choice's message, or ``None`` after showing
        an error in the UI when the request fails, the status is not 200, or
        the response does not have the expected structure.
    """
    config = MODELS[model_choice]
    headers = {
        "Authorization": f"Bearer {get_api_key(model_choice)}",
        "Content-Type": "application/json",
    }
    # Extra headers are only configured for some providers (e.g. OpenRouter).
    headers.update(config.get("extra_headers", {}))

    payload = {
        "model": config["model_name"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000,
    }
    # Only send response_format when the model entry specifies one.
    if config["response_format"]:
        payload["response_format"] = config["response_format"]

    try:
        with st.spinner(f"🔍 Analyzing with {model_choice}..."):
            response = requests.post(
                config["api_url"], headers=headers, json=payload, timeout=60
            )
    except requests.exceptions.RequestException as e:
        st.error(f"🌐 Connection Failed: {str(e)}")
        return None

    if response.status_code != 200:
        st.error(f"🚨 API Error {response.status_code}: {response.text}")
        return None

    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # KeyError/IndexError/TypeError: body is JSON but not shaped like a
        # chat completion (missing key, empty "choices", null payload).
        # ValueError: body is not JSON at all. Show the raw text rather than
        # parsing the body a second time inside the handler.
        st.error(
            f"{type(e).__name__} in response: {e}\nFull response: {response.text}"
        )
        return None

    st.session_state.last_api_response = content
    st.session_state.last_api_response_raw = response.text
    return content
214
+
215
def get_extraction_prompt(model_choice, text):
    """Build the extraction prompt for ``model_choice`` with ``text`` appended.

    "DeepSeek v3" and "DeepSeek R1" each have their own flat-schema
    instructions; every other value gets the nested ``invoice_header``
    schema written for Llama 4 Mavericks.

    NOTE(review): leading whitespace inside the templates could not be
    recovered from this view; the text is reproduced as it appears here.
    """
    deepseek_v3_template = """Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
{
"invoice_number": "string",
"invoice_date": "YYYY-MM-DD",
"po_number": "string or null",
"invoice_value": "string with currency symbol",
"line_items": [
{
"description": "string",
"quantity": "number or string",
"unit_price": "string with currency",
"total_price": "string with currency"
}
]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Include all line items found in the invoice
4. For line items, quantity can be number or string, prices should include currency
5. Do not include any explanations or notes
Invoice Text:
"""

    deepseek_r1_template = """Please extract the following information from the invoice text below and return ONLY the raw JSON without any markdown formatting or additional text:
{
"invoice_number": "string or null",
"invoice_date": "YYYY-MM-DD or null",
"po_number": "string or null",
"invoice_value": "string with currency or null",
"line_items": [
{
"description": "string",
"quantity": "number or string",
"unit_price": "string with currency",
"total_price": "string with currency"
}
]
}
Invoice Text:
"""

    llama_template = """Extract complete invoice information and return a VALID JSON object with these fields:
{
"invoice_header": {
"invoice_number": "string",
"invoice_date": "YYYY-MM-DD",
"po_number": "string or null",
"invoice_value": "string with currency",
"supplier_name": "string or null",
"customer_name": "string or null"
},
"line_items": [
{
"item_number": "string or null",
"description": "string",
"quantity": "number",
"unit_price": "string with currency",
"total_price": "string with currency"
}
]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Date format must be YYYY-MM-DD
4. All currency values must include currency symbol or code
5. Include all line items found in the invoice
6. For line items, quantity should be a number, prices as strings with currency
7. Do not include any explanations or notes
Invoice Text:
"""

    # Pick the instructions, then append the invoice text verbatim.
    if model_choice == "DeepSeek v3":
        template = deepseek_v3_template
    elif model_choice == "DeepSeek R1":
        template = deepseek_r1_template
    else:  # Llama 4 Mavericks
        template = llama_template
    return template + text
292
+
293
def format_currency(value):
    """Format a price for display.

    Args:
        value: A number, an already formatted string, or a missing value.

    Returns:
        * ``"$1,234.50"``-style text for ints and floats, including zero,
          so a free line item shows as ``"$0.00"`` rather than ``"N/A"``;
        * ``"N/A"`` for ``None``, empty strings and other empty values;
        * the value unchanged otherwise (e.g. a string that already carries
          its currency symbol).
    """
    # Numbers first, so a legitimate 0 is not mistaken for "missing".
    # bool is a subclass of int but is not a price, so it is excluded here.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"${value:,.2f}"
    if not value:
        return "N/A"
    return value
300
+
301
def display_line_items(line_items, model_choice="DeepSeek v3"):
    """Render invoice line items under a "Line Items" subheader.

    For "Llama 4 Mavericks" the items are shown as a numbered ``st.table``;
    for every other model they are laid out in four columns (description,
    quantity, unit price, total). An info message is shown instead when
    ``line_items`` is empty.
    """
    if not line_items:
        st.info("No line items found in this invoice.")
        return

    st.subheader("📋 Line Items")

    if model_choice == "Llama 4 Mavericks":
        # One table row per item, numbered from 1.
        rows = [
            {
                "#": position,
                "Description": entry.get("description", "N/A"),
                "Quantity": entry.get("quantity", 0),
                "Unit Price": entry.get("unit_price", "N/A"),
                "Total Price": entry.get("total_price", "N/A"),
            }
            for position, entry in enumerate(line_items, 1)
        ]
        st.table(rows)
        return

    # DeepSeek models: header row followed by one row of columns per item.
    widths = [4, 2, 2, 2]
    header_cols = st.columns(widths)
    with st.container():
        titles = ("**Description**", "**Qty**", "**Unit Price**", "**Total**")
        for column, title in zip(header_cols, titles):
            column.write(title)

    for entry in line_items:
        row = st.columns(widths)
        row[0].write(entry.get("description", "N/A"))
        row[1].write(entry.get("quantity", "N/A"))
        row[2].write(format_currency(entry.get("unit_price", "N/A")))
        row[3].write(format_currency(entry.get("total_price", "N/A")))
        # NOTE(review): indentation was lost in this view; the divider is
        # assumed to follow every item row - confirm against the repository.
        st.divider()
337
+
338
def _value_or_not_found(value):
    """Return ``value``, or "Not found" when it is ``None`` or an empty string."""
    return "Not found" if value is None or value == "" else value


def _sum_line_totals(line_items):
    """Add up the ``total_price`` of every line item.

    Prices may be numbers or strings such as "$1,234.50"; everything except
    digits and dots is stripped from strings before conversion. Items without
    a price are skipped. Returns ``None`` when any item cannot be read, so
    that no partial (misleading) total is displayed.
    """
    total = 0.0
    for item in line_items:
        if not isinstance(item, dict):
            return None
        price = item.get("total_price")
        if not price:
            continue
        if isinstance(price, (int, float)):
            total += float(price)
            continue
        try:
            total += float(re.sub(r'[^\d.]', '', str(price)))
        except ValueError:
            return None
    return total


def display_invoice_data(model_choice, invoice_data):
    """Render the extracted invoice in the Streamlit UI.

    For "Llama 4 Mavericks" the data is expected to carry an
    ``invoice_header`` dict plus ``line_items``; six header metrics are
    shown, then the line items, and - when the header has no invoice value -
    a total calculated from the line items. For every other model the four
    top-level fields are shown as metrics, followed by the line items.

    Missing fields are displayed as "Not found". Nothing is rendered when
    ``invoice_data`` is empty.
    """
    if not invoice_data:
        return

    # "or []" also covers an explicit null in the model output.
    line_items = invoice_data.get("line_items") or []

    if model_choice == "Llama 4 Mavericks":
        st.subheader("Invoice Summary")
        header = invoice_data.get("invoice_header") or {}

        # Fields may be present with a None value, so the fallback is applied
        # to the value itself instead of relying on a dict.get() default.
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Invoice Number", _value_or_not_found(header.get("invoice_number")))
            st.metric("Supplier", _value_or_not_found(header.get("supplier_name")))
        with col2:
            st.metric("Invoice Date", _value_or_not_found(header.get("invoice_date")))
            st.metric("Customer", _value_or_not_found(header.get("customer_name")))
        with col3:
            st.metric("PO Number", _value_or_not_found(header.get("po_number")))
            st.metric("Total Value", _value_or_not_found(header.get("invoice_value")))

        display_line_items(line_items, model_choice)

        # Fall back to a calculated total when the header does not carry one.
        if not header.get("invoice_value"):
            total = _sum_line_totals(line_items)
            if total is not None:
                st.metric("Calculated Total", f"${total:,.2f}")

    else:
        # DeepSeek models return a flat structure.
        st.success("Information extracted successfully!")

        col1, col2 = st.columns(2)
        with col1:
            st.metric("Invoice Number", _value_or_not_found(invoice_data.get("invoice_number")))
            st.metric("PO Number", _value_or_not_found(invoice_data.get("po_number")))

        with col2:
            st.metric("Invoice Date", _value_or_not_found(invoice_data.get("invoice_date")))
            st.metric("Invoice Value", format_currency(invoice_data.get("invoice_value")))

        display_line_items(line_items, model_choice)
385
+
386
def _normalize_line_items(raw_items, item_fields, coerce_quantity):
    """Return ``raw_items`` as a list of dicts that all carry ``item_fields``.

    Anything that is not a list becomes an empty list, and entries that are
    not dicts are dropped. Missing fields are filled with ``None``
    (``quantity`` with 0). With ``coerce_quantity`` set, a non-numeric
    quantity is converted with ``float()`` and falls back to 0 on failure.
    """
    if not isinstance(raw_items, list):
        return []

    items = [item for item in raw_items if isinstance(item, dict)]
    for item in items:
        for field in item_fields:
            if field not in item:
                item[field] = 0 if field == "quantity" else None
        if coerce_quantity and not isinstance(item.get("quantity"), (int, float)):
            try:
                item["quantity"] = float(item["quantity"])
            except (KeyError, ValueError, TypeError):
                item["quantity"] = 0
    return items


def extract_invoice_info(model_choice, text):
    """Extract structured invoice data from ``text`` using ``model_choice``.

    Builds the model-specific prompt, queries the API, parses the reply and
    normalizes it so that every expected field exists:

    * "Llama 4 Mavericks": ``invoice_header`` (six fields) and ``line_items``
      (five fields each, ``quantity`` coerced to a number);
    * any other model: four top-level fields and ``line_items`` (four fields
      each).

    Args:
        model_choice: Key into ``MODELS``.
        text: Invoice text (as read from the uploaded PDF).

    Returns:
        The normalized dict, or ``None`` when the API call failed or the
        reply could not be parsed into a JSON object (the raw reply is then
        shown in the UI).
    """
    prompt = get_extraction_prompt(model_choice, text)
    result = query_llm(model_choice, prompt)
    if not result:
        return None

    parsed_data = clean_json_response(result)
    # The normalization below indexes by field name, so only a non-empty
    # dict is usable.
    if not isinstance(parsed_data, dict) or not parsed_data:
        st.error("Failed to parse JSON. Raw response:")
        st.code(result)
        return None

    is_llama = model_choice == "Llama 4 Mavericks"
    if is_llama:
        # A missing, null or malformed header is replaced by an empty one.
        header = parsed_data.get("invoice_header")
        if not isinstance(header, dict):
            header = {}
            parsed_data["invoice_header"] = header
        for field in ("invoice_number", "invoice_date", "po_number",
                      "invoice_value", "supplier_name", "customer_name"):
            header.setdefault(field, None)
        item_fields = ("item_number", "description", "quantity",
                       "unit_price", "total_price")
    else:  # DeepSeek models
        for field in ("invoice_number", "invoice_date", "po_number", "invoice_value"):
            parsed_data.setdefault(field, None)
        item_fields = ("description", "quantity", "unit_price", "total_price")

    parsed_data["line_items"] = _normalize_line_items(
        parsed_data.get("line_items"), item_fields, coerce_quantity=is_llama
    )
    return parsed_data