Seth0330 committed on
Commit
b3585ca
·
verified ·
1 Parent(s): 572e346

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +393 -156
app.py CHANGED
@@ -1,213 +1,450 @@
1
  import streamlit as st
 
2
  import io
3
  import requests
4
  import json
5
  import re
6
  import os
 
7
 
8
- from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
9
-
# Page-wide Streamlit settings: browser-tab title and full-width layout.
st.set_page_config(layout="wide", page_title="PDF Tools")
 
 
11
 
 
# Registry of selectable LLM back-ends, keyed by the label shown in the UI.
# Each entry carries the endpoint URL, the model identifier sent in the
# request, the environment variable holding the API key, the optional
# ``response_format`` payload field and any additional HTTP headers.
MODELS = {
    "DeepSeek v3": dict(
        api_url="https://api.deepseek.com/v1/chat/completions",
        model="deepseek-chat",
        key_env="DEEPSEEK_API_KEY",
        response_format={"type": "json_object"},
    ),
    "DeepSeek R1": dict(
        api_url="https://api.deepseek.com/v1/chat/completions",
        model="deepseek-reasoner",
        key_env="DEEPSEEK_API_KEY",
        response_format=None,
    ),
    "Llama 4 Mavericks": dict(
        api_url="https://openrouter.ai/api/v1/chat/completions",
        model="meta-llama/llama-4-maverick:free",
        key_env="OPENROUTER_API_KEY",
        response_format={"type": "json_object"},
        extra_headers={
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    ),
    "Mistral Small": dict(
        # Update api_url and model with your Azure values.
        api_url="https://ezofisai.services.ai.azure.com/api/projects/firstProject",
        # Not used by Azure; kept for completeness.
        model="mistral-small-2503",
        key_env="AZUREMIST_API_KEY",
        response_format={"type": "json_object"},
        # No extra_headers needed for Azure.
    ),
}
44
 
def get_api_key(model_choice):
    """Look up the API key for ``model_choice`` in the environment.

    The variable name comes from the model's ``key_env`` entry in ``MODELS``.
    When the variable is unset or empty an error is shown and ``st.stop()``
    is called.
    """
    env_name = MODELS[model_choice]["key_env"]
    key = os.getenv(env_name)
    if key:
        return key
    st.error(f"❌ {env_name} not set")
    st.stop()
    return key
51
 
def query_llm(model_choice, prompt):
    """Send ``prompt`` to the selected model and return the reply text.

    Args:
        model_choice: A key of ``MODELS`` (the label shown in the UI).
        prompt: The user message to send.

    Returns:
        The ``content`` of the first choice, or ``None`` after reporting the
        problem with ``st.error`` when the request fails, the status is not
        200, or the body does not have the expected shape.

    Side effects:
        On success stores the reply in ``st.session_state.last_api`` and the
        raw body in ``st.session_state.last_raw``.
    """
    cfg = MODELS[model_choice]
    # ``model_choice`` is the MODELS key, so the Azure entry must be detected
    # by its key; comparing against the model id never matched.
    uses_azure = model_choice == "Mistral Small"
    api_key = get_api_key(model_choice)

    headers = {"Content-Type": "application/json"}
    if uses_azure:
        # Azure expects the key in an ``api-key`` header.
        headers["api-key"] = api_key
    else:
        headers["Authorization"] = f"Bearer {api_key}"
    if cfg.get("extra_headers"):
        headers.update(cfg["extra_headers"])

    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000,
    }
    # Only non-Azure APIs need "model" in the payload.
    if not uses_azure:
        payload["model"] = cfg["model"]
    if cfg.get("response_format"):
        payload["response_format"] = cfg["response_format"]

    try:
        with st.spinner(f"🔍 Querying {model_choice}..."):
            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
    except requests.exceptions.RequestException as e:
        st.error(f"Connection error: {e}")
        return None

    if r.status_code != 200:
        st.error(f"🚨 API Error {r.status_code}: {r.text}")
        return None

    try:
        content = r.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # A malformed body is not a connection problem; report it as such.
        st.error(f"Unexpected API response: {e}")
        st.code(r.text)
        return None

    st.session_state.last_api = content
    st.session_state.last_raw = r.text
    return content
87
 
def clean_json_response(text):
    """Extract and parse the JSON object embedded in an LLM reply.

    Markdown code fences are stripped and the span from the first ``{`` to
    the last ``}`` is parsed. A strict parse is tried first; only if that
    fails are the repairs applied (trailing-comma removal, then insertion of
    missing commas between adjacent fields), so valid JSON is never altered.

    Args:
        text: Raw reply text; may be ``None`` or empty.

    Returns:
        The parsed JSON value, or ``None`` when ``text`` is empty or nothing
        parseable is found (the failure is shown via ``st.error``/``st.code``).
    """
    if not text:
        return None

    # Strip ``` fences.
    stripped = re.sub(r'```(?:json)?', '', text).strip()

    # Find the outer braces.
    start, end = stripped.find('{'), stripped.rfind('}') + 1
    if start < 0 or end <= start:
        st.error("Couldn't locate JSON in response.")
        st.code(text)
        return None
    frag = stripped[start:end]

    # Candidates in increasing order of invasiveness.
    without_trailing = re.sub(r',\s*([}\]])', r'\1', frag)
    with_commas = re.sub(r'"\s*"\s*(?="[^"]+"\s*:)', '","', without_trailing)

    first_error = None
    for candidate in (frag, without_trailing, with_commas):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as err:
            if first_error is None:
                first_error = err

    st.error(f"JSON parse error: {first_error}")
    st.code(frag)
    return None
114
-
def fallback_supplier(text):
    """Return the first non-blank line of ``text``, stripped, as a supplier guess.

    Args:
        text: Text to scan; ``None`` or an empty string is accepted.

    Returns:
        The first line that is non-empty after stripping whitespace, or
        ``None`` when there is no such line.
    """
    if not text:
        return None
    stripped_lines = (line.strip() for line in text.splitlines())
    return next((line for line in stripped_lines if line), None)
121
 
def get_extraction_prompt(model_choice, txt):
    """Build the extraction prompt for ``model_choice`` with ``txt`` appended.

    Models whose label starts with "DeepSeek" get a flat schema; every other
    model gets a schema with a nested ``invoice_header`` object.
    """
    invoice_text = f"Invoice Text:\n{txt}"

    if model_choice.startswith("DeepSeek"):
        pieces = [
            "Extract full invoice info and RETURN ONLY a single-line json object with fields:\n",
            '{"invoice_number":"string","invoice_date":"YYYY-MM-DD",',
            '"po_number":"string|null","invoice_value":"string with currency",',
            '"line_items":[{"description":"string","quantity":"number","unit_price":"string with currency","total_price":"string with currency"}]}\n',
            "Use null for missing. NO extra text.\n\n",
        ]
        return "".join(pieces) + invoice_text

    pieces = [
        "Extract invoice data and RETURN ONLY a compact, one-line json object exactly:\n",
        '{"invoice_header":{"invoice_number":"string","invoice_date":"YYYY-MM-DD",',
        '"po_number":"string|null","invoice_value":"string with currency",',
        '"supplier_name":"string|null","customer_name":"string|null"},',
        '"line_items":[{"item_number":"string|null","description":"string","quantity":number,',
        '"unit_price":"string with currency","total_price":"string with currency"}]}\n',
        "Use null for missing. NO extras.\n\n",
    ]
    return "".join(pieces) + invoice_text
 
 
 
 
143
 
def extract_invoice_info(model_choice, text):
    """Query the model for invoice data and normalise the parsed result.

    Args:
        model_choice: A key of ``MODELS``.
        text: Invoice text to send to the model.

    Returns:
        A dict with every expected field present (missing ones set to
        ``None``), or ``None`` when the query or parsing fails or the reply
        is not a JSON object. For "Llama 4 Mavericks" and "Mistral Small" the
        fields live under ``invoice_header`` and a missing supplier name is
        filled from ``fallback_supplier(text)``; other models use a flat
        layout.
    """
    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None
    data = clean_json_response(raw)
    if not data:
        return None
    if not isinstance(data, dict):
        st.error("Model response was not a JSON object.")
        return None

    nested = model_choice in ("Llama 4 Mavericks", "Mistral Small")

    if nested:
        # A JSON ``null`` header would be returned as-is by setdefault, so
        # replace anything that is not a dict.
        hdr = data.get("invoice_header")
        if not isinstance(hdr, dict):
            hdr = {}
        data["invoice_header"] = hdr
        for k in ("invoice_number", "invoice_date", "po_number",
                  "invoice_value", "supplier_name", "customer_name"):
            hdr.setdefault(k, None)
        if not hdr.get("supplier_name"):
            hdr["supplier_name"] = fallback_supplier(text)
        item_keys = ("item_number", "description", "quantity", "unit_price", "total_price")
    else:
        for k in ("invoice_number", "invoice_date", "po_number", "invoice_value"):
            data.setdefault(k, None)
        item_keys = ("description", "quantity", "unit_price", "total_price")

    # Same null/garbage protection for the line items.
    items = data.get("line_items")
    if not isinstance(items, list):
        items = []
    items = [itm for itm in items if isinstance(itm, dict)]
    data["line_items"] = items
    for itm in items:
        for k in item_keys:
            itm.setdefault(k, None)

    return data
173
-
# ---- UI ----
TAB_LABELS = ["PDF Summarizer","Invoice Extractor"]
tab1, tab2 = st.tabs(TAB_LABELS)

# Summarizer tab: upload a PDF, pick a percentage, render the bullet summary.
with tab1:
    st.title("PDF Bullet Points")
    pdf = st.file_uploader("Upload PDF", type="pdf")
    pct = st.slider("Summarization %", 1, 100, 20)
    summarize_clicked = st.button("Summarize")
    if summarize_clicked and pdf:
        pdf_stream = io.BytesIO(pdf.getvalue())
        txt = read_pdf(pdf_stream)
        keys = extract_key_phrases(txt)
        scores = score_sentences(txt, keys)
        # At least one bullet, otherwise pct percent of the scored sentences.
        wanted = len(scores) * pct // 100
        n = max(1, wanted)
        bullets = summarize_text(scores, num_points=n)
        st.markdown(bullets)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
 
# Invoice tab: choose a model, upload an invoice PDF, show the extracted fields.
with tab2:
    st.title("Invoice Extractor")
    mdl = st.selectbox("Model", list(MODELS.keys()))
    inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
    extract_clicked = st.button("Extract")
    if extract_clicked and inv_pdf:
        txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
        info = extract_invoice_info(mdl, txt)
        if info:
            st.success("Extraction Complete")
            if mdl in ("Llama 4 Mavericks","Mistral Small"):
                h = info["invoice_header"]
                # One (label, header key) pair per metric, two metrics per column.
                layout = (
                    (("Invoice #", "invoice_number"), ("Supplier", "supplier_name")),
                    (("Date", "invoice_date"), ("Customer", "customer_name")),
                    (("PO #", "po_number"), ("Total", "invoice_value")),
                )
                for column, pairs in zip(st.columns(3), layout):
                    for label, key in pairs:
                        column.metric(label, h[key])
            else:
                layout = (
                    (("Invoice #", "invoice_number"), ("PO #", "po_number")),
                    (("Date", "invoice_date"), ("Value", "invoice_value")),
                )
                for column, pairs in zip(st.columns(2), layout):
                    for label, key in pairs:
                        column.metric(label, info[key])
            st.subheader("Line Items")
            st.table(info["line_items"])

    # NOTE(review): indentation was lost in the source; the debug expander is
    # assumed to sit at tab level, gated only on session state — confirm.
    if "last_api" in st.session_state:
        with st.expander("Debug"):
            st.code(st.session_state.last_api)
            st.code(st.session_state.last_raw)
 
 
1
  import streamlit as st
2
+ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
3
  import io
4
  import requests
5
  import json
6
  import re
7
  import os
8
+ from datetime import datetime
9
 
# Configure Streamlit: browser-tab title and full-width layout.
st.set_page_config(layout="wide", page_title="PDF Tools - Summarizer & Invoice Extractor")
15
 
# Model configuration for the Invoice Extractor, keyed by the label shown in
# the model selector. Each entry gives the endpoint URL, the model name sent
# in the request, the environment variable holding the API key, the optional
# ``response_format`` payload field and any additional HTTP headers.
MODELS = {
    "DeepSeek v3": dict(
        api_url="https://api.deepseek.com/v1/chat/completions",
        model_name="deepseek-chat",
        api_key_env="DEEPSEEK_API_KEY",
        response_format={"type": "json_object"},
    ),
    "DeepSeek R1": dict(
        api_url="https://api.deepseek.com/v1/chat/completions",
        model_name="deepseek-reasoner",
        api_key_env="DEEPSEEK_API_KEY",
        response_format=None,
    ),
    "Llama 4 Mavericks": dict(
        api_url="https://openrouter.ai/api/v1/chat/completions",
        model_name="meta-llama/llama-4-maverick:free",
        api_key_env="OPENROUTER_API_KEY",
        response_format={"type": "json_object"},
        extra_headers={
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    ),
}
41
 
def get_api_key(model_choice):
    """Return the API key configured for ``model_choice``.

    The environment variable name is read from the model's ``api_key_env``
    entry in ``MODELS``. If the variable is unset or empty, an error is shown
    and ``st.stop()`` is called.
    """
    env_var = MODELS[model_choice]["api_key_env"]
    secret = os.environ.get(env_var)
    if secret:
        return secret
    st.error(f"❌ `{env_var}` environment variable not set!")
    st.stop()
    return secret
50
 
def query_llm(model_choice, prompt):
    """Send ``prompt`` to the API configured for ``model_choice``.

    Args:
        model_choice: A key of ``MODELS``.
        prompt: The user message to send.

    Returns:
        The ``content`` of the first choice, or ``None`` after reporting the
        problem with ``st.error`` when the request fails, the status is not
        200, or the body does not have the expected shape.

    Side effects:
        On success stores the reply in
        ``st.session_state.last_api_response`` and the raw body in
        ``st.session_state.last_api_response_raw``.
    """
    config = MODELS[model_choice]
    headers = {
        "Authorization": f"Bearer {get_api_key(model_choice)}",
        "Content-Type": "application/json",
    }
    headers.update(config.get("extra_headers") or {})

    payload = {
        "model": config["model_name"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000,
    }
    # ``.get`` so that an entry without the optional key does not raise.
    response_format = config.get("response_format")
    if response_format:
        payload["response_format"] = response_format

    try:
        with st.spinner(f"🔍 Analyzing with {model_choice}..."):
            response = requests.post(config["api_url"], headers=headers, json=payload, timeout=90)
    except requests.exceptions.RequestException as e:
        st.error(f"🌐 Connection Failed: {str(e)}")
        return None

    if response.status_code != 200:
        st.error(f"🚨 API Error {response.status_code}: {response.text}")
        return None

    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # Covers a non-JSON body, an empty ``choices`` list and null fields.
        # The raw text is shown because re-parsing the body could raise again.
        st.error(
            f"Unexpected API response ({type(e).__name__}: {e})\n"
            f"Full response: {response.text}"
        )
        return None

    st.session_state.last_api_response = content
    st.session_state.last_api_response_raw = response.text
    return content
92
 
def clean_json_response(text):
    """Extract a JSON object from an LLM reply.

    Candidates are tried in order: the whole text, the contents of a
    Markdown code fence, and the span from the first ``{`` to the last ``}``.
    Each candidate is parsed strictly first and then once more with trailing
    commas removed. Only JSON objects are accepted, because callers index
    the result as a dict.

    The earlier string-splicing "reconstruction" step is gone: it inserted a
    stray ``{`` before ``"line_items"`` and so produced invalid JSON for the
    schema the prompts request.

    Args:
        text: Raw reply text; may be ``None`` or empty.

    Returns:
        The parsed dict, or ``None`` when no JSON object can be recovered.
    """
    if not text:
        return None

    candidates = [text]

    # Fenced block; ``\s*`` tolerates CRLF and padding around the fences.
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))

    # Outermost braces, for replies that wrap the JSON in prose.
    first_brace = text.find('{')
    last_brace = text.rfind('}')
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace:last_brace + 1])

    for candidate in candidates:
        # The repair is attempted only after the strict parse fails, so valid
        # JSON is never modified.
        repaired = re.sub(r',\s*([}\]])', r'\1', candidate)
        for attempt in (candidate, repaired):
            try:
                parsed = json.loads(attempt)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

    return None
142
 
def get_extraction_prompt(model_choice, text):
    """Return the extraction prompt for ``model_choice`` with ``text`` appended.

    "DeepSeek v3" and "DeepSeek R1" get a flat schema; any other model gets
    the schema with a nested ``invoice_header`` (the Llama 4 Mavericks form).
    """
    # NOTE(review): the source listing lost leading whitespace, so template
    # lines are reproduced as they appear there (no indentation) — confirm.
    if model_choice == "DeepSeek v3":
        template = (
            "Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:",
            "{",
            '"invoice_number": "string",',
            '"invoice_date": "YYYY-MM-DD",',
            '"po_number": "string or null",',
            '"invoice_value": "string with currency symbol",',
            '"line_items": [',
            "{",
            '"description": "string",',
            '"quantity": "number or string",',
            '"unit_price": "string with currency",',
            '"total_price": "string with currency"',
            "}",
            "]",
            "}",
            "Rules:",
            "1. Return ONLY valid JSON (no additional text or markdown)",
            "2. Use null for missing fields",
            "3. Include all line items found in the invoice",
            "4. For line items, quantity can be number or string, prices should include currency",
            "5. Do not include any explanations or notes",
            "Invoice Text:",
        )
    elif model_choice == "DeepSeek R1":
        template = (
            "Please extract the following information from the invoice text below and return ONLY the raw JSON without any markdown formatting or additional text:",
            "{",
            '"invoice_number": "string or null",',
            '"invoice_date": "YYYY-MM-DD or null",',
            '"po_number": "string or null",',
            '"invoice_value": "string with currency or null",',
            '"line_items": [',
            "{",
            '"description": "string",',
            '"quantity": "number or string",',
            '"unit_price": "string with currency",',
            '"total_price": "string with currency"',
            "}",
            "]",
            "}",
            "Invoice Text:",
        )
    else:  # Llama 4 Mavericks
        template = (
            "Extract complete invoice information and return a VALID JSON object with these fields:",
            "{",
            '"invoice_header": {',
            '"invoice_number": "string",',
            '"invoice_date": "YYYY-MM-DD",',
            '"po_number": "string or null",',
            '"invoice_value": "string with currency",',
            '"supplier_name": "string or null",',
            '"customer_name": "string or null"',
            "},",
            '"line_items": [',
            "{",
            '"item_number": "string or null",',
            '"description": "string",',
            '"quantity": "number",',
            '"unit_price": "string with currency",',
            '"total_price": "string with currency"',
            "}",
            "]",
            "}",
            "Rules:",
            "1. Return ONLY valid JSON (no additional text or markdown)",
            "2. Use null for missing fields",
            "3. Date format must be YYYY-MM-DD",
            "4. All currency values must include currency symbol or code",
            "5. Include all line items found in the invoice",
            "6. For line items, quantity should be a number, prices as strings with currency",
            "7. Do not include any explanations or notes",
            "Invoice Text:",
        )

    # The invoice text starts on the line after "Invoice Text:".
    return "\n".join(template) + "\n" + text
220
+
def format_currency(value):
    """Format a currency value consistently for display.

    Args:
        value: A number, a pre-formatted string, or an empty/missing value.

    Returns:
        ``"N/A"`` for ``None``, empty strings and other empty values;
        ``"$1,234.50"``-style text for ints and floats, including zero;
        any other value unchanged.
    """
    # A numeric zero is a real amount and must not be reported as missing.
    is_zero_amount = (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and value == 0
    )
    if not value and not is_zero_amount:
        return "N/A"
    if isinstance(value, (int, float)):
        return f"${value:,.2f}"
    return value
228
+
def display_line_items(line_items, model_choice="DeepSeek v3"):
    """Render invoice line items.

    For "Llama 4 Mavericks" the items are shown with ``st.table``; for every
    other model they are written row by row into four columns.

    Args:
        line_items: List of item dicts; an empty or missing list shows an
            informational message instead.
        model_choice: Label of the model that produced the items.
    """
    if not line_items:
        st.info("No line items found in this invoice. This may be due to incomplete data from the API.")
        return

    def _cell(item, key, default="N/A"):
        # The key may be present with an explicit None, so a plain
        # ``.get(key, default)`` would never fall back.
        value = item.get(key) if isinstance(item, dict) else None
        return default if value is None else value

    st.subheader("📋 Line Items")

    if model_choice == "Llama 4 Mavericks":
        # Display as a table for Llama.
        st.table([
            {
                "#": idx,
                "Description": _cell(item, "description"),
                "Quantity": _cell(item, "quantity", 0),
                "Unit Price": _cell(item, "unit_price"),
                "Total Price": _cell(item, "total_price"),
            }
            for idx, item in enumerate(line_items, 1)
        ])
        return

    # Display in columns for DeepSeek models.
    column_widths = [4, 2, 2, 2]
    titles = ("**Description**", "**Qty**", "**Unit Price**", "**Total**")
    for column, title in zip(st.columns(column_widths), titles):
        column.write(title)

    for item in line_items:
        cols = st.columns(column_widths)
        cols[0].write(_cell(item, "description"))
        cols[1].write(_cell(item, "quantity"))
        cols[2].write(format_currency(_cell(item, "unit_price")))
        cols[3].write(format_currency(_cell(item, "total_price")))
        # NOTE(review): indentation was lost in the source; the divider is
        # assumed to follow every row — confirm.
        st.divider()
265
+
def display_invoice_data(model_choice, invoice_data):
    """Render the extracted invoice fields and line items.

    "Llama 4 Mavericks" results are read from the nested ``invoice_header``
    and shown in three columns, with a total calculated from the line items
    when the header has no ``invoice_value``. Other models use the flat
    layout in two columns.

    Args:
        model_choice: Label of the model that produced ``invoice_data``.
        invoice_data: Normalised extraction result; nothing is rendered when
            it is empty or ``None``.
    """
    if not invoice_data:
        return

    def _shown(value):
        # Fields may be present with an explicit None, so a ``.get`` default
        # alone would not apply.
        return "Not found" if value is None else value

    line_items = invoice_data.get("line_items") or []

    if model_choice == "Llama 4 Mavericks":
        # Display header information.
        st.subheader("Invoice Summary")
        header = invoice_data.get("invoice_header") or {}

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Invoice Number", _shown(header.get("invoice_number")))
            st.metric("Supplier", _shown(header.get("supplier_name")))
        with col2:
            st.metric("Invoice Date", _shown(header.get("invoice_date")))
            st.metric("Customer", _shown(header.get("customer_name")))
        with col3:
            st.metric("PO Number", _shown(header.get("po_number")))
            st.metric("Total Value", _shown(header.get("invoice_value")))

        # Display line items.
        display_line_items(line_items, model_choice)

        # Calculate and display a total if the header does not provide one.
        if not header.get("invoice_value"):
            try:
                # ``str()`` lets numeric totals through; everything except
                # digits and dots is stripped before conversion.
                total = sum(
                    float(re.sub(r'[^\d.]', '', str(item["total_price"])))
                    for item in line_items
                    if isinstance(item, dict) and item.get("total_price")
                )
            except ValueError:
                # An unparseable price: show no total rather than a wrong one.
                total = None
            if total is not None:
                st.metric("Calculated Total", f"${total:,.2f}")
        return

    # Display for DeepSeek models.
    st.success("Information extracted successfully!")

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Invoice Number", _shown(invoice_data.get("invoice_number")))
        st.metric("PO Number", _shown(invoice_data.get("po_number")))
    with col2:
        st.metric("Invoice Date", _shown(invoice_data.get("invoice_date")))
        st.metric("Invoice Value", format_currency(invoice_data.get("invoice_value")))

    # Display line items for both DeepSeek models.
    display_line_items(line_items, model_choice)
313
 
def extract_invoice_info(model_choice, text):
    """Extract structured invoice data from ``text`` using the chosen model.

    Args:
        model_choice: A key of ``MODELS``.
        text: Invoice text to send to the model.

    Returns:
        A dict with every expected field present, or ``None`` when the query
        fails or the reply cannot be parsed into a JSON object. For
        "Llama 4 Mavericks" the header fields live under ``invoice_header``
        and each item's ``quantity`` is coerced to a number (0 on failure);
        other models use a flat layout. Missing fields default to ``None``,
        except ``quantity`` which defaults to 0.
    """
    prompt = get_extraction_prompt(model_choice, text)
    result = query_llm(model_choice, prompt)
    if not result:
        return None

    parsed_data = clean_json_response(result)
    if not parsed_data or not isinstance(parsed_data, dict):
        st.error("Failed to parse JSON. Raw response:")
        st.code(result)
        return None

    is_llama = model_choice == "Llama 4 Mavericks"

    # Normalize data structure based on model.
    if is_llama:
        # The model may emit ``"invoice_header": null``; a membership test
        # alone would leave that None in place and crash on indexing.
        header = parsed_data.get("invoice_header")
        if not isinstance(header, dict):
            header = {}
        parsed_data["invoice_header"] = header
        for field in ("invoice_number", "invoice_date", "po_number",
                      "invoice_value", "supplier_name", "customer_name"):
            header.setdefault(field, None)
        item_fields = ("item_number", "description", "quantity", "unit_price", "total_price")
    else:  # DeepSeek models
        for field in ("invoice_number", "invoice_date", "po_number", "invoice_value"):
            parsed_data.setdefault(field, None)
        item_fields = ("description", "quantity", "unit_price", "total_price")

    # Same protection for the items: a null list or non-dict entries would
    # otherwise raise during normalisation.
    raw_items = parsed_data.get("line_items")
    if not isinstance(raw_items, list):
        raw_items = []
    items = [item for item in raw_items if isinstance(item, dict)]
    parsed_data["line_items"] = items

    for item in items:
        for field in item_fields:
            item.setdefault(field, 0 if field == "quantity" else None)
        if is_llama and not isinstance(item["quantity"], (int, float)):
            try:
                item["quantity"] = float(item["quantity"])
            except (ValueError, TypeError):
                item["quantity"] = 0

    return parsed_data
370
 
# Create tabs for different functionalities
tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])

# PDF Summarizer Tab: upload a PDF, choose how much of it to keep, and render
# the resulting bullet-point summary.
with tab1:
    st.title("PDF to Bullet Point Summarizer 🗟 🔏")

    # File uploader for the PDF
    uploaded_file = st.file_uploader("Upload your PDF document", type="pdf", key="pdf_uploader")

    # Slider for users to select the summarization extent
    summary_scale = st.slider("Select the extent of summarization (%)", min_value=1, max_value=100, value=20, key="summary_scale")

    # Submit button
    submit_button = st.button("Generate Summary", key="summary_button")

    if submit_button and uploaded_file is None:
        # Without this the click would appear to do nothing.
        st.warning("Please upload a PDF document before generating a summary.")
    elif submit_button:
        with st.spinner('Processing...'):
            # Read the PDF content
            text = read_pdf(io.BytesIO(uploaded_file.getvalue()))

            # Extract key phrases from the text
            key_phrases = extract_key_phrases(text)

            # Score sentences based on the key phrases
            sentence_scores = score_sentences(text, key_phrases)

            # Number of bullet points: the selected percentage of the scored
            # sentences, but never fewer than one.
            total_sentences = len(sentence_scores)
            num_points = max(1, total_sentences * summary_scale // 100)

            # Generate the bullet-point summary
            summary = summarize_text(sentence_scores, num_points=num_points)

            # Display the summary as bullet points
            st.subheader("Here's the summary: ")
            st.markdown(summary)
409
 
# Invoice Extractor Tab: choose a model, upload an invoice PDF, and display
# the fields extracted by that model.
with tab2:
    st.title("📋 Invoice Extractor from PDF")
    st.write("Upload an invoice PDF to extract key details")

    # Model selection
    model_choice = st.selectbox(
        "Select AI Model",
        list(MODELS.keys()),
        index=0,
        help="Choose which AI model to use for extraction",
        key="model_choice"
    )

    # File uploader for the invoice PDF
    invoice_pdf = st.file_uploader("Upload Invoice PDF", type="pdf", key="invoice_pdf_uploader")

    extract_clicked = st.button("Extract Invoice Information", key="invoice_button")

    if extract_clicked and invoice_pdf is None:
        # Without this the click would appear to do nothing.
        st.warning("Please upload an invoice PDF before extracting.")
    elif extract_clicked:
        with st.spinner('Reading PDF...'):
            # Read the PDF content
            invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))

        if not invoice_text or not invoice_text.strip():
            # Nothing to send to the model, so skip the API call.
            st.error("No text could be read from this PDF.")
        else:
            # Process in status container
            with st.status("Processing...", expanded=True) as status:
                st.write(f"🤖 Querying {model_choice} API...")
                invoice_data = extract_invoice_info(model_choice, invoice_text)

                if invoice_data:
                    status.update(label="✅ Extraction Complete!", state="complete")
                    display_invoice_data(model_choice, invoice_data)
                else:
                    status.update(label="❌ Extraction Failed", state="error")
                    st.error("Failed to extract information. Try simplifying the text.")

            # Debug information outside the status container
            if invoice_data and "last_api_response" in st.session_state:
                with st.expander("Debug Information"):
                    st.write("API Response:")
                    st.json(st.session_state.last_api_response)
                    st.write("Raw API Response:")
                    st.code(st.session_state.get("last_api_response_raw", "No response"))