Seth0330 committed on
Commit
e6cd773
·
verified ·
1 Parent(s): 0592d14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -8
app.py CHANGED
@@ -6,7 +6,7 @@ import re
6
  import os
7
  import time
8
 
9
- from main import extract_key_phrases, score_sentences, summarize_text # read_pdf removed
10
 
11
  st.set_page_config(page_title="PDF Tools", layout="wide")
12
 
@@ -114,9 +114,83 @@ def fallback_supplier(text):
114
  return None
115
 
116
  def get_extraction_prompt(model_choice, txt):
117
- # (no change, reuse as before)
118
  return (
119
- # [--- omitted for brevity; keep as is ---]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  "\nInvoice Text:\n"
121
  f"{txt}"
122
  )
@@ -130,7 +204,6 @@ def extract_invoice_info(model_choice, text):
130
  if not data:
131
  return None
132
 
133
- # (no change, reuse as before)
134
  if model_choice.startswith("DeepSeek"):
135
  header = {k: v for k, v in data.items() if k != "line_items"}
136
  items = data.get("line_items", [])
@@ -165,8 +238,11 @@ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set this in your environment
165
 
166
  def extract_text_from_pdf_unstract(pdf_file):
167
  headers = {"unstract-key": UNSTRACT_API_KEY}
168
- # Step 1: POST /whisper with the PDF
169
- files = {"file": pdf_file}
 
 
 
170
  whisper_url = f"{UNSTRACT_BASE}/whisper"
171
  with st.spinner("Uploading and processing PDF with Unstract..."):
172
  r = requests.post(whisper_url, files=files, headers=headers)
@@ -180,7 +256,7 @@ def extract_text_from_pdf_unstract(pdf_file):
180
 
181
  # Step 2: Poll /whisper-status until processed
182
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
183
- for i in range(30): # Wait up to ~30 x 2 = 60 seconds
184
  status_r = requests.get(status_url, headers=headers)
185
  if status_r.status_code != 200:
186
  st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
@@ -200,7 +276,12 @@ def extract_text_from_pdf_unstract(pdf_file):
200
  if r.status_code != 200:
201
  st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
202
  return None
203
- return r.json().get("result_text") or r.text
 
 
 
 
 
204
 
205
  # --------- INVOICE EXTRACTOR UI ---------
206
  st.title("Invoice Extractor")
 
6
  import os
7
  import time
8
 
9
+ from main import extract_key_phrases, score_sentences, summarize_text # Only if still needed for later
10
 
11
  st.set_page_config(page_title="PDF Tools", layout="wide")
12
 
 
114
  return None
115
 
116
  def get_extraction_prompt(model_choice, txt):
 
117
  return (
118
+ "You are an expert invoice parser. "
119
+ "Extract data according to the visible table structure and column headers in the invoice. "
120
+ "For every line item, only extract fields that correspond to the table columns for that row (do not include header/shipment fields in line items). "
121
+ "Merge all multi-line content within a single cell into that field (especially for the 'description' and 'notes'). "
122
+ "Shipment/invoice-level fields such as CAR NUMBER, SHIPPING POINT, SHIPMENT NUMBER, CURRENCY, etc., must go ONLY into the 'invoice_header', not as line item fields.\n"
123
+ "Use this schema:\n"
124
+ '{\n'
125
+ ' "invoice_header": {\n'
126
+ ' "car_number": "string or null",\n'
127
+ ' "shipment_number": "string or null",\n'
128
+ ' "shipping_point": "string or null",\n'
129
+ ' "currency": "string or null",\n'
130
+ ' "invoice_number": "string or null",\n'
131
+ ' "invoice_date": "string or null",\n'
132
+ ' "order_number": "string or null",\n'
133
+ ' "customer_order_number": "string or null",\n'
134
+ ' "our_order_number": "string or null",\n'
135
+ ' "sales_order_number": "string or null",\n'
136
+ ' "purchase_order_number": "string or null",\n'
137
+ ' "order_date": "string or null",\n'
138
+ ' "supplier_name": "string or null",\n'
139
+ ' "supplier_address": "string or null",\n'
140
+ ' "supplier_phone": "string or null",\n'
141
+ ' "supplier_email": "string or null",\n'
142
+ ' "supplier_tax_id": "string or null",\n'
143
+ ' "customer_name": "string or null",\n'
144
+ ' "customer_address": "string or null",\n'
145
+ ' "customer_phone": "string or null",\n'
146
+ ' "customer_email": "string or null",\n'
147
+ ' "customer_tax_id": "string or null",\n'
148
+ ' "ship_to_name": "string or null",\n'
149
+ ' "ship_to_address": "string or null",\n'
150
+ ' "bill_to_name": "string or null",\n'
151
+ ' "bill_to_address": "string or null",\n'
152
+ ' "remit_to_name": "string or null",\n'
153
+ ' "remit_to_address": "string or null",\n'
154
+ ' "tax_id": "string or null",\n'
155
+ ' "tax_registration_number": "string or null",\n'
156
+ ' "vat_number": "string or null",\n'
157
+ ' "payment_terms": "string or null",\n'
158
+ ' "payment_method": "string or null",\n'
159
+ ' "payment_reference": "string or null",\n'
160
+ ' "bank_account_number": "string or null",\n'
161
+ ' "iban": "string or null",\n'
162
+ ' "swift_code": "string or null",\n'
163
+ ' "total_before_tax": "string or null",\n'
164
+ ' "tax_amount": "string or null",\n'
165
+ ' "tax_rate": "string or null",\n'
166
+ ' "shipping_charges": "string or null",\n'
167
+ ' "discount": "string or null",\n'
168
+ ' "total_due": "string or null",\n'
169
+ ' "amount_paid": "string or null",\n'
170
+ ' "balance_due": "string or null",\n'
171
+ ' "due_date": "string or null",\n'
172
+ ' "invoice_status": "string or null",\n'
173
+ ' "reference_number": "string or null",\n'
174
+ ' "project_code": "string or null",\n'
175
+ ' "department": "string or null",\n'
176
+ ' "contact_person": "string or null",\n'
177
+ ' "notes": "string or null",\n'
178
+ ' "additional_info": "string or null"\n'
179
+ ' },\n'
180
+ ' "line_items": [\n'
181
+ ' {\n'
182
+ ' "quantity": "string or null",\n'
183
+ ' "units": "string or null",\n'
184
+ ' "description": "string or null",\n'
185
+ ' "footage": "string or null",\n'
186
+ ' "price": "string or null",\n'
187
+ ' "amount": "string or null",\n'
188
+ ' "notes": "string or null"\n'
189
+ ' }\n'
190
+ ' ]\n'
191
+ '}'
192
+ "\nIf a field is missing for a line item or header, use null. "
193
+ "Do not invent fields. Do not add any header or shipment data to any line item. Return ONLY the JSON object, no explanation.\n"
194
  "\nInvoice Text:\n"
195
  f"{txt}"
196
  )
 
204
  if not data:
205
  return None
206
 
 
207
  if model_choice.startswith("DeepSeek"):
208
  header = {k: v for k, v in data.items() if k != "line_items"}
209
  items = data.get("line_items", [])
 
238
 
239
  def extract_text_from_pdf_unstract(pdf_file):
240
  headers = {"unstract-key": UNSTRACT_API_KEY}
241
+ pdf_bytes = pdf_file.read()
242
+ filename = pdf_file.name if hasattr(pdf_file, "name") else "uploaded.pdf"
243
+ files = {
244
+ "file": (filename, io.BytesIO(pdf_bytes), "application/pdf")
245
+ }
246
  whisper_url = f"{UNSTRACT_BASE}/whisper"
247
  with st.spinner("Uploading and processing PDF with Unstract..."):
248
  r = requests.post(whisper_url, files=files, headers=headers)
 
256
 
257
  # Step 2: Poll /whisper-status until processed
258
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
259
+ for i in range(30): # Wait up to 60s (2s x 30)
260
  status_r = requests.get(status_url, headers=headers)
261
  if status_r.status_code != 200:
262
  st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
 
276
  if r.status_code != 200:
277
  st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
278
  return None
279
+ # Unstract sometimes returns JSON, sometimes raw text
280
+ try:
281
+ data = r.json()
282
+ return data.get("result_text") or r.text
283
+ except Exception:
284
+ return r.text
285
 
286
  # --------- INVOICE EXTRACTOR UI ---------
287
  st.title("Invoice Extractor")