Seth0330 committed on
Commit
2682cc6
·
verified ·
1 Parent(s): b3c1ec9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -15
app.py CHANGED
@@ -4,8 +4,9 @@ import json
4
  import re
5
  import os
6
  import time
 
7
 
8
- from main import extract_key_phrases, score_sentences, summarize_text # Optional, if you use these
9
 
10
  st.set_page_config(page_title="PDF Tools", layout="wide")
11
 
@@ -231,29 +232,42 @@ def extract_invoice_info(model_choice, text):
231
  itm.setdefault(k, None)
232
  return {"invoice_header": hdr, "line_items": items}
233
 
234
- # --------- UNSTRACT API PDF-TO-TEXT HELPER ---------
 
 
 
 
 
 
 
 
 
 
 
235
  UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
236
  UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set this in your environment!
237
 
238
- def extract_text_from_pdf_unstract(pdf_file):
 
 
 
 
239
  headers = {
240
  "unstract-key": UNSTRACT_API_KEY,
241
- "Content-Type": "text/plain", # Matches your working Postman code!
242
  }
243
- pdf_bytes = pdf_file.read()
244
  url = f"{UNSTRACT_BASE}/whisper"
245
 
246
- with st.spinner("Uploading and processing PDF with Unstract..."):
247
- r = requests.post(url, headers=headers, data=pdf_bytes)
248
  if r.status_code != 202:
249
- st.error(f"Unstract: Error uploading PDF: {r.status_code} - {r.text}")
250
  return None
251
  whisper_hash = r.json().get("whisper_hash")
252
  if not whisper_hash:
253
  st.error("Unstract: No whisper_hash received.")
254
  return None
255
 
256
- # Step 2: Poll /whisper-status until processed
257
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
258
  for i in range(30): # Wait up to 60s (2s x 30)
259
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
@@ -269,7 +283,6 @@ def extract_text_from_pdf_unstract(pdf_file):
269
  st.error("Unstract: Timeout waiting for OCR to finish.")
270
  return None
271
 
272
- # Step 3: GET /whisper-retrieve?whisper_hash=...&text_only=true
273
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
274
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
275
  if r.status_code != 200:
@@ -282,14 +295,17 @@ def extract_text_from_pdf_unstract(pdf_file):
282
  return r.text
283
 
284
  # --------- INVOICE EXTRACTOR UI ---------
285
- st.title("Invoice Extractor")
286
  mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
287
- inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
 
 
 
288
  extracted_info = None
289
 
290
- if st.button("Extract") and inv_pdf:
291
- with st.spinner("Extracting text from PDF using Unstract..."):
292
- text = extract_text_from_pdf_unstract(inv_pdf)
293
  if text:
294
  extracted_info = extract_invoice_info(mdl, text)
295
  if extracted_info:
 
4
  import re
5
  import os
6
  import time
7
+ import mimetypes
8
 
9
+ from main import extract_key_phrases, score_sentences, summarize_text # Optional
10
 
11
  st.set_page_config(page_title="PDF Tools", layout="wide")
12
 
 
232
  itm.setdefault(k, None)
233
  return {"invoice_header": hdr, "line_items": items}
234
 
235
+ # --------- File type/content-type detection ---------
236
def get_content_type(filename):
    """Return the Content-Type header value to send for an uploaded file.

    PDFs are deliberately reported as ``text/plain`` rather than
    ``application/pdf``; the original code marks this as an Unstract quirk.
    Every other file gets the MIME type guessed from its name, falling back
    to ``application/octet-stream`` when nothing can be guessed.

    Args:
        filename: Name of the uploaded file (only the extension matters).
            An empty or missing name is treated as an unknown type.

    Returns:
        The MIME type string to use as the ``Content-Type`` header.
    """
    fallback = "application/octet-stream"
    if not filename:
        # No name to inspect, so treat it as opaque binary instead of crashing.
        return fallback

    # splitext only reports a real extension, so an extensionless file that
    # happens to be called "pdf" is no longer mistaken for a PDF document.
    _, ext = os.path.splitext(filename)
    if ext.lower() == ".pdf":
        return "text/plain"

    mime, _ = mimetypes.guess_type(filename)
    return mime if mime is not None else fallback
245
+
246
+ # --------- UNSTRACT API Multi-file PDF/Doc/Image-to-Text ---------
247
  UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
248
  UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set this in your environment!
249
 
250
+ def extract_text_from_unstract(uploaded_file):
251
+ filename = getattr(uploaded_file, "name", "uploaded_file")
252
+ file_bytes = uploaded_file.read()
253
+ content_type = get_content_type(filename)
254
+
255
  headers = {
256
  "unstract-key": UNSTRACT_API_KEY,
257
+ "Content-Type": content_type,
258
  }
 
259
  url = f"{UNSTRACT_BASE}/whisper"
260
 
261
+ with st.spinner("Uploading and processing document with Unstract..."):
262
+ r = requests.post(url, headers=headers, data=file_bytes)
263
  if r.status_code != 202:
264
+ st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
265
  return None
266
  whisper_hash = r.json().get("whisper_hash")
267
  if not whisper_hash:
268
  st.error("Unstract: No whisper_hash received.")
269
  return None
270
 
 
271
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
272
  for i in range(30): # Wait up to 60s (2s x 30)
273
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
 
283
  st.error("Unstract: Timeout waiting for OCR to finish.")
284
  return None
285
 
 
286
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
287
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
288
  if r.status_code != 200:
 
295
  return r.text
296
 
297
  # --------- INVOICE EXTRACTOR UI ---------
298
+ st.title("Invoice/Document Extractor")
299
  mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
300
+ inv_file = st.file_uploader(
301
+ "Invoice or Document File",
302
+ type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"]
303
+ )
304
  extracted_info = None
305
 
306
+ if st.button("Extract") and inv_file:
307
+ with st.spinner("Extracting text from document using Unstract..."):
308
+ text = extract_text_from_unstract(inv_file)
309
  if text:
310
  extracted_info = extract_invoice_info(mdl, text)
311
  if extracted_info: