Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,8 +4,9 @@ import json
|
|
| 4 |
import re
|
| 5 |
import os
|
| 6 |
import time
|
|
|
|
| 7 |
|
| 8 |
-
from main import extract_key_phrases, score_sentences, summarize_text # Optional
|
| 9 |
|
| 10 |
st.set_page_config(page_title="PDF Tools", layout="wide")
|
| 11 |
|
|
@@ -231,29 +232,42 @@ def extract_invoice_info(model_choice, text):
|
|
| 231 |
itm.setdefault(k, None)
|
| 232 |
return {"invoice_header": hdr, "line_items": items}
|
| 233 |
|
| 234 |
-
# ---------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
|
| 236 |
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set this in your environment!
|
| 237 |
|
| 238 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
headers = {
|
| 240 |
"unstract-key": UNSTRACT_API_KEY,
|
| 241 |
-
"Content-Type":
|
| 242 |
}
|
| 243 |
-
pdf_bytes = pdf_file.read()
|
| 244 |
url = f"{UNSTRACT_BASE}/whisper"
|
| 245 |
|
| 246 |
-
with st.spinner("Uploading and processing
|
| 247 |
-
r = requests.post(url, headers=headers, data=
|
| 248 |
if r.status_code != 202:
|
| 249 |
-
st.error(f"Unstract: Error uploading
|
| 250 |
return None
|
| 251 |
whisper_hash = r.json().get("whisper_hash")
|
| 252 |
if not whisper_hash:
|
| 253 |
st.error("Unstract: No whisper_hash received.")
|
| 254 |
return None
|
| 255 |
|
| 256 |
-
# Step 2: Poll /whisper-status until processed
|
| 257 |
status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
|
| 258 |
for i in range(30): # Wait up to 60s (2s x 30)
|
| 259 |
status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
|
@@ -269,7 +283,6 @@ def extract_text_from_pdf_unstract(pdf_file):
|
|
| 269 |
st.error("Unstract: Timeout waiting for OCR to finish.")
|
| 270 |
return None
|
| 271 |
|
| 272 |
-
# Step 3: GET /whisper-retrieve?whisper_hash=...&text_only=true
|
| 273 |
retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
|
| 274 |
r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 275 |
if r.status_code != 200:
|
|
@@ -282,14 +295,17 @@ def extract_text_from_pdf_unstract(pdf_file):
|
|
| 282 |
return r.text
|
| 283 |
|
| 284 |
# --------- INVOICE EXTRACTOR UI ---------
|
| 285 |
-
st.title("Invoice Extractor")
|
| 286 |
mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
| 288 |
extracted_info = None
|
| 289 |
|
| 290 |
-
if st.button("Extract") and
|
| 291 |
-
with st.spinner("Extracting text from
|
| 292 |
-
text =
|
| 293 |
if text:
|
| 294 |
extracted_info = extract_invoice_info(mdl, text)
|
| 295 |
if extracted_info:
|
|
|
|
| 4 |
import re
|
| 5 |
import os
|
| 6 |
import time
|
| 7 |
+
import mimetypes
|
| 8 |
|
| 9 |
+
from main import extract_key_phrases, score_sentences, summarize_text # Optional
|
| 10 |
|
| 11 |
st.set_page_config(page_title="PDF Tools", layout="wide")
|
| 12 |
|
|
|
|
| 232 |
itm.setdefault(k, None)
|
| 233 |
return {"invoice_header": hdr, "line_items": items}
|
| 234 |
|
| 235 |
+
# --------- File type/content-type detection ---------
|
| 236 |
+
def get_content_type(filename):
    """Return the Content-Type header value to use when uploading *filename*.

    Args:
        filename: Name of the uploaded file; only its extension is inspected.

    Returns:
        "text/plain" for names ending in ".pdf" (case-insensitive), the MIME
        type guessed by ``mimetypes`` for other recognised extensions, and
        "application/octet-stream" when the type cannot be guessed.
    """
    # Special case for PDF (Unstract quirk): PDFs are deliberately sent as
    # text/plain rather than application/pdf.
    # Match on the ".pdf" suffix so that an extension-less file that merely
    # happens to be named "pdf" is not mistaken for a PDF.
    if filename.lower().endswith(".pdf"):
        return "text/plain"

    mime, _ = mimetypes.guess_type(filename)
    if mime is None:
        # Unknown or missing extension: fall back to a generic binary type.
        return "application/octet-stream"
    return mime
|
| 245 |
+
|
| 246 |
+
# --------- UNSTRACT API Multi-file PDF/Doc/Image-to-Text ---------
|
| 247 |
UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
|
| 248 |
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set this in your environment!
|
| 249 |
|
| 250 |
+
def extract_text_from_unstract(uploaded_file):
|
| 251 |
+
filename = getattr(uploaded_file, "name", "uploaded_file")
|
| 252 |
+
file_bytes = uploaded_file.read()
|
| 253 |
+
content_type = get_content_type(filename)
|
| 254 |
+
|
| 255 |
headers = {
|
| 256 |
"unstract-key": UNSTRACT_API_KEY,
|
| 257 |
+
"Content-Type": content_type,
|
| 258 |
}
|
|
|
|
| 259 |
url = f"{UNSTRACT_BASE}/whisper"
|
| 260 |
|
| 261 |
+
with st.spinner("Uploading and processing document with Unstract..."):
|
| 262 |
+
r = requests.post(url, headers=headers, data=file_bytes)
|
| 263 |
if r.status_code != 202:
|
| 264 |
+
st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
|
| 265 |
return None
|
| 266 |
whisper_hash = r.json().get("whisper_hash")
|
| 267 |
if not whisper_hash:
|
| 268 |
st.error("Unstract: No whisper_hash received.")
|
| 269 |
return None
|
| 270 |
|
|
|
|
| 271 |
status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
|
| 272 |
for i in range(30): # Wait up to 60s (2s x 30)
|
| 273 |
status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
|
|
|
| 283 |
st.error("Unstract: Timeout waiting for OCR to finish.")
|
| 284 |
return None
|
| 285 |
|
|
|
|
| 286 |
retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
|
| 287 |
r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 288 |
if r.status_code != 200:
|
|
|
|
| 295 |
return r.text
|
| 296 |
|
| 297 |
# --------- INVOICE EXTRACTOR UI ---------
|
| 298 |
+
st.title("Invoice/Document Extractor")
|
| 299 |
mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
|
| 300 |
+
inv_file = st.file_uploader(
|
| 301 |
+
"Invoice or Document File",
|
| 302 |
+
type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"]
|
| 303 |
+
)
|
| 304 |
extracted_info = None
|
| 305 |
|
| 306 |
+
if st.button("Extract") and inv_file:
|
| 307 |
+
with st.spinner("Extracting text from document using Unstract..."):
|
| 308 |
+
text = extract_text_from_unstract(inv_file)
|
| 309 |
if text:
|
| 310 |
extracted_info = extract_invoice_info(mdl, text)
|
| 311 |
if extracted_info:
|