Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,16 +12,22 @@ def upload_pdf_to_unstract(pdf_file):
|
|
| 12 |
headers = {
|
| 13 |
"unstract-key": UNSTRACT_API_KEY,
|
| 14 |
}
|
| 15 |
-
# Always reset file pointer
|
| 16 |
pdf_file.seek(0)
|
| 17 |
file_bytes = pdf_file.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
files = {
|
| 19 |
-
"file": (
|
| 20 |
-
pdf_file.name if pdf_file.name.lower().endswith(".pdf") else "invoice.pdf",
|
| 21 |
-
file_bytes,
|
| 22 |
-
"application/pdf"
|
| 23 |
-
),
|
| 24 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
with st.spinner("Uploading and starting OCR..."):
|
| 26 |
resp = requests.post(url, headers=headers, files=files)
|
| 27 |
if resp.status_code not in (200, 202):
|
|
@@ -73,15 +79,12 @@ st.title("Unstract OCR: PDF Invoice Text Extraction")
|
|
| 73 |
uploaded_pdf = st.file_uploader("Upload Invoice PDF", type="pdf")
|
| 74 |
|
| 75 |
if st.button("Extract Text from PDF") and uploaded_pdf:
|
| 76 |
-
# Step 1: Upload PDF and get whisper_hash
|
| 77 |
whisper_hash = upload_pdf_to_unstract(uploaded_pdf)
|
| 78 |
if not whisper_hash:
|
| 79 |
st.stop()
|
| 80 |
st.success(f"File accepted. Tracking hash: {whisper_hash}")
|
| 81 |
|
| 82 |
-
# Step 2: Poll until processed
|
| 83 |
if poll_until_processed(whisper_hash):
|
| 84 |
-
# Step 3: Retrieve text
|
| 85 |
text = retrieve_text(whisper_hash)
|
| 86 |
if text:
|
| 87 |
st.success("Text extraction complete!")
|
|
|
|
| 12 |
headers = {
|
| 13 |
"unstract-key": UNSTRACT_API_KEY,
|
| 14 |
}
|
| 15 |
+
# Always reset file pointer
|
| 16 |
pdf_file.seek(0)
|
| 17 |
file_bytes = pdf_file.read()
|
| 18 |
+
|
| 19 |
+
# Force correct .pdf extension and type
|
| 20 |
+
file_name = getattr(pdf_file, "name", None)
|
| 21 |
+
if not file_name or not file_name.lower().endswith(".pdf"):
|
| 22 |
+
file_name = "invoice.pdf"
|
| 23 |
+
|
| 24 |
files = {
|
| 25 |
+
"file": (file_name, file_bytes, "application/pdf"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
| 27 |
+
|
| 28 |
+
# Debug print for troubleshooting
|
| 29 |
+
# st.write("Uploading file with name:", file_name)
|
| 30 |
+
|
| 31 |
with st.spinner("Uploading and starting OCR..."):
|
| 32 |
resp = requests.post(url, headers=headers, files=files)
|
| 33 |
if resp.status_code not in (200, 202):
|
|
|
|
| 79 |
uploaded_pdf = st.file_uploader("Upload Invoice PDF", type="pdf")
|
| 80 |
|
| 81 |
if st.button("Extract Text from PDF") and uploaded_pdf:
|
|
|
|
| 82 |
whisper_hash = upload_pdf_to_unstract(uploaded_pdf)
|
| 83 |
if not whisper_hash:
|
| 84 |
st.stop()
|
| 85 |
st.success(f"File accepted. Tracking hash: {whisper_hash}")
|
| 86 |
|
|
|
|
| 87 |
if poll_until_processed(whisper_hash):
|
|
|
|
| 88 |
text = retrieve_text(whisper_hash)
|
| 89 |
if text:
|
| 90 |
st.success("Text extraction complete!")
|