Spaces:
Runtime error
Runtime error
Rivalcoder
committed on
Commit
·
40c134d
1
Parent(s):
71a01ff
[Edit] Update Access
Browse files - pdf_parser.py +15 -2
pdf_parser.py
CHANGED
|
@@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 5 |
from PIL import Image
|
| 6 |
import pytesseract
|
| 7 |
import imghdr
|
|
|
|
| 8 |
|
| 9 |
def _extract_text(page):
|
| 10 |
text = page.get_text()
|
|
@@ -19,7 +20,7 @@ def extract_text_from_image_bytes(image_bytes):
|
|
| 19 |
|
| 20 |
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
|
| 21 |
"""
|
| 22 |
-
Download document (PDF or
|
| 23 |
Gracefully return fallback message if unsupported or failed.
|
| 24 |
"""
|
| 25 |
try:
|
|
@@ -30,6 +31,18 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
|
|
| 30 |
print(f"β Failed to download: {str(e)}")
|
| 31 |
return [f"No data found in this document (download error)"]
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# Check for unsupported content
|
| 34 |
if "zip" in content_type or url.endswith(".zip"):
|
| 35 |
return ["No data found in this document (zip)"]
|
|
@@ -46,7 +59,7 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
|
|
| 46 |
print(f"β OCR failed: {str(e)}")
|
| 47 |
return [f"No data found in this document (image/OCR error)"]
|
| 48 |
|
| 49 |
-
# Try PDF
|
| 50 |
try:
|
| 51 |
with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
|
| 52 |
pages = list(doc)
|
|
|
|
| 5 |
from PIL import Image
|
| 6 |
import pytesseract
|
| 7 |
import imghdr
|
| 8 |
+
from bs4 import BeautifulSoup # pip install beautifulsoup4
|
| 9 |
|
| 10 |
def _extract_text(page):
|
| 11 |
text = page.get_text()
|
|
|
|
| 20 |
|
| 21 |
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
|
| 22 |
"""
|
| 23 |
+
Download document (PDF, Image, or Webpage) from URL, extract text accordingly.
|
| 24 |
Gracefully return fallback message if unsupported or failed.
|
| 25 |
"""
|
| 26 |
try:
|
|
|
|
| 31 |
print(f"β Failed to download: {str(e)}")
|
| 32 |
return [f"No data found in this document (download error)"]
|
| 33 |
|
| 34 |
+
# Handle HTML webpages
|
| 35 |
+
if "text/html" in content_type or url.endswith(".html"):
|
| 36 |
+
print("π Detected HTML page. Extracting text...")
|
| 37 |
+
try:
|
| 38 |
+
soup = BeautifulSoup(content, "html.parser")
|
| 39 |
+
text = soup.get_text(separator="\n")
|
| 40 |
+
lines = [t.strip() for t in text.splitlines() if t.strip()]
|
| 41 |
+
return lines if lines else ["No data found in this document (empty HTML)"]
|
| 42 |
+
except Exception as e:
|
| 43 |
+
print(f"β HTML parse failed: {str(e)}")
|
| 44 |
+
return [f"No data found in this document (HTML error)"]
|
| 45 |
+
|
| 46 |
# Check for unsupported content
|
| 47 |
if "zip" in content_type or url.endswith(".zip"):
|
| 48 |
return ["No data found in this document (zip)"]
|
|
|
|
| 59 |
print(f"β OCR failed: {str(e)}")
|
| 60 |
return [f"No data found in this document (image/OCR error)"]
|
| 61 |
|
| 62 |
+
# Try PDF parsing
|
| 63 |
try:
|
| 64 |
with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
|
| 65 |
pages = list(doc)
|