Spaces:
Running
Running
File size: 1,233 Bytes
import pdfplumber
from io import BytesIO
import logging
logger = logging.getLogger(__name__)
def _table_to_grid(table):
    """Render one extracted table as a pipe-delimited grid, one row per line."""
    grid_rows = []
    for row in table:
        # Falsy cells (None or "") become empty strings. Newlines inside a
        # cell are folded to spaces so a multi-line cell cannot split its row
        # across several output lines and corrupt the grid.
        cells = [
            str(cell).strip().replace("\n", " ") if cell else ""
            for cell in row
        ]
        grid_rows.append(" | ".join(cells))
    return "\n".join(grid_rows)


def parse_pdf_layout(content: bytes):
    """
    Extract per-page text and tables from a PDF using pdfplumber.

    The extracted text is passed through unmodified (no translation or
    normalization), so mixed Filipino/English (Taglish) content is kept as-is.

    Args:
        content: Raw bytes of the PDF document.

    Returns:
        A list with one dict per page, in page order:
            "page_number": 1-based page index.
            "text": result of ``page.extract_text()``, or "" when it is falsy.
            "tables": list of strings, one per non-empty table; each string
                has one line per table row with cells joined by " | ".

    Raises:
        Exception: any error raised while opening or parsing the PDF is
            logged (with traceback) and re-raised unchanged.
    """
    try:
        results = []
        with pdfplumber.open(BytesIO(content)) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                tables = page.extract_tables()
                # Flatten tables into Markdown-like grid strings for better
                # semantic grouping; empty tables are skipped.
                formatted_tables = [
                    _table_to_grid(table) for table in tables if table
                ]
                results.append({
                    "page_number": page_number,
                    "text": text or "",
                    "tables": formatted_tables,
                })
        return results
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback,
        # which a plain logger.error call would drop.
        logger.exception("Error parsing PDF: %s", e)
        raise
|