Spaces:
Configuration error
Configuration error
| """ | |
| Parse PDF, DOCX, and PPTX files into page‑level text dicts. | |
| Returns: List[{"text": str, "page": int | None}] | |
| """ | |
| import logging | |
| from typing import List, Dict, Any | |
| logger = logging.getLogger(__name__) | |
| def parse_pdf(path: str) -> List[Dict[str, Any]]: | |
| import pdfplumber | |
| pages: List[Dict[str, Any]] = [] | |
| with pdfplumber.open(path) as pdf: | |
| for i, page in enumerate(pdf.pages): | |
| text = page.extract_text() or "" | |
| # Also try extracting tables | |
| tables = page.extract_tables() or [] | |
| for table in tables: | |
| for row in table: | |
| if row: | |
| text += "\n" + " | ".join(str(cell or "") for cell in row) | |
| pages.append({"text": text, "page": i + 1}) | |
| logger.info("Parsed PDF %s: %d pages", path, len(pages)) | |
| return pages | |
| def parse_docx(path: str) -> List[Dict[str, Any]]: | |
| from docx import Document | |
| doc = Document(path) | |
| full_text = [] | |
| for para in doc.paragraphs: | |
| if para.text.strip(): | |
| full_text.append(para.text) | |
| # Also extract text from tables | |
| for table in doc.tables: | |
| for row in table.rows: | |
| row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip()) | |
| if row_text: | |
| full_text.append(row_text) | |
| combined = "\n".join(full_text) | |
| logger.info("Parsed DOCX %s: %d chars", path, len(combined)) | |
| return [{"text": combined, "page": None}] | |
| def parse_pptx(path: str) -> List[Dict[str, Any]]: | |
| from pptx import Presentation | |
| prs = Presentation(path) | |
| pages: List[Dict[str, Any]] = [] | |
| for i, slide in enumerate(prs.slides): | |
| texts = [] | |
| for shape in slide.shapes: | |
| if shape.has_text_frame: | |
| for paragraph in shape.text_frame.paragraphs: | |
| text = paragraph.text.strip() | |
| if text: | |
| texts.append(text) | |
| if shape.has_table: | |
| for row in shape.table.rows: | |
| row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip()) | |
| if row_text: | |
| texts.append(row_text) | |
| page_text = "\n".join(texts) | |
| if page_text: | |
| pages.append({"text": page_text, "page": i + 1}) | |
| logger.info("Parsed PPTX %s: %d slides", path, len(pages)) | |
| return pages |