File size: 2,422 Bytes
419bd6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
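Parse PDF, DOCX, and PPTX files into page-level text dicts.
Returns: List[{"text": str, "page": int | None}]
"page" is the 1-based page/slide number for PDF and PPTX; a DOCX file is
returned as a single entry with "page" set to None.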
"""

import logging
from typing import List, Dict, Any

logger = logging.getLogger(__name__)


def parse_pdf(path: str) -> List[Dict[str, Any]]:
    """Extract the text of every page of a PDF, including flattened tables.

    For each page the plain text from ``extract_text()`` comes first, followed
    by one line per table row with the cells joined by ``" | "``. Empty cells
    inside a row are kept (as empty strings) so column positions stay aligned.

    Args:
        path: Location of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        One ``{"text": str, "page": int}`` dict per page, in document order,
        with ``page`` numbered from 1. Pages without any text are still
        included, with ``text`` set to ``""``.
    """
    import pdfplumber

    pages: List[Dict[str, Any]] = []
    with pdfplumber.open(path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text() may return None; normalise that to "".
            parts = [page.extract_text() or ""]
            for table in page.extract_tables() or []:
                for row in table:
                    if not row:
                        continue
                    cells = [str(cell or "") for cell in row]
                    # A row made only of None/blank cells would contribute a
                    # line of bare separators (" |  | "), so drop it.
                    if any(cell.strip() for cell in cells):
                        parts.append(" | ".join(cells))
            # Joining once avoids rebuilding the page string for every row.
            pages.append({"text": "\n".join(parts), "page": page_number})
    logger.info("Parsed PDF %s: %d pages", path, len(pages))
    return pages


def parse_docx(path: str) -> List[Dict[str, Any]]:
    """Extract the text of a DOCX file as a single page-less entry.

    Non-blank body paragraphs are collected first, then every table row is
    appended as one line with its non-empty cell texts joined by ``" | "``.
    Table rows therefore always follow the paragraphs in the output,
    regardless of where the table sits in the document.

    Args:
        path: Location of the DOCX file, passed straight to ``docx.Document``.

    Returns:
        A one-element list ``[{"text": str, "page": None}]``.
    """
    from docx import Document

    doc = Document(path)
    lines = [para.text for para in doc.paragraphs if para.text.strip()]

    for table in doc.tables:
        for row in table.rows:
            # python-docx reports a horizontally merged cell once per grid
            # column it spans, all backed by the same XML element. Track the
            # elements already seen in this row so merged text is emitted once.
            seen_elements = []
            cell_texts = []
            for cell in row.cells:
                element = getattr(cell, "_tc", cell)
                if any(element is known for known in seen_elements):
                    continue
                seen_elements.append(element)
                cell_text = cell.text.strip()
                if cell_text:
                    cell_texts.append(cell_text)
            if cell_texts:
                lines.append(" | ".join(cell_texts))

    combined = "\n".join(lines)
    logger.info("Parsed DOCX %s: %d chars", path, len(combined))
    return [{"text": combined, "page": None}]


def _walk_pptx_shapes(shapes):
    """Yield every shape in ``shapes``, descending into nested groups."""
    for shape in shapes:
        yield shape
        # Group shapes expose their members through a ``shapes`` collection;
        # duck-typing on it avoids touching ``shape_type``.
        children = getattr(shape, "shapes", None)
        if children is not None:
            yield from _walk_pptx_shapes(children)


def parse_pptx(path: str) -> List[Dict[str, Any]]:
    """Extract the text of each slide of a PPTX file.

    For every shape on a slide, including shapes nested inside groups, the
    non-blank paragraphs of its text frame are collected, and each table row
    is added as one line with its non-empty cell texts joined by ``" | "``.

    Args:
        path: Location of the PPTX file, passed straight to
            ``pptx.Presentation``.

    Returns:
        One ``{"text": str, "page": int}`` dict per slide that produced any
        text, with ``page`` set to the 1-based slide position. Slides with no
        text are omitted, so ``page`` values may have gaps.
    """
    from pptx import Presentation

    prs = Presentation(path)
    pages: List[Dict[str, Any]] = []
    for slide_number, slide in enumerate(prs.slides, start=1):
        lines = []
        for shape in _walk_pptx_shapes(slide.shapes):
            if shape.has_text_frame:
                for paragraph in shape.text_frame.paragraphs:
                    stripped = paragraph.text.strip()
                    if stripped:
                        lines.append(stripped)
            if shape.has_table:
                for row in shape.table.rows:
                    row_text = " | ".join(
                        cell.text.strip() for cell in row.cells if cell.text.strip()
                    )
                    if row_text:
                        lines.append(row_text)
        slide_text = "\n".join(lines)
        if slide_text:
            pages.append({"text": slide_text, "page": slide_number})
    # The count logged here is the number of slides that yielded text.
    logger.info("Parsed PPTX %s: %d slides", path, len(pages))
    return pages