File size: 2,521 Bytes
a612f32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
import sys
from pathlib import Path
import fitz # pdf
from pptx import Presentation # pptx
import io


def extract_pdf(data: bytes) -> str:
    """Return the text of every page of an in-memory PDF.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        The text of all pages joined by a blank line, with leading and
        trailing whitespace stripped. Empty when no page yields any text.
    """
    # The context manager closes the document even when a page fails to
    # extract; a trailing close() call would be skipped on an exception.
    with fitz.open(stream=data, filetype="pdf") as doc:
        pages = [page.get_text() for page in doc]
    return "\n\n".join(pages).strip()


def extract_pptx(data: bytes) -> str:
    """Return the text of an in-memory PPTX deck, grouped by slide.

    Only shapes that report a text frame are read. Each paragraph is
    stripped and empty paragraphs are dropped. Slides that end up with no
    text are omitted, while the ``[Slide N]`` header keeps the slide's
    1-based position in the deck.

    Args:
        data: Raw bytes of the PPTX file.

    Returns:
        One section per slide that has text, sections separated by a blank
        line; an empty string when no slide has text.
    """
    deck = Presentation(io.BytesIO(data))
    sections = []
    for number, slide in enumerate(deck.slides, 1):
        stripped = (
            paragraph.text.strip()
            for shape in slide.shapes
            if shape.has_text_frame
            for paragraph in shape.text_frame.paragraphs
        )
        lines = [entry for entry in stripped if entry]
        if not lines:
            continue
        sections.append(f"[Slide {number}]\n" + "\n".join(lines))
    return "\n\n".join(sections).strip()


# Dispatch table: file suffix (lower case, leading dot included) mapped to
# the function that converts that file's raw bytes into plain text.
EXTRACTORS = {".pdf": extract_pdf, ".pptx": extract_pptx}

# wrapper

def extract_document(file_path: str) -> str:
    """Extract, clean and truncate the text of a PDF or PPTX file.

    The extractor is chosen from ``EXTRACTORS`` by the lower-cased file
    suffix; its output is passed through ``clean`` and then ``truncate``.

    Args:
        file_path: Path of the document on disk.

    Returns:
        The cleaned, length-limited text of the document.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file
            (missing path, or a directory).
        ValueError: If the suffix has no entry in ``EXTRACTORS``, or the
            extractor returned no text.
    """
    path = Path(file_path)

    # is_file() rather than exists(): a directory named like "deck.pptx"
    # would pass exists() and then fail inside read_bytes() with an OSError
    # that callers handling FileNotFoundError/ValueError do not expect.
    if not path.is_file():
        raise FileNotFoundError(f"ไม่พบไฟล์: {file_path}")

    ext = path.suffix.lower()
    if ext not in EXTRACTORS:
        raise ValueError(
            f"ไม่รองรับไฟล์ประเภท '{ext}' รองรับเฉพาะ PDF, PPTX"
        )

    data = path.read_bytes()
    text = EXTRACTORS[ext](data)

    # An empty result means the extractor found no text in the document.
    if not text:
        raise ValueError(
            "ไม่พบข้อความ / ไม่รองรับ PDF ที่ไม่มีข้อความ"
        )

    return truncate(clean(text))


# util functions

def clean(text: str) -> str:
    """Normalise whitespace in extracted text.

    Runs of three or more newlines collapse to exactly two, runs of two or
    more spaces collapse to one, and the result is stripped at both ends.

    Args:
        text: Raw extracted text.

    Returns:
        The normalised text.
    """
    # Newlines are collapsed first, then spaces, in that order.
    return re.sub(r" {2,}", " ", re.sub(r"\n{3,}", "\n\n", text)).strip()


def truncate(text: str, max_chars: int = 12_000) -> str:
    """Limit ``text`` to roughly ``max_chars`` characters.

    Text that already fits is returned unchanged. Longer text is cut at the
    last newline found inside the first ``max_chars`` characters, or at
    ``max_chars`` itself when there is no newline past index 0. A truncation
    notice is then appended, so the returned string can be longer than
    ``max_chars``.

    Args:
        text: Text to limit.
        max_chars: Size of the window in which the cut point is searched.

    Returns:
        The original text, or its leading part followed by the notice.
    """
    if len(text) > max_chars:
        window = text[:max_chars]
        newline_at = window.rfind("\n")
        # A newline at index 0 (or none at all) would leave nothing to keep,
        # so fall back to a hard cut at max_chars.
        if newline_at > 0:
            end = newline_at
        else:
            end = max_chars
        return text[:end] + "\n\n[... ข้อความถูกตัดเนื่องจากเกินลิมิต]"
    return text

# if __name__ == "__main__":
#     if len(sys.argv) < 2:
#         print("usage: python extract.py <file.pdf|file.pptx>")
#         sys.exit(1)

#     try:
#         result = extract_document(sys.argv[1])
#         print(f"\n── ผลลัพธ์ ({len(result):,} chars) ──\n")
#         print(result[:500], "..." if len(result) > 500 else "")
#     except (FileNotFoundError, ValueError) as e:
#         print(f"error: {e}")
#         sys.exit(1)