import re
import sys
from pathlib import Path
import fitz # pdf
from pptx import Presentation # pptx
import io
def extract_pdf(data: bytes) -> str:
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        The text of all pages joined by blank lines, with leading and
        trailing whitespace stripped. Empty if no page yields any text.
    """
    # Using the document as a context manager closes it even when text
    # extraction raises part-way through the pages.
    with fitz.open(stream=data, filetype="pdf") as doc:
        pages = [page.get_text() for page in doc]
    return "\n\n".join(pages).strip()
def extract_pptx(data: bytes) -> str:
    """Extract the paragraph text of every slide of an in-memory PPTX.

    Each slide that contains text becomes one section headed by
    ``[Slide N]``, where N is the slide's 1-based position in the deck.
    Slides without any text are left out, but still count towards N.

    Args:
        data: Raw bytes of the PPTX file.

    Returns:
        The slide sections joined by blank lines and stripped.
    """
    presentation = Presentation(io.BytesIO(data))
    sections = []
    for number, slide in enumerate(presentation.slides, start=1):
        # Only shapes that carry a text frame contribute paragraphs.
        stripped_paragraphs = (
            paragraph.text.strip()
            for shape in slide.shapes
            if shape.has_text_frame
            for paragraph in shape.text_frame.paragraphs
        )
        lines = [entry for entry in stripped_paragraphs if entry]
        if not lines:
            continue
        sections.append(f"[Slide {number}]\n" + "\n".join(lines))
    return "\n\n".join(sections).strip()
# Dispatch table: file suffix (lower case, including the dot) -> function
# that turns the file's raw bytes into plain text.
EXTRACTORS = {".pdf": extract_pdf, ".pptx": extract_pptx}
# wrapper
def extract_document(file_path: str) -> str:
    """Extract, clean and truncate the text of a PDF or PPTX file.

    Args:
        file_path: Path to the document. The extractor is selected from the
            file suffix, compared in lower case.

    Returns:
        The extracted text after ``clean`` and then ``truncate`` were applied.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
        ValueError: If the suffix has no entry in ``EXTRACTORS``, or the
            extractor returned no text.
    """
    path = Path(file_path)
    # is_file() rather than exists(): a directory passes exists() and would
    # then fail inside read_bytes() with an OSError (e.g. IsADirectoryError)
    # instead of one of the two exception types documented above.
    if not path.is_file():
        raise FileNotFoundError(f"ไม่พบไฟล์: {file_path}")

    ext = path.suffix.lower()
    if ext not in EXTRACTORS:
        raise ValueError(
            f"ไม่รองรับไฟล์ประเภท '{ext}' รองรับเฉพาะ PDF, PPTX"
        )

    text = EXTRACTORS[ext](path.read_bytes())
    if not text:
        # NOTE(review): "รอบรับ" below looks like a typo for "รองรับ", and the
        # message names only PDF although a PPTX can also come back empty.
        # Left unchanged in case callers match on the text; confirm and fix.
        raise ValueError(
            "ไม่พบข้อความ / ไม่รอบรับ PDF ที่ไม่มีข้อความ"
        )
    return truncate(clean(text))
# util functions
def clean(text: str) -> str:
    """Normalise whitespace in extracted text.

    Runs of three or more newlines are reduced to exactly two, runs of two
    or more spaces are reduced to one space, and the result is stripped of
    leading and trailing whitespace.
    """
    fewer_breaks = re.sub(r"\n{3,}", "\n\n", text)
    fewer_spaces = re.sub(r" {2,}", " ", fewer_breaks)
    return fewer_spaces.strip()
def truncate(text: str, max_chars: int = 12_000) -> str:
    """Shorten ``text`` to about ``max_chars`` characters and mark the cut.

    Text that already fits is returned unchanged. Otherwise the first
    ``max_chars`` characters are kept, cut back to the last newline inside
    them when there is one past index 0, and a truncation marker is
    appended. The marker is added on top of the kept text, so the result
    can be longer than ``max_chars``.

    Args:
        text: The text to shorten.
        max_chars: Number of leading characters considered for keeping.

    Returns:
        ``text`` itself, or the kept prefix followed by the marker.
    """
    if len(text) <= max_chars:
        return text

    marker = "\n\n[... ข้อความถูกตัดเนื่องจากเกินลิมิต]"
    window = text[:max_chars]
    cut = window.rfind("\n")
    kept = window[:cut] if cut > 0 else window
    # Drop trailing whitespace so that a cut landing inside a paragraph break
    # does not leave three consecutive newlines in front of the marker.
    return kept.rstrip() + marker
# if __name__ == "__main__":
#     if len(sys.argv) < 2:
#         print("usage: python extract.py <file.pdf|file.pptx>")
#         sys.exit(1)
#     try:
#         result = extract_document(sys.argv[1])
#         print(f"\n── ผลลัพธ์ ({len(result):,} chars) ──\n")
#         print(result[:500], "..." if len(result) > 500 else "")
#     except (FileNotFoundError, ValueError) as e:
#         print(f"error: {e}")
#         sys.exit(1)