"""Extract plain text from PDF and PPTX files."""

import io
import re
import sys
from pathlib import Path

import fitz  # pdf
from pptx import Presentation  # pptx


def extract_pdf(data: bytes) -> str:
    """Return the text of every page in a PDF.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        The text of each page joined by a blank line, with leading and
        trailing whitespace stripped. Empty if no page yields any text.
    """
    doc = fitz.open(stream=data, filetype="pdf")
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()
    return "\n\n".join(pages).strip()


def extract_pptx(data: bytes) -> str:
    """Return the text of every slide in a PPTX presentation.

    Only shapes that have a text frame are read. Each slide that contains
    at least one non-blank paragraph is emitted as a ``[Slide N]`` header
    followed by its paragraphs, one per line; slides are numbered from 1.

    Args:
        data: Raw bytes of the PPTX file.

    Returns:
        The slide sections joined by a blank line, stripped. Empty if no
        slide contains text.
    """
    prs = Presentation(io.BytesIO(data))
    slides = []
    for i, slide in enumerate(prs.slides, 1):
        texts = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                line = para.text.strip()
                if line:
                    texts.append(line)
        # Slides without any text are skipped entirely (no empty header).
        if texts:
            slides.append(f"[Slide {i}]\n" + "\n".join(texts))
    return "\n\n".join(slides).strip()


# Maps a lower-cased file suffix to the function that extracts its text.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".pptx": extract_pptx,
}


# wrapper
def extract_document(file_path: str) -> str:
    """Extract, clean and truncate the text of a supported document.

    Args:
        file_path: Path to a ``.pdf`` or ``.pptx`` file (suffix matching is
            case-insensitive).

    Returns:
        The extracted text after ``clean`` and ``truncate`` were applied.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing
            regular file.
        ValueError: If the suffix is not in ``EXTRACTORS``, or if the
            document yields no text.
    """
    path = Path(file_path)
    # is_file() rather than exists(): a directory cannot be read as a
    # document and would otherwise fail later with an unrelated OSError.
    if not path.is_file():
        raise FileNotFoundError(f"ไม่พบไฟล์: {file_path}")

    ext = path.suffix.lower()
    if ext not in EXTRACTORS:
        raise ValueError(
            f"ไม่รองรับไฟล์ประเภท '{ext}' รองรับเฉพาะ PDF, PPTX"
        )

    data = path.read_bytes()
    text = EXTRACTORS[ext](data)
    if not text:
        raise ValueError(
            "ไม่พบข้อความ / ไม่รองรับ PDF ที่ไม่มีข้อความ"
        )

    return truncate(clean(text))


# util functions
def clean(text: str) -> str:
    """Normalise whitespace in extracted text.

    Runs of three or more newlines become exactly two, runs of two or more
    spaces become one, and the result is stripped at both ends.
    """
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip()


def truncate(text: str, max_chars: int = 12_000) -> str:
    """Shorten ``text`` to roughly ``max_chars`` characters.

    Text that already fits is returned unchanged. Otherwise the text is cut
    at the last newline within the first ``max_chars`` characters (or hard
    at ``max_chars`` when there is no usable newline) and a truncation
    notice is appended.

    Args:
        text: The text to shorten.
        max_chars: Maximum number of characters of ``text`` to keep.

    Returns:
        ``text`` itself, or the kept prefix followed by the notice. The
        notice is added on top of the kept prefix, so a truncated result
        may be longer than ``max_chars``.
    """
    if len(text) <= max_chars:
        return text

    head = text[:max_chars]
    cut = head.rfind("\n")
    if cut > 0:
        head = head[:cut]
    # Drop trailing whitespace so a paragraph break at the cut point does
    # not stack with the two newlines that precede the notice.
    return head.rstrip() + "\n\n[... ข้อความถูกตัดเนื่องจากเกินลิมิต]"


# Disabled command-line entry point (left commented out).
# if __name__ == "__main__":
#     if len(sys.argv) < 2:
#         print("usage: python extract.py ")
#         sys.exit(1)
#     try:
#         result = extract_document(sys.argv[1])
#         print(f"\n── ผลลัพธ์ ({len(result):,} chars) ──\n")
#         print(result[:500], "..." if len(result) > 500 else "")
#     except (FileNotFoundError, ValueError) as e:
#         print(f"error: {e}")
#         sys.exit(1)