| import re |
| import sys |
| from pathlib import Path |
| import fitz |
| from pptx import Presentation |
| import io |
|
|
|
|
def extract_pdf(data: bytes) -> str:
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        The text of all pages, in page order, joined with two newlines,
        with leading and trailing whitespace removed. The result is an
        empty string when no page yields any text.
    """
    doc = fitz.open(stream=data, filetype="pdf")
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Release the document even when extraction fails part-way through.
        doc.close()
    return "\n\n".join(pages).strip()
|
|
|
|
def extract_pptx(data: bytes) -> str:
    """Extract the text of an in-memory PPTX presentation, slide by slide.

    Only shapes that report a text frame are read. Each paragraph is
    stripped and empty paragraphs are dropped. Slides without any text are
    omitted entirely; slide numbers are 1-based positions in the deck.

    Args:
        data: Raw bytes of the PPTX file.

    Returns:
        One section per non-empty slide, each starting with a
        ``[Slide N]`` header line followed by its paragraphs; sections are
        joined with two newlines.
    """
    presentation = Presentation(io.BytesIO(data))
    sections = []
    for number, slide in enumerate(presentation.slides, start=1):
        stripped = (
            paragraph.text.strip()
            for shape in slide.shapes
            if shape.has_text_frame
            for paragraph in shape.text_frame.paragraphs
        )
        lines = [line for line in stripped if line]
        if not lines:
            continue
        sections.append("\n".join([f"[Slide {number}]", *lines]))
    return "\n\n".join(sections).strip()
|
|
|
|
# Dispatch table: lower-case file suffix (leading dot included) mapped to the
# function that turns that file type's raw bytes into text.
EXTRACTORS = {".pdf": extract_pdf, ".pptx": extract_pptx}
|
|
| |
|
|
def extract_document(file_path: str) -> str:
    """Read a document from disk and return its cleaned, truncated text.

    The extractor is chosen from ``EXTRACTORS`` by the lower-cased file
    suffix. The extracted text is then passed through ``clean`` and
    ``truncate`` (with its default limit).

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text after whitespace clean-up and truncation.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the suffix has no registered extractor, or if the
            extractor returns no text.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"ไม่พบไฟล์: {file_path}")

    ext = path.suffix.lower()
    extractor = EXTRACTORS.get(ext)
    if extractor is None:
        raise ValueError(
            f"ไม่รองรับไฟล์ประเภท '{ext}' รองรับเฉพาะ PDF, PPTX"
        )

    text = extractor(path.read_bytes())
    if not text:
        # Typically a PDF with no text layer; the extractors return "" then.
        raise ValueError(
            "ไม่พบข้อความ / ไม่รองรับ PDF ที่ไม่มีข้อความ"
        )

    return truncate(clean(text))
|
|
|
|
| |
|
|
def clean(text: str) -> str:
    """Collapse excess blank lines and repeated spaces in ``text``.

    Args:
        text: The text to normalise.

    Returns:
        ``text`` with every run of three or more newlines reduced to two,
        every run of two or more spaces reduced to one, and leading and
        trailing whitespace stripped.
    """
    substitutions = (
        (r"\n{3,}", "\n\n"),  # three or more newlines -> exactly two
        (r" {2,}", " "),  # two or more spaces -> exactly one
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
|
|
|
def truncate(text: str, max_chars: int = 12_000) -> str:
    """Shorten ``text`` to at most ``max_chars`` characters plus a notice.

    Text that already fits is returned unchanged. Otherwise the text is cut
    at the last newline found within the first ``max_chars`` characters
    (or at ``max_chars`` itself when there is no newline past index 0),
    and a truncation notice is appended after the cut.

    Args:
        text: The text to shorten.
        max_chars: Maximum number of characters of ``text`` to keep.

    Returns:
        ``text`` itself, or its shortened prefix followed by the notice.
    """
    if len(text) <= max_chars:
        return text

    notice = "\n\n[... ข้อความถูกตัดเนื่องจากเกินลิมิต]"
    # Prefer ending on a line boundary so the cut does not split a line.
    newline_at = text.rfind("\n", 0, max_chars)
    end = newline_at if newline_at > 0 else max_chars
    return text[:end] + notice
|
|
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |