# tank_poc/app/tools/presentation_handler.py
# SUMANA SUMANAKUL (ING)
# first commit
# a612f32
import re
import sys
from pathlib import Path
import fitz # pdf
from pptx import Presentation # pptx
import io
def extract_pdf(data: bytes) -> str:
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        The text of all pages joined by blank lines, with leading and
        trailing whitespace stripped. An empty string is returned when
        no page yields any text.
    """
    # The context manager closes the document even when extracting a page
    # raises, so the underlying handle is never leaked.
    with fitz.open(stream=data, filetype="pdf") as doc:
        pages = [page.get_text() for page in doc]
    return "\n\n".join(pages).strip()
def extract_pptx(data: bytes) -> str:
    """Collect the text of each slide of an in-memory PPTX presentation.

    Args:
        data: Raw bytes of the PPTX file.

    Returns:
        One section per slide that contains text, each starting with a
        ``[Slide N]`` header (N counts from 1) followed by the slide's
        non-empty paragraphs, one per line. Sections are separated by a
        blank line. Slides without any text are omitted.
    """
    deck = Presentation(io.BytesIO(data))
    sections = []
    for number, slide in enumerate(deck.slides, start=1):
        # Only shapes that carry a text frame contribute paragraphs.
        stripped = (
            paragraph.text.strip()
            for shape in slide.shapes
            if shape.has_text_frame
            for paragraph in shape.text_frame.paragraphs
        )
        lines = [entry for entry in stripped if entry]
        if not lines:
            continue
        sections.append(f"[Slide {number}]\n" + "\n".join(lines))
    return "\n\n".join(sections).strip()
# Dispatch table: maps a lowercase file suffix (dot included) to the function
# that turns that file type's raw bytes into text.
EXTRACTORS = {
".pdf": extract_pdf,
".pptx": extract_pptx,
}
# Public wrapper: validates the path and dispatches on the file extension.
def extract_document(file_path: str) -> str:
    """Extract, clean and truncate the text of a PDF or PPTX file.

    Args:
        file_path: Path of the document on disk.

    Returns:
        The extracted text after ``clean`` and ``truncate`` were applied.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing
            file (this includes directories).
        ValueError: If the file extension has no entry in ``EXTRACTORS``,
            or if the extractor returned no text.
    """
    path = Path(file_path)
    # is_file() rejects directories as well as missing paths, so a folder
    # named e.g. "deck.pdf" fails here instead of inside read_bytes().
    if not path.is_file():
        raise FileNotFoundError(f"ไม่พบไฟล์: {file_path}")

    ext = path.suffix.lower()
    extractor = EXTRACTORS.get(ext)
    if extractor is None:
        raise ValueError(
            f"ไม่รองรับไฟล์ประเภท '{ext}' รองรับเฉพาะ PDF, PPTX"
        )

    text = extractor(path.read_bytes())
    if not text:
        # NOTE(review): the message mentions PDF only, but this branch is
        # also reached for a PPTX file that contains no text.
        raise ValueError(
            "ไม่พบข้อความ / ไม่รอบรับ PDF ที่ไม่มีข้อความ"
        )
    return truncate(clean(text))
# Text post-processing helpers used by extract_document.
def clean(text: str) -> str:
    """Normalize whitespace in extracted text.

    Runs of three or more newlines are reduced to a single blank line,
    runs of two or more spaces are reduced to one space, and the result
    is stripped of leading and trailing whitespace.
    """
    without_blank_runs = re.sub(r"\n{3,}", "\n\n", text)
    single_spaced = re.sub(r" {2,}", " ", without_blank_runs)
    return single_spaced.strip()
def truncate(text: str, max_chars: int = 12_000) -> str:
    """Shorten ``text`` to at most ``max_chars`` characters plus a notice.

    Text that already fits is returned unchanged. Otherwise the text is
    cut at the last newline found within the first ``max_chars``
    characters (or at ``max_chars`` itself when there is no newline past
    index 0), and a truncation notice is appended.
    """
    if len(text) > max_chars:
        last_newline = text[:max_chars].rfind("\n")
        # A newline at index 0 (or none at all) would leave nothing useful,
        # so fall back to a hard cut at the limit.
        end = last_newline if last_newline > 0 else max_chars
        return text[:end] + "\n\n[... ข้อความถูกตัดเนื่องจากเกินลิมิต]"
    return text
# if __name__ == "__main__":
# if len(sys.argv) < 2:
# print("usage: python extract.py <file.pdf|file.pptx>")
# sys.exit(1)
# try:
# result = extract_document(sys.argv[1])
# print(f"\n── ผลลัพธ์ ({len(result):,} chars) ──\n")
# print(result[:500], "..." if len(result) > 500 else "")
# except (FileNotFoundError, ValueError) as e:
# print(f"error: {e}")
# sys.exit(1)