"""Best-effort plain-text extraction from text, DOCX, PDF and image files."""

import logging
import os
from typing import List, Tuple

# The format-specific backends are imported defensively so that a missing
# optional package only disables the formats that need it instead of making
# the whole module unimportable.
try:
    import pdfplumber
except ImportError:  # PDF support unavailable
    pdfplumber = None

try:
    from docx import Document as DocxDocument
except ImportError:  # DOCX support unavailable
    DocxDocument = None

try:
    from PIL import Image
except ImportError:  # image loading unavailable
    Image = None

try:
    import pytesseract
except ImportError:  # OCR unavailable
    pytesseract = None

logger = logging.getLogger(__name__)

# Lower-case file extensions (with leading dot) handled by each reader.
TEXT_EXT = {".txt", ".md", ".csv"}
DOCX_EXT = {".docx"}
PDF_EXT = {".pdf"}
IMG_EXT = {".png", ".jpg", ".jpeg", ".webp"}


def _require(backend, package: str):
    """Return *backend*, or raise RuntimeError if its import failed."""
    if backend is None:
        raise RuntimeError(f"{package} is not installed")
    return backend


def _read_text_file(path: str) -> str:
    """Return the contents of *path* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).
    """
    # Use a context manager so the handle is closed deterministically
    # rather than whenever the garbage collector gets to it.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()


def _read_docx(path: str) -> str:
    """Return the paragraph texts of a DOCX file joined by newlines."""
    document = _require(DocxDocument, "python-docx")(path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def _read_pdf(path: str) -> str:
    """Return the extracted text of every page of a PDF joined by newlines.

    A page for which ``extract_text()`` returns a falsy value contributes
    an empty string.
    """
    with _require(pdfplumber, "pdfplumber").open(path) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def _read_image_ocr(path: str) -> str:
    """Return the text pytesseract recognises in the image at *path*."""
    ocr = _require(pytesseract, "pytesseract")
    # Close the image file as soon as OCR has finished with it.
    with _require(Image, "Pillow").open(path) as img:
        return ocr.image_to_string(img)


def extract_text_from_files(filepaths: List[str]) -> List[Tuple[str, str]]:
    """Extract text from each supported file in *filepaths*.

    The reader is chosen by the case-insensitive file extension. Files
    with an unsupported extension, files whose text is empty or
    whitespace-only, and files whose reader raises are skipped; the
    latter are reported through the module logger.

    Args:
        filepaths: Paths of the files to process.

    Returns:
        ``(basename, text)`` tuples, in input order, for every file that
        yielded non-blank text.
    """
    results = []
    for fp in filepaths:
        _, ext = os.path.splitext(fp.lower())
        if ext in TEXT_EXT:
            reader = _read_text_file
        elif ext in DOCX_EXT:
            reader = _read_docx
        elif ext in PDF_EXT:
            reader = _read_pdf
        elif ext in IMG_EXT:
            reader = _read_image_ocr
        else:
            # Unsupported extension: nothing to extract.
            continue

        try:
            txt = reader(fp)
        except Exception as exc:
            # Deliberately best-effort: one unreadable file must not abort
            # the batch, but the failure should not vanish without a trace.
            logger.warning(
                "Skipping %s: %s: %s", fp, type(exc).__name__, exc
            )
            continue

        if txt and txt.strip():
            results.append((os.path.basename(fp), txt))
    return results