Spaces:
Sleeping
Sleeping
| \ | |
| import os | |
| from typing import List, Tuple | |
| import pdfplumber | |
| from docx import Document as DocxDocument | |
| from PIL import Image | |
| import pytesseract | |
# Lower-case file extensions (including the leading dot), one set per kind of
# file. Lookups against these sets are expected to use a lower-cased extension.
TEXT_EXT = {".csv", ".md", ".txt"}
DOCX_EXT = {".docx"}
PDF_EXT = {".pdf"}
IMG_EXT = {".jpeg", ".jpg", ".png", ".webp"}
def _read_text_file(path: str) -> str:
    """Return the full contents of a plain-text file.

    The file is decoded as UTF-8. Byte sequences that are not valid UTF-8 are
    dropped (``errors="ignore"``) instead of raising ``UnicodeDecodeError``.

    Args:
        path: Filesystem path of the file to read.

    Returns:
        The decoded text of the whole file (``""`` for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle as soon as the read finishes,
    # rather than leaving it open until the file object is garbage-collected.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
def _read_docx(path: str) -> str:
    """Return the text of a .docx file, one paragraph per line.

    The document is opened with ``DocxDocument`` and the ``text`` of each entry
    in its ``paragraphs`` collection is joined with newline characters. Only
    what ``paragraphs`` exposes is included in the result.
    """
    document = DocxDocument(path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def _read_pdf(path: str) -> str:
    """Return the extracted text of every page of a PDF, joined by newlines.

    A page whose ``extract_text()`` result is falsy (``None`` or ``""``)
    contributes an empty string, so it still occupies a line in the output.
    """
    with pdfplumber.open(path) as document:
        # Materialise the list inside the ``with`` block, while the PDF is open.
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)
def _read_image_ocr(path: str) -> str:
    """Run OCR on an image file and return the recognised text.

    The image is opened with ``PIL.Image.open`` and passed to
    ``pytesseract.image_to_string``.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Use the image as a context manager so the underlying file handle is
    # released deterministically, even if OCR raises.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)
def extract_text_from_files(filepaths: List[str]) -> List[Tuple[str, str]]:
    """Extract text from each supported file, skipping anything unreadable.

    The reader is chosen from the lower-cased file extension: plain text
    (``TEXT_EXT``), Word documents (``DOCX_EXT``), PDFs (``PDF_EXT``) and
    images via OCR (``IMG_EXT``). A file is left out of the result when its
    extension is not in any of those sets, when its reader raises, or when the
    extracted text is empty or whitespace-only.

    Args:
        filepaths: Paths of the files to process.

    Returns:
        A list of ``(basename, text)`` tuples in input order, one for each
        file that produced non-blank text.
    """
    # Imported locally so the module's import block does not need to change.
    import logging

    log = logging.getLogger(__name__)
    results: List[Tuple[str, str]] = []

    for fp in filepaths:
        _, ext = os.path.splitext(fp.lower())

        if ext in TEXT_EXT:
            reader = _read_text_file
        elif ext in DOCX_EXT:
            reader = _read_docx
        elif ext in PDF_EXT:
            reader = _read_pdf
        elif ext in IMG_EXT:
            reader = _read_image_ocr
        else:
            # Unsupported extension: nothing to extract.
            continue

        try:
            txt = reader(fp)
        except Exception:
            # Extraction stays best-effort (one bad file must not abort the
            # batch), but the failure is logged with its traceback instead of
            # being dropped silently.
            log.warning("Failed to extract text from %s", fp, exc_info=True)
            continue

        if txt and txt.strip():
            results.append((os.path.basename(fp), txt))

    return results