Spaces:
Sleeping
Sleeping
| import asyncio | |
| import os | |
| from typing import List, Dict | |
| from pdfminer.high_level import extract_text as pdf_extract_text | |
| from docx import Document | |
async def parse_documents(file_paths: List[str]) -> Dict:
    """Read PDF, DOCX, and plain-text files and aggregate their contents.

    Each file is processed in the event loop's default executor so the
    blocking parser and filesystem calls do not stall the loop. All files
    are submitted up front and awaited together; results are returned in
    the same order as ``file_paths``.

    Args:
        file_paths: Paths of the files to read. A path ending in ``.pdf``
            or ``.docx`` (case-insensitive) is parsed with pdfminer or
            python-docx respectively; any other path is read as UTF-8 text.

    Returns:
        A dict with two keys:

        * ``"text"``: one section per file, each introduced by a
          ``===== FILE: <basename> =====`` header line, with sections
          separated by a blank line. A file that cannot be read contributes
          an ``[ERROR READING <path>: <reason>]`` marker instead of content.
        * ``"files"``: one metadata dict per file with ``"path"`` and
          ``"name"``, plus ``"size"`` in bytes when it can be determined.
    """

    def _read(path):
        # Best-effort by design: one unreadable or corrupt file must not
        # abort the whole batch, so any failure becomes an inline marker.
        try:
            lowered = path.lower()
            if lowered.endswith(".pdf"):
                return pdf_extract_text(path)
            if lowered.endswith(".docx"):
                doc = Document(path)
                return "\n".join(para.text for para in doc.paragraphs)
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        except Exception as e:
            return f"[ERROR READING {path}: {e}]"

    def _load(path):
        # Runs entirely in a worker thread: content extraction plus the
        # stat() behind getsize, both of which block.
        name = os.path.basename(path)
        section = f"===== FILE: {name} =====\n{_read(path)}"
        try:
            size = os.path.getsize(path)
        except (OSError, ValueError):
            # Missing/inaccessible file (OSError) or a malformed path such
            # as one with an embedded NUL (ValueError): size is optional.
            return section, {"path": path, "name": name}
        return section, {"path": path, "name": name, "size": size}

    loop = asyncio.get_running_loop()
    # Submit every file before awaiting any of them so the reads overlap;
    # gather() yields results in submission order.
    loaded = await asyncio.gather(
        *(loop.run_in_executor(None, _load, p) for p in file_paths)
    )
    return {
        "text": "\n\n".join(section for section, _ in loaded),
        "files": [meta for _, meta in loaded],
    }