import asyncio
import os
from typing import Any, Dict, List, Optional, Tuple

from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document


async def parse_documents(file_paths: List[str]) -> Dict[str, Any]:
    """Read PDF, DOCX and plain-text files and aggregate their contents.

    Files are processed one at a time, in the order given. All blocking
    work for a file (parsing/reading it and looking up its size) runs in
    the event loop's default executor so the loop itself is never blocked.

    Dispatch is by case-insensitive file-name suffix: ``.pdf`` goes through
    pdfminer's ``extract_text``, ``.docx`` through python-docx (paragraph
    texts joined with newlines), and anything else is read as UTF-8 text.

    Args:
        file_paths: Paths of the files to read.

    Returns:
        A dict with two keys:

        * ``"text"``: one section per file, each introduced by a
          ``===== FILE: <basename> =====`` header line, sections separated
          by a blank line.
        * ``"files"``: one metadata dict per file with ``"path"`` and
          ``"name"`` (the basename), plus ``"size"`` in bytes when the size
          could be determined.

    A file that cannot be read does not raise; its section body is the
    marker ``[ERROR READING <path>: <error>]`` instead.
    """
    loop = asyncio.get_running_loop()

    def _read_text(path: str) -> str:
        """Return the text of one file, or an error marker if reading fails."""
        try:
            lowered = path.lower()
            if lowered.endswith(".pdf"):
                return pdf_extract_text(path)
            if lowered.endswith(".docx"):
                return "\n".join(par.text for par in Document(path).paragraphs)
            with open(path, "r", encoding="utf-8") as handle:
                return handle.read()
        except Exception as exc:
            # Deliberately broad: the third-party parsers can raise arbitrary
            # exception types, and one bad file must not abort the whole batch.
            return f"[ERROR READING {path}: {exc}]"

    def _load(path: str) -> Tuple[str, Optional[int]]:
        """Return (text, size) for one file; runs in a worker thread."""
        text = _read_text(path)
        try:
            # Stat here, not in the coroutine, to keep disk I/O off the loop.
            size: Optional[int] = os.path.getsize(path)
        except (OSError, ValueError):
            # Missing/inaccessible file or malformed path: size is optional.
            size = None
        return text, size

    sections: List[str] = []
    files_meta: List[Dict[str, Any]] = []

    for path in file_paths:
        text, size = await loop.run_in_executor(None, _load, path)
        name = os.path.basename(path)

        sections.append(f"===== FILE: {name} =====\n{text}")

        meta: Dict[str, Any] = {"path": path, "name": name}
        if size is not None:
            meta["size"] = size
        files_meta.append(meta)

    return {"text": "\n\n".join(sections), "files": files_meta}