Spaces:

dina1
/

mockup_agent

Sleeping

dina1 committed on Nov 6, 2025

Commit

05fb534

verified ·

1 Parent(s): 7cafda0

Create agents/document_parser.py

Files changed (1) hide show

agents/document_parser.py ADDED Viewed

+import asyncio
+import os
+from typing import List, Dict
+from pdfminer.high_level import extract_text as pdf_extract_text
+from docx import Document
async def parse_documents(file_paths: List[str]) -> Dict:
    """Read PDF, DOCX and plain-text files and return their combined text.

    Each file is handled in the event loop's default executor so the
    blocking parser and filesystem calls do not stall the loop. Files are
    processed one at a time, in the order given, so the output order always
    matches ``file_paths``.

    Args:
        file_paths: Paths of the files to read. Entries may be ``str`` or
            any ``os.PathLike`` object (for example ``pathlib.Path``). The
            reader is chosen from the file extension, case-insensitively:
            ``.pdf`` uses pdfminer's ``extract_text``, ``.docx`` joins the
            paragraph texts of a ``docx.Document``, and anything else is
            read as UTF-8 text.

    Returns:
        A dict with two keys:

        * ``"texts"``: a single string holding every file's content, each
          preceded by a ``===== FILE: <basename> =====`` header line and
          separated from the next file by a blank line.
        * ``"files"``: one dict per input with ``"path"`` (the entry
          exactly as passed in), ``"name"`` (its basename) and, when the
          file could be stat'ed, ``"size"`` in bytes.

    Raises:
        TypeError: If an entry is not a ``str``, ``bytes`` or path-like
            object.

    A file that cannot be read does not raise. Its content is replaced by
    an ``[ERROR READING <path>: <reason>]`` marker so that the remaining
    files are still returned.
    """
    loop = asyncio.get_running_loop()

    def _load(path):
        # Runs in a worker thread. Returns (text, size); size is None when
        # the file cannot be stat'ed (e.g. it does not exist).
        try:
            lowered = path.lower()
            if lowered.endswith(".pdf"):
                text = pdf_extract_text(path)
            elif lowered.endswith(".docx"):
                text = "\n".join(par.text for par in Document(path).paragraphs)
            else:
                with open(path, "r", encoding="utf-8") as handle:
                    text = handle.read()
        except Exception as exc:
            # Deliberately broad: any parser or I/O failure is reported
            # inline so one bad file never aborts the whole batch.
            text = f"[ERROR READING {path}: {exc}]"
        try:
            size = os.path.getsize(path)
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: malformed path
            # such as one containing an embedded NUL character.
            size = None
        return text, size

    sections = []
    files_meta = []
    for entry in file_paths:
        # Normalise str / bytes / os.PathLike to str so the extension check
        # works for pathlib.Path entries instead of failing on ``.lower()``.
        path = os.fsdecode(entry)
        name = os.path.basename(path)
        text, size = await loop.run_in_executor(None, _load, path)
        sections.append(f"===== FILE: {name} =====\n{text}")
        meta = {"path": entry, "name": name}
        if size is not None:
            meta["size"] = size
        files_meta.append(meta)
    return {"texts": "\n\n".join(sections), "files": files_meta}