mockup_agent / agents /document_parser.py
dina1's picture
Update agents/document_parser.py
2f5969c verified
import asyncio
import os
from typing import List, Dict
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
async def parse_documents(file_paths: List[str]) -> Dict:
    """Read PDF, DOCX and plain-text files and return their combined text.

    Every file is loaded inside the event loop's default executor so the
    blocking parser and filesystem calls do not stall the loop. Files are
    processed one at a time, in the order given.

    Args:
        file_paths: Paths of the files to read. Plain strings and
            ``os.PathLike`` objects (e.g. ``pathlib.Path``) are accepted.
            Names ending in ``.pdf`` (case-insensitive) are read with
            ``pdf_extract_text``, names ending in ``.docx`` with
            ``Document``; anything else is read as UTF-8 text.

    Returns:
        A dict with two keys:

        * ``"text"``: one section per file, each starting with a
          ``===== FILE: <basename> =====`` header line; sections are
          separated by a blank line. A file that cannot be read contributes
          an ``[ERROR READING <path>: <error>]`` marker instead of raising.
        * ``"files"``: one dict per input, in order, holding ``"path"`` (the
          value as passed in), ``"name"`` (basename) and, when the file size
          could be determined, ``"size"`` in bytes.
    """
    loop = asyncio.get_running_loop()

    def _read(path):
        # Best-effort by design: any failure is reported inline in the text
        # so that one bad file does not abort the whole batch.
        try:
            # Normalise PathLike objects to their string form; for plain
            # strings this is a no-op.
            filename = os.fspath(path)
            lowered = filename.lower()
            if lowered.endswith(".pdf"):
                return pdf_extract_text(filename)
            if lowered.endswith(".docx"):
                doc = Document(filename)
                return "\n".join(p.text for p in doc.paragraphs)
            with open(filename, "r", encoding="utf-8") as f:
                return f.read()
        except Exception as e:
            return f"[ERROR READING {path}: {e}]"

    def _load(path):
        # Runs in a worker thread: read the content, then stat the file, so
        # neither blocking call happens on the event loop thread.
        text = _read(path)
        try:
            size = os.path.getsize(path)
        except (OSError, ValueError):
            # Missing/unreadable file, or a path the OS rejects outright
            # (e.g. embedded NUL byte): report the file without a size.
            size = None
        return text, size

    parts = []
    files_meta = []
    for p in file_paths:
        text, size = await loop.run_in_executor(None, _load, p)
        name = os.path.basename(p)
        parts.append(f"===== FILE: {name} =====\n{text}")
        meta = {"path": p, "name": name}
        if size is not None:
            meta["size"] = size
        files_meta.append(meta)

    return {"text": "\n\n".join(parts), "files": files_meta}