Spaces:
Sleeping
Sleeping
File size: 1,242 Bytes
05fb534 2f5969c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import asyncio
import os
from typing import List, Dict
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
async def parse_documents(file_paths: List[str]) -> Dict:
    """Read PDF, DOCX and plain-text files and return their combined text.

    Every file is read in the event loop's default executor so the blocking
    parsers do not stall the loop. All reads are scheduled together instead
    of being awaited one by one; results are still reported in input order.
    A file that cannot be read does not abort the batch: its section holds
    an ``[ERROR READING <path>: <error>]`` marker instead of its text.

    Args:
        file_paths: Paths of the files to read. Names ending in ``.pdf`` or
            ``.docx`` (case-insensitive) are handed to ``pdf_extract_text``
            and ``Document`` respectively; anything else is opened as UTF-8
            text. ``os.PathLike`` objects are accepted as well as strings.

    Returns:
        A dict with two keys:
            ``"text"``: one section per file, each introduced by a
            ``===== FILE: <basename> =====`` header line, with sections
            separated by a blank line.
            ``"files"``: one dict per file with ``"path"`` (as passed in),
            ``"name"`` (basename) and, when it can be determined, ``"size"``
            in bytes.
    """
    loop = asyncio.get_running_loop()
    # Materialise once: the paths are walked twice (reads, then metadata),
    # which would silently drop entries for a one-shot iterable.
    paths = list(file_paths)

    def _read(path):
        """Return the text of one file, or an inline error marker on failure."""
        try:
            # os.fspath lets pathlib.Path (and other PathLike) inputs through;
            # it sits inside the try so an unusable path still yields a
            # marker rather than an exception.
            name = os.fspath(path)
            lowered = name.lower()
            if lowered.endswith(".pdf"):
                return pdf_extract_text(name)
            if lowered.endswith(".docx"):
                doc = Document(name)
                return "\n".join(para.text for para in doc.paragraphs)
            with open(name, "r", encoding="utf-8") as handle:
                return handle.read()
        except Exception as e:
            # Deliberately broad: any parser or I/O failure is reported
            # in-band so the remaining files are still processed.
            return f"[ERROR READING {path}: {e}]"

    # _read never raises, so gather cannot fail part-way; it returns the
    # texts in the same order as `paths`.
    texts = await asyncio.gather(
        *(loop.run_in_executor(None, _read, path) for path in paths)
    )

    parts = []
    files_meta = []
    for path, text in zip(paths, texts):
        base = os.path.basename(path)
        parts.append(f"===== FILE: {base} =====\n{text}")
        entry = {"path": path, "name": base}
        try:
            entry["size"] = os.path.getsize(path)
        except (OSError, ValueError):
            # Missing/unreadable file (OSError) or a path the OS rejects,
            # e.g. an embedded NUL (ValueError): report it without a size.
            pass
        files_meta.append(entry)

    return {"text": "\n\n".join(parts), "files": files_meta}
|