File size: 1,242 Bytes
05fb534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f5969c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import asyncio
import os
from typing import List, Dict
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document

async def parse_documents(file_paths: List[str]) -> Dict:
    """
    Read PDF, DOCX and plain-text files and return their combined text plus metadata.

    Each file is processed in the event loop's default executor so blocking
    file I/O and parsing never run on the event loop thread. All files are
    scheduled together and awaited with ``asyncio.gather``, which returns
    results in input order, so the output order matches ``file_paths``.

    Dispatch is by case-insensitive file extension:

    * ``.pdf``  -> ``pdf_extract_text(path)``
    * ``.docx`` -> paragraphs of ``Document(path)`` joined with newlines
    * anything else -> read as UTF-8 text

    Reading is best-effort: any exception raised while reading a file is
    replaced by an ``[ERROR READING <path>: <error>]`` placeholder in the
    text instead of being propagated.

    Args:
        file_paths: Paths of the files to read.

    Returns:
        A dict with two keys:

        * ``"text"``: one ``===== FILE: <basename> =====`` section per file,
          sections separated by a blank line.
        * ``"files"``: one dict per file with ``"path"`` and ``"name"``, plus
          ``"size"`` (bytes) when the size could be determined.
    """
    loop = asyncio.get_running_loop()

    def _read(path):
        # Best-effort extraction: a failure becomes a placeholder string so one
        # unreadable file does not abort the whole batch.
        try:
            lowered = path.lower()
            if lowered.endswith(".pdf"):
                return pdf_extract_text(path)
            if lowered.endswith(".docx"):
                return "\n".join(par.text for par in Document(path).paragraphs)
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        except Exception as e:
            return f"[ERROR READING {path}: {e}]"

    def _load(path):
        # Runs in a worker thread: does the read *and* the stat call so that no
        # blocking filesystem access happens on the event loop.
        text = _read(path)
        name = os.path.basename(path)
        meta = {"path": path, "name": name}
        try:
            meta["size"] = os.path.getsize(path)
        except (OSError, ValueError):
            # Missing/unreadable file (OSError) or malformed path such as an
            # embedded NUL (ValueError): report the file without a size.
            pass
        return f"===== FILE: {name} =====\n{text}", meta

    # NOTE(review): files are now parsed concurrently in the default thread
    # pool; this assumes the PDF/DOCX parsers are safe to call from several
    # threads on different files - confirm for the pinned library versions.
    results = await asyncio.gather(
        *(loop.run_in_executor(None, _load, p) for p in file_paths)
    )

    parts = [section for section, _ in results]
    files_meta = [meta for _, meta in results]
    return {"text": "\n\n".join(parts), "files": files_meta}