dina1 committed on
Commit
05fb534
·
verified ·
1 Parent(s): 7cafda0

Create agents/document_parser.py

Browse files
Files changed (1) hide show
  1. agents/document_parser.py +35 -0
agents/document_parser.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from typing import List, Dict
4
+ from pdfminer.high_level import extract_text as pdf_extract_text
5
+ from docx import Document
6
+
7
def _read_document(path: str) -> str:
    """Return the text content of a single file.

    The reader is chosen from the (case-insensitive) file extension:
    ``.pdf`` goes through ``pdf_extract_text``, ``.docx`` is opened with
    ``Document`` and only ``doc.paragraphs`` is read (joined with newlines),
    and everything else is read as UTF-8 text.

    This is deliberately best-effort: any ``Exception`` raised while reading
    is converted into an inline ``[ERROR READING ...]`` marker so that one bad
    file does not abort the whole batch.
    """
    try:
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            return pdf_extract_text(path)
        if lowered.endswith(".docx"):
            return "\n".join(par.text for par in Document(path).paragraphs)
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()
    except Exception as exc:
        return f"[ERROR READING {path}: {exc}]"


def _file_meta(path: str) -> Dict:
    """Build the metadata entry for *path*.

    Always contains ``"path"`` and ``"name"`` (the basename); ``"size"`` (in
    bytes, from ``os.path.getsize``) is added only when the size can be read.
    """
    meta = {"path": path, "name": os.path.basename(path)}
    try:
        meta["size"] = os.path.getsize(path)
    except (OSError, ValueError):
        # Missing/unreadable file or malformed path (e.g. embedded NUL):
        # the entry is still reported, just without a size.
        pass
    return meta


def _load_one(path: str):
    """Read one file and stat it; returns a ``(text, meta)`` pair.

    Runs in a worker thread so that neither the parsing nor the blocking
    ``getsize`` call happens on the event-loop thread.
    """
    text = _read_document(path)
    return text, _file_meta(path)


async def parse_documents(file_paths: List[str]) -> Dict:
    """Read PDF/DOCX/text files and return aggregated text plus metadata.

    Args:
        file_paths: Paths of the files to read. Any iterable of strings is
            accepted; it is consumed exactly once.

    Returns:
        A dict with two keys:

        * ``"texts"`` - one string in which each file contributes a
          ``===== FILE: <basename> =====`` header followed by its text (or an
          ``[ERROR READING ...]`` marker if it could not be read); sections
          are separated by a blank line and appear in input order.
        * ``"files"`` - a list, in input order, of dicts with ``"path"``,
          ``"name"`` and, when available, ``"size"``.
    """
    loop = asyncio.get_running_loop()
    paths = list(file_paths)

    # Submit every file to the default executor up front so they are processed
    # concurrently; gather() returns results in submission order.
    results = await asyncio.gather(
        *(loop.run_in_executor(None, _load_one, path) for path in paths)
    )

    sections = [
        f"===== FILE: {meta['name']} =====\n{text}" for text, meta in results
    ]
    return {
        "texts": "\n\n".join(sections),
        "files": [meta for _, meta in results],
    }