# ASD / app / document_utils.py
# Nx-Neuralon's picture
# Upload 64 files
# b6d0232 verified
from __future__ import annotations
import csv
import json
import os
from typing import Iterable
from pypdf import PdfReader
from docx import Document
# Lower-case file extensions (including the leading dot) that this module
# can convert to text; anything else is skipped when building a bundle.
SUPPORTED_DOC_EXTS = {
    ".txt",
    ".md",
    ".json",
    ".csv",
    ".pdf",
    ".docx",
}
def read_text_file(path: str) -> str:
    """Return the full contents of a plain-text file.

    The file is decoded as UTF-8. A leading byte-order mark (U+FEFF, commonly
    written by Windows editors) is dropped instead of leaking into the
    returned text, and bytes that cannot be decoded are skipped rather than
    raising.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but strips an optional BOM.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        return f.read()
def read_json_file(path: str) -> str:
    """Load a JSON file and return it re-serialized as pretty-printed text.

    The file is decoded as UTF-8 with an optional leading byte-order mark
    removed; without that, ``json.load`` rejects BOM-prefixed files with
    "Unexpected UTF-8 BOM". Undecodable bytes are skipped.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed document dumped with a 2-space indent and non-ASCII
        characters kept as-is (not ``\\u``-escaped).

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but strips an optional BOM.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        obj = json.load(f)
    return json.dumps(obj, ensure_ascii=False, indent=2)
def read_csv_file(path: str) -> str:
    """Render a CSV file as text, one line per row.

    Cells of a row are joined with ``" | "`` and rows are joined with
    newlines. The file is decoded as UTF-8 with an optional leading
    byte-order mark removed, so the first cell is not polluted by U+FEFF;
    undecodable bytes are skipped.

    Args:
        path: Location of the CSV file.

    Returns:
        The rendered rows; an empty string for an empty file.

    Raises:
        OSError: If the file cannot be opened or read.
        csv.Error: If the ``csv`` module cannot parse the content.
    """
    # newline="" lets the csv module handle line endings inside quoted cells.
    with open(path, "r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        return "\n".join(" | ".join(row) for row in csv.reader(f))
def read_pdf_file(path: str) -> str:
    """Extract the text of a PDF, page by page, joined with newlines.

    Extraction is best-effort: a page whose ``extract_text()`` call raises is
    left out of the result, and a page that yields no text contributes an
    empty string.

    Args:
        path: Location of the PDF file.

    Returns:
        The text of every page that could be extracted, newline-separated.
    """
    page_texts = []
    for pdf_page in PdfReader(path).pages:
        try:
            extracted = page_texts.append(pdf_page.extract_text() or "")
        except Exception:
            # One unreadable page should not discard the rest of the file.
            continue
    return "\n".join(page_texts)
def read_docx_file(path: str) -> str:
    """Return the non-blank paragraphs of a .docx file, newline-separated.

    Only ``doc.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are dropped.

    Args:
        path: Location of the .docx file.

    Returns:
        The text of each kept paragraph, one per line.
    """
    kept = []
    for paragraph in Document(path).paragraphs:
        text = paragraph.text
        if text.strip():
            kept.append(text)
    return "\n".join(kept)
def extract_text_from_document(path: str) -> str:
    """Read a document as text, picking the reader from its file extension.

    The extension is compared case-insensitively.

    Args:
        path: Location of the document.

    Returns:
        The text produced by the matching reader.

    Raises:
        ValueError: If the extension is not one of .txt, .md, .json, .csv,
            .pdf or .docx.
    """
    suffix = os.path.splitext(path)[1].lower()

    # Select the reader first so there is a single call site below.
    if suffix in (".txt", ".md"):
        reader = read_text_file
    elif suffix == ".json":
        reader = read_json_file
    elif suffix == ".csv":
        reader = read_csv_file
    elif suffix == ".pdf":
        reader = read_pdf_file
    elif suffix == ".docx":
        reader = read_docx_file
    else:
        raise ValueError(f"不支持的文档类型: {suffix}")

    return reader(path)
def build_document_bundle(doc_paths: Iterable[str]) -> str:
    """Combine the text of several documents into one delimited string.

    Each readable document becomes a block framed by
    ``===== DOCUMENT START =====`` / ``===== DOCUMENT END =====`` markers,
    carrying the file name, the path as given, and the stripped text.
    Blocks are separated by a blank line.

    Entries that are empty, do not exist on disk, or have an extension not
    in ``SUPPORTED_DOC_EXTS`` are skipped silently. If reading a supported
    file fails, the block is still emitted with the error description in
    place of the content, so one bad file does not hide the others.

    Args:
        doc_paths: Paths of the documents to bundle. ``None`` is treated as
            "no documents", and a single path given as a bare string or
            ``os.PathLike`` is treated as a one-element collection rather
            than being iterated character by character.

    Returns:
        The concatenated blocks, or an empty string if nothing was bundled.
    """
    if doc_paths is None:
        return ""
    if isinstance(doc_paths, (str, os.PathLike)):
        # A lone path is iterable too; wrap it so it is handled as one file.
        doc_paths = [doc_paths]

    blocks = []
    for path in doc_paths:
        if not path or not os.path.exists(path):
            continue
        ext = os.path.splitext(path)[1].lower()
        if ext not in SUPPORTED_DOC_EXTS:
            continue
        try:
            content = extract_text_from_document(path).strip()
        except Exception as e:
            # Report the failure inside the bundle instead of aborting it.
            content = f"[文档读取失败] {type(e).__name__}: {e}"
        blocks.append(
            f"===== DOCUMENT START =====\n"
            f"FILE_NAME: {os.path.basename(path)}\n"
            f"FILE_PATH: {path}\n"
            f"CONTENT:\n{content}\n"
            f"===== DOCUMENT END ====="
        )
    return "\n\n".join(blocks)