"""Extract plain text from supported document types and bundle it.

Each supported extension has a small reader function that returns the
document's text; ``build_document_bundle`` concatenates several documents
into one delimited text blob.
"""
from __future__ import annotations

import csv
import json
import logging
import os
from typing import Iterable

# The PDF/DOCX backends are imported defensively so that a missing optional
# package does not make the plain-text, JSON and CSV readers unusable. The
# corresponding reader raises a clear ImportError when it is actually called.
try:
    from pypdf import PdfReader
except ImportError:
    PdfReader = None  # type: ignore[assignment, misc]

try:
    from docx import Document
except ImportError:
    Document = None  # type: ignore[assignment]

_logger = logging.getLogger(__name__)

SUPPORTED_DOC_EXTS = {".txt", ".md", ".json", ".csv", ".pdf", ".docx"}

# "utf-8-sig" decodes exactly like "utf-8" but also strips a leading BOM.
# With plain "utf-8" the BOM survives as "\ufeff", which makes json.load fail
# and pollutes the first token of text and CSV content.
_TEXT_ENCODING = "utf-8-sig"


def read_text_file(path: str) -> str:
    """Return the content of a UTF-8 text file.

    Undecodable bytes are dropped (``errors="ignore"``) and a leading
    byte-order mark, if present, is removed.
    """
    with open(path, "r", encoding=_TEXT_ENCODING, errors="ignore") as f:
        return f.read()


def read_json_file(path: str) -> str:
    """Parse a JSON file and return it re-serialized with 2-space indent.

    Non-ASCII characters are kept as-is (``ensure_ascii=False``).

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, "r", encoding=_TEXT_ENCODING, errors="ignore") as f:
        obj = json.load(f)
    return json.dumps(obj, ensure_ascii=False, indent=2)


def read_csv_file(path: str) -> str:
    """Return a CSV file as text, one line per row, cells joined by ``" | "``."""
    # newline="" lets the csv module handle embedded newlines in quoted cells.
    with open(path, "r", encoding=_TEXT_ENCODING, errors="ignore", newline="") as f:
        rows = [" | ".join(str(cell) for cell in row) for row in csv.reader(f)]
    return "\n".join(rows)


def read_pdf_file(path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Extraction is best-effort: a page whose text extraction raises is skipped
    (and logged) instead of failing the whole document.

    Raises:
        ImportError: If the ``pypdf`` package is not installed.
    """
    if PdfReader is None:
        raise ImportError("pypdf is required to read .pdf files")

    reader = PdfReader(path)
    texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            texts.append(page.extract_text() or "")
        except Exception as exc:  # noqa: BLE001 - deliberate per-page best effort
            _logger.warning(
                "Skipping page %d of %s: text extraction failed (%s: %s)",
                page_number,
                path,
                type(exc).__name__,
                exc,
            )
    return "\n".join(texts)


def read_docx_file(path: str) -> str:
    """Return the non-blank paragraphs of a DOCX file, joined by newlines.

    Raises:
        ImportError: If the ``python-docx`` package is not installed.
    """
    if Document is None:
        raise ImportError("python-docx is required to read .docx files")

    doc = Document(path)
    parts = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n".join(parts)


def extract_text_from_document(path: str) -> str:
    """Dispatch to the reader matching the file extension of ``path``.

    The extension is compared case-insensitively.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext in {".txt", ".md"}:
        return read_text_file(path)
    if ext == ".json":
        return read_json_file(path)
    if ext == ".csv":
        return read_csv_file(path)
    if ext == ".pdf":
        return read_pdf_file(path)
    if ext == ".docx":
        return read_docx_file(path)
    raise ValueError(f"不支持的文档类型: {ext}")


def build_document_bundle(doc_paths: Iterable[str]) -> str:
    """Concatenate the text of several documents into one delimited string.

    Empty entries, paths that do not exist and files with an unsupported
    extension are skipped. A document that fails to load is still included,
    with a failure marker in place of its content.

    Args:
        doc_paths: Paths of the documents to include. ``None`` yields an
            empty bundle, and a single ``str`` / ``os.PathLike`` is treated
            as one document rather than iterated character by character.

    Returns:
        One block per document, separated by a blank line; ``""`` when no
        document qualifies.
    """
    if doc_paths is None:
        return ""
    if isinstance(doc_paths, (str, os.PathLike)):
        doc_paths = [doc_paths]

    blocks = []
    for path in doc_paths:
        if not path or not os.path.exists(path):
            continue
        ext = os.path.splitext(path)[1].lower()
        if ext not in SUPPORTED_DOC_EXTS:
            continue

        try:
            content = extract_text_from_document(path).strip()
        except Exception as e:  # noqa: BLE001 - one bad file must not abort the bundle
            content = f"[文档读取失败] {type(e).__name__}: {e}"

        blocks.append(
            f"===== DOCUMENT START =====\n"
            f"FILE_NAME: {os.path.basename(path)}\n"
            f"FILE_PATH: {path}\n"
            f"CONTENT:\n{content}\n"
            f"===== DOCUMENT END ====="
        )
    return "\n\n".join(blocks)