Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import csv | |
| import json | |
| import os | |
| from typing import Iterable | |
| from pypdf import PdfReader | |
| from docx import Document | |
# Lower-case file extensions (including the leading dot) that
# build_document_bundle accepts; each one has a matching branch in
# extract_text_from_document.
SUPPORTED_DOC_EXTS = {".txt", ".md", ".json", ".csv", ".pdf", ".docx"}
def read_text_file(path: str) -> str:
    """Return the full contents of a text file as a string.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    stripped so it does not leak into the returned text as a stray U+FEFF
    character. Bytes that cannot be decoded are silently dropped.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # "utf-8-sig" behaves exactly like "utf-8" except that it consumes an
    # optional BOM at the very start of the file.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        return f.read()
def read_json_file(path: str) -> str:
    """Load a JSON file and return it re-serialized as pretty-printed text.

    The file is decoded as UTF-8 with an optional leading byte-order mark
    removed; without that, ``json.load`` rejects BOM-prefixed files.
    Bytes that cannot be decoded are silently dropped.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The parsed document dumped with a 2-space indent and non-ASCII
        characters kept as-is (not escaped).

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # "utf-8-sig" strips a leading BOM, which the JSON parser would otherwise
    # report as "Unexpected UTF-8 BOM".
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        obj = json.load(f)
    return json.dumps(obj, ensure_ascii=False, indent=2)
def read_csv_file(path: str) -> str:
    """Render a CSV file as text, one line per record.

    The cells of each record are joined with ``" | "`` and the records are
    joined with newlines. The file is decoded as UTF-8 with an optional
    leading byte-order mark removed, so the first cell of a BOM-prefixed
    file does not start with a stray U+FEFF character. Bytes that cannot be
    decoded are silently dropped.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The rendered text; an empty string for an empty file.

    Raises:
        OSError: If the file cannot be opened or read.
        csv.Error: If the csv module cannot parse the content.
    """
    # newline="" lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(path, "r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        # csv.reader yields lists of str, so the cells can be joined directly.
        return "\n".join(" | ".join(row) for row in csv.reader(f))
def read_pdf_file(path: str) -> str:
    """Extract the text of a PDF, joining the text of its pages with newlines.

    Extraction is best-effort: a page whose text extraction raises is left
    out of the result, and a page that yields no text contributes an empty
    string.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The extracted page texts joined with newlines.
    """

    def _page_texts(pdf):
        # Yield the text of each page, skipping pages that fail to extract.
        for pg in pdf.pages:
            try:
                extracted = pg.extract_text()
            except Exception:
                continue
            yield extracted if extracted else ""

    return "\n".join(_page_texts(PdfReader(path)))
def read_docx_file(path: str) -> str:
    """Extract the paragraph text of a .docx file.

    Paragraphs whose text is empty or whitespace-only are dropped; the
    remaining paragraph texts are joined with newlines, unmodified.

    Args:
        path: Path of the .docx file to read.

    Returns:
        The non-blank paragraph texts joined with newlines.
    """
    kept = []
    for para in Document(path).paragraphs:
        text = para.text
        # Only the emptiness check uses the stripped form; the original
        # text is what gets kept.
        if text.strip():
            kept.append(text)
    return "\n".join(kept)
def extract_text_from_document(path: str) -> str:
    """Extract the text of a document, choosing the reader by file extension.

    The extension is compared case-insensitively. ``.txt`` and ``.md`` go to
    the plain-text reader; ``.json``, ``.csv``, ``.pdf`` and ``.docx`` each
    go to their dedicated reader.

    Args:
        path: Path of the document to read.

    Returns:
        The text produced by the selected reader.

    Raises:
        ValueError: If the extension is not one of the handled types.
    """
    _, raw_suffix = os.path.splitext(path)
    suffix = raw_suffix.lower()

    if suffix == ".txt" or suffix == ".md":
        handler = read_text_file
    elif suffix == ".json":
        handler = read_json_file
    elif suffix == ".csv":
        handler = read_csv_file
    elif suffix == ".pdf":
        handler = read_pdf_file
    elif suffix == ".docx":
        handler = read_docx_file
    else:
        raise ValueError(f"不支持的文档类型: {suffix}")

    return handler(path)
def build_document_bundle(doc_paths: Iterable[str]) -> str:
    """Combine the text of several documents into one delimited string.

    Entries that are falsy, that do not exist on disk, or whose lower-cased
    extension is not in ``SUPPORTED_DOC_EXTS`` are skipped. Every remaining
    document becomes a block holding its base name, its path as given, and
    its extracted text with surrounding whitespace stripped. If extraction
    raises, the block's content is a failure note naming the exception
    type and message instead.

    Args:
        doc_paths: Paths of the documents to include.

    Returns:
        The document blocks joined by a blank line; an empty string when no
        document qualifies.
    """
    rendered = []
    for doc_path in doc_paths:
        is_present = bool(doc_path) and os.path.exists(doc_path)
        if not is_present:
            continue

        suffix = os.path.splitext(doc_path)[1].lower()
        if suffix in SUPPORTED_DOC_EXTS:
            try:
                body = extract_text_from_document(doc_path).strip()
            except Exception as err:
                # A document that cannot be read still gets a block, so the
                # failure is visible in the bundle.
                body = f"[文档读取失败] {type(err).__name__}: {err}"

            file_name = os.path.basename(doc_path)
            section = (
                f"===== DOCUMENT START =====\n"
                f"FILE_NAME: {file_name}\n"
                f"FILE_PATH: {doc_path}\n"
                f"CONTENT:\n{body}\n"
                f"===== DOCUMENT END ====="
            )
            rendered.append(section)

    return "\n\n".join(rendered)