File size: 2,499 Bytes
b6d0232
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from __future__ import annotations

import csv
import json
import logging
import os
from typing import Iterable

from docx import Document
from pypdf import PdfReader


# Lower-case file extensions (leading dot included) that
# build_document_bundle accepts; paths with any other extension are skipped.
SUPPORTED_DOC_EXTS = {".txt", ".md", ".json", ".csv", ".pdf", ".docx"}


def read_text_file(path: str) -> str:
    """Return the full contents of a plain-text file as a string.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    stripped so it does not leak into the returned text, and bytes that are
    not valid UTF-8 are dropped instead of raising.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # "utf-8-sig" behaves like "utf-8" but swallows an optional leading BOM.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        return f.read()


def read_json_file(path: str) -> str:
    """Load a JSON file and return it re-serialized as pretty-printed text.

    The file is decoded as UTF-8 with an optional leading byte-order mark
    stripped; without that, ``json.load`` rejects BOM-prefixed files. Bytes
    that are not valid UTF-8 are dropped.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The parsed document dumped with a 2-space indent and non-ASCII
        characters kept as-is (``ensure_ascii=False``).

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # "utf-8-sig" strips the BOM that would otherwise make json.load fail.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        obj = json.load(f)
    return json.dumps(obj, ensure_ascii=False, indent=2)


def read_csv_file(path: str) -> str:
    """Render a CSV file as text, one line per row with cells joined by " | ".

    The file is decoded as UTF-8 with an optional leading byte-order mark
    stripped, so the BOM does not end up glued to the first cell. Bytes that
    are not valid UTF-8 are dropped.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The rows joined by newlines; an empty string for an empty file.

    Raises:
        OSError: If the file cannot be opened or read.
        csv.Error: If the csv module cannot parse the content.
    """
    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(path, "r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        # The join must finish inside the "with" because the reader is lazy.
        return "\n".join(" | ".join(row) for row in csv.reader(f))


def read_pdf_file(path: str) -> str:
    """Extract the text of a PDF, page by page, joined with newlines.

    Extraction is best-effort: a page whose ``extract_text()`` call raises is
    left out of the result, and a warning with the page number and traceback
    is logged so the omission is visible. A page for which ``extract_text()``
    returns a falsy value contributes an empty string.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The extracted page texts joined by newlines.

    Raises:
        Exception: Whatever ``PdfReader(path)`` or iterating ``reader.pages``
            raises; only per-page extraction errors are handled here.
    """
    reader = PdfReader(path)
    texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            texts.append(page.extract_text() or "")
        except Exception:
            # Keep going so one bad page does not lose the whole document,
            # but leave a trace instead of dropping the page silently.
            logging.getLogger(__name__).warning(
                "Skipping page %d of %s: text extraction failed",
                page_number,
                path,
                exc_info=True,
            )
    return "\n".join(texts)


def read_docx_file(path: str) -> str:
    """Return the non-blank paragraphs of a .docx file, one per line.

    Only ``Document(path).paragraphs`` is read. Paragraphs whose text is
    empty or whitespace-only are left out; kept paragraphs are not trimmed.

    Args:
        path: Path of the .docx file to read.

    Returns:
        The kept paragraph texts joined by newlines.
    """
    kept = []
    for paragraph in Document(path).paragraphs:
        text = paragraph.text
        if not text.strip():
            continue
        kept.append(text)
    return "\n".join(kept)


def extract_text_from_document(path: str) -> str:
    """Read a document as text, choosing the reader by file extension.

    The extension is compared case-insensitively. ``.txt`` and ``.md`` go to
    the plain-text reader; ``.json``, ``.csv``, ``.pdf`` and ``.docx`` each
    go to their own reader.

    Args:
        path: Path of the document to read.

    Returns:
        The text produced by the selected reader.

    Raises:
        ValueError: If the extension is not one of the handled types.
    """
    _, suffix = os.path.splitext(path)
    suffix = suffix.lower()

    # Select the reader first so there is a single call site below.
    if suffix == ".txt" or suffix == ".md":
        loader = read_text_file
    elif suffix == ".json":
        loader = read_json_file
    elif suffix == ".csv":
        loader = read_csv_file
    elif suffix == ".pdf":
        loader = read_pdf_file
    elif suffix == ".docx":
        loader = read_docx_file
    else:
        raise ValueError(f"不支持的文档类型: {suffix}")

    return loader(path)


def build_document_bundle(doc_paths: Iterable[str]) -> str:
    """Concatenate the text of several documents into one delimited string.

    Each accepted document becomes a block framed by
    ``===== DOCUMENT START =====`` / ``===== DOCUMENT END =====`` markers and
    carrying its file name, its path and its extracted content. Blocks are
    separated by a blank line.

    Entries are skipped when they are empty, do not exist on disk, or have an
    extension outside ``SUPPORTED_DOC_EXTS``. If reading a document raises,
    its block is still emitted with the error description as content, so one
    bad file does not abort the whole bundle.

    Args:
        doc_paths: Paths of the documents to include. ``None`` is treated as
            "no documents". A single path given as a bare string is treated
            as a one-element list.

    Returns:
        The joined blocks, or an empty string when nothing was included.
    """
    if doc_paths is None:
        return ""
    if isinstance(doc_paths, str):
        # Iterating a bare string would yield single characters, none of
        # which is a usable path, so the caller would silently get "".
        doc_paths = [doc_paths]

    blocks = []
    for path in doc_paths:
        if not path or not os.path.exists(path):
            continue
        ext = os.path.splitext(path)[1].lower()
        if ext not in SUPPORTED_DOC_EXTS:
            continue
        try:
            content = extract_text_from_document(path).strip()
        except Exception as e:
            # Deliberate catch-all: surface the failure inside the bundle
            # rather than losing the other documents.
            content = f"[文档读取失败] {type(e).__name__}: {e}"
        blocks.append(
            f"===== DOCUMENT START =====\n"
            f"FILE_NAME: {os.path.basename(path)}\n"
            f"FILE_PATH: {path}\n"
            f"CONTENT:\n{content}\n"
            f"===== DOCUMENT END ====="
        )
    return "\n\n".join(blocks)