Spaces:

Nx-Neuralon
/

ASD

Sleeping

App Files Files Community

ASD / app /document_utils.py

Nx-Neuralon

Upload 64 files

b6d0232 verified 28 days ago

raw

history blame contribute delete

2.5 kB

	from __future__ import annotations

	import csv
	import json
	import os
	from typing import Iterable

	from pypdf import PdfReader
	from docx import Document


	# Lower-case file extensions (including the leading dot) that
	# build_document_bundle will try to read; paths with any other
	# extension are skipped there.
	SUPPORTED_DOC_EXTS = {
	    ".csv",
	    ".docx",
	    ".json",
	    ".md",
	    ".pdf",
	    ".txt",
	}


	def read_text_file(path: str) -> str:
	    """Return the entire contents of a plain-text file.

	    The file is decoded as UTF-8. The ``utf-8-sig`` codec is used so that a
	    leading byte-order mark (written by some editors) is dropped instead of
	    leaking into the result as a stray U+FEFF character; files without a
	    BOM decode exactly as with plain UTF-8. Bytes that cannot be decoded
	    are skipped rather than raising.

	    Args:
	        path: Location of the file to read.

	    Returns:
	        The decoded text of the file.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
	        return f.read()


	def read_json_file(path: str) -> str:
	    """Parse a JSON file and return it re-serialized as indented text.

	    The file is decoded with ``utf-8-sig`` so that a leading byte-order
	    mark is stripped before parsing; with plain UTF-8 the BOM would reach
	    the parser and ``json.load`` would reject the document. Undecodable
	    bytes are skipped.

	    Args:
	        path: Location of the JSON file.

	    Returns:
	        The document dumped with a two-space indent and non-ASCII
	        characters kept as-is (``ensure_ascii=False``).

	    Raises:
	        OSError: If the file cannot be opened or read.
	        json.JSONDecodeError: If the content is not valid JSON.
	    """
	    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
	        obj = json.load(f)
	    return json.dumps(obj, ensure_ascii=False, indent=2)


	def read_csv_file(path: str) -> str:
	    """Render a CSV file as plain text, one line per row.

	    Cells within a row are joined with ``" | "`` and rows are joined with
	    newlines. The file is decoded with ``utf-8-sig`` so a leading
	    byte-order mark does not end up glued to the first cell; undecodable
	    bytes are skipped. ``newline=""`` lets the ``csv`` module handle line
	    endings (including newlines embedded in quoted cells) itself.

	    Args:
	        path: Location of the CSV file.

	    Returns:
	        The flattened table; an empty string for an empty file.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    with open(path, "r", encoding="utf-8-sig", errors="ignore", newline="") as f:
	        # The separator is a plain " | ": a backslash before the pipe is
	        # not a valid Python escape and would show up literally in the text.
	        # csv.reader already yields every cell as str.
	        return "\n".join(" | ".join(row) for row in csv.reader(f))


	def read_pdf_file(path: str) -> str:
	    """Extract the text of a PDF, page by page, joined with newlines.

	    Extraction is best effort: a page whose ``extract_text`` call raises is
	    left out of the result, and a page that yields no text contributes an
	    empty string.

	    Args:
	        path: Location of the PDF file.

	    Returns:
	        The text of all successfully extracted pages, newline-separated.
	    """
	    pdf = PdfReader(path)
	    page_texts = []
	    for pg in pdf.pages:
	        try:
	            extracted = pg.extract_text()
	        except Exception:
	            # Deliberately tolerant: one unreadable page must not discard
	            # the text already gathered from the others.
	            continue
	        page_texts.append(extracted if extracted else "")
	    return "\n".join(page_texts)


	def read_docx_file(path: str) -> str:
	    """Return the non-blank paragraphs of a .docx file, one per line.

	    Only ``doc.paragraphs`` is read; paragraphs whose text is empty or
	    whitespace-only are dropped.

	    Args:
	        path: Location of the .docx file.

	    Returns:
	        The kept paragraph texts joined with newlines.
	    """
	    document = Document(path)
	    kept = []
	    for para in document.paragraphs:
	        text = para.text
	        if text.strip():
	            kept.append(text)
	    return "\n".join(kept)


	def extract_text_from_document(path: str) -> str:
	    """Read a document as text, choosing the reader from its extension.

	    The extension is compared case-insensitively. ``.txt`` and ``.md`` are
	    read as plain text; ``.json``, ``.csv``, ``.pdf`` and ``.docx`` each go
	    to their dedicated reader.

	    Args:
	        path: Location of the document.

	    Returns:
	        The text produced by the selected reader.

	    Raises:
	        ValueError: If the extension is not one of the handled types.
	    """
	    _, suffix = os.path.splitext(path)
	    suffix = suffix.lower()

	    # Select the reader first, then call it from a single return point.
	    if suffix == ".docx":
	        reader = read_docx_file
	    elif suffix == ".pdf":
	        reader = read_pdf_file
	    elif suffix == ".csv":
	        reader = read_csv_file
	    elif suffix == ".json":
	        reader = read_json_file
	    elif suffix in (".txt", ".md"):
	        reader = read_text_file
	    else:
	        raise ValueError(f"不支持的文档类型: {suffix}")
	    return reader(path)


	def build_document_bundle(doc_paths: Iterable[str]) -> str:
	    """Concatenate the text of several documents into one delimited string.

	    Each readable document becomes a block framed by
	    ``===== DOCUMENT START =====`` / ``===== DOCUMENT END =====`` markers
	    and carrying its file name, path and stripped content. Blocks are
	    separated by a blank line.

	    Entries are skipped when they are empty, do not point at a regular
	    file (missing paths and directories alike), or have an extension
	    outside ``SUPPORTED_DOC_EXTS``. If reading a document raises, the block
	    is still emitted, with a failure note in place of the content.

	    Args:
	        doc_paths: Paths of the documents to bundle. ``None`` is accepted
	            and treated like an empty collection.

	    Returns:
	        The joined blocks, or an empty string when nothing was bundled.
	    """
	    blocks = []
	    # Tolerate None so callers need not special-case "no documents".
	    for path in doc_paths or ():
	        # isfile (rather than exists) keeps directories from reaching the
	        # readers, where they would only yield a bogus failure block.
	        if not path or not os.path.isfile(path):
	            continue
	        ext = os.path.splitext(path)[1].lower()
	        if ext not in SUPPORTED_DOC_EXTS:
	            continue
	        try:
	            content = extract_text_from_document(path).strip()
	        except Exception as e:
	            # Keep the document in the bundle so the failure stays visible.
	            content = f"[文档读取失败] {type(e).__name__}: {e}"
	        blocks.append(
	            f"===== DOCUMENT START =====\n"
	            f"FILE_NAME: {os.path.basename(path)}\n"
	            f"FILE_PATH: {path}\n"
	            f"CONTENT:\n{content}\n"
	            f"===== DOCUMENT END ====="
	        )
	    return "\n\n".join(blocks)