Spaces:

maickzhong
/

AI-Writing-Creation-Assistant

Sleeping

App Files Files Community

AI-Writing-Creation-Assistant / document_parser.py

maickzhong

Upload 8 files

73e6793 verified 9 months ago

raw

history blame contribute delete

7.39 kB

	from PyPDF2 import PdfReader
	import docx
	import traceback
	import re # 导入正则表达式库

	class NovelOutlineParser:
	    """Extract plain text and a list of headings from PDF, DOCX or TXT files.

	    The public entry point is :meth:`parse`; it never raises and always
	    returns a dict of the form
	    ``{"text": str, "headings": list[str], "error": str | None}``.
	    """

	    # Encodings tried, in order, when reading a TXT file.
	    _TXT_ENCODINGS = ('utf-8', 'gbk', 'gb2312')

	    # Markdown heading: one or more '#', whitespace, then the title.
	    _MD_HEADING_RE = re.compile(r"^(#+)\s+(.*)")

	    # Chinese chapter heading such as "第一章", "第十章 xxx" or "第2回：xxx".
	    # The pattern is applied to an already-stripped line, so it must not
	    # demand leading whitespace. After the chapter marker the line either
	    # ends or continues with a separator (whitespace / punctuation) and a
	    # title, which keeps ordinary sentences like "第一部分的内容" from being
	    # mistaken for headings.
	    _CN_CHAPTER_RE = re.compile(
	        r"^第\s*[一二三四五六七八九十百千万零〇\d]+\s*[章回卷节部篇]"
	        r"(?:[\s:：、.．\-—].*)?$"
	    )

	    def parse(self, file_path):
	        """Parse a PDF, DOCX or TXT file.

	        Args:
	            file_path: Path to the file, as ``str`` or ``os.PathLike``. The
	                file type is chosen from the (case-insensitive) extension.

	        Returns:
	            dict with keys ``"text"`` (extracted text, ``""`` on failure),
	            ``"headings"`` (list of heading strings; a single placeholder
	            entry for PDFs) and ``"error"`` (message string or ``None``).
	        """
	        import os  # local import: only needed to normalise path-like input

	        text = None
	        headings = []
	        error = None
	        try:
	            if not file_path:
	                return {"text": "", "headings": [], "error": "没有提供文件路径"}

	            # Accept pathlib.Path and other path-like objects as well as str.
	            file_path = os.fspath(file_path)
	            print(f"ℹ️ 开始解析文件: {file_path}")
	            file_path_lower = file_path.lower()

	            # Success messages are only printed when the helper actually
	            # produced text; failures are reported once, further below.
	            if file_path_lower.endswith('.pdf'):
	                # Heading extraction is not implemented for PDF; text only.
	                text = self._parse_pdf(file_path)
	                headings = ["(PDF暂不支持提取标题)"]
	                if text is not None:
	                    print("✅ PDF 文件解析完成 (未提取标题)。")
	            elif file_path_lower.endswith('.docx'):
	                text, headings = self._parse_docx(file_path)
	                if text is not None:
	                    print(f"✅ DOCX 文件解析完成 (提取到 {len(headings)} 个标题)。")
	            elif file_path_lower.endswith('.txt'):
	                text, headings = self._parse_txt(file_path)
	                if text is not None:
	                    print(f"✅ TXT 文件解析完成 (尝试提取了 {len(headings)} 个标题)。")
	            else:
	                error = "不支持的文件类型，请上传 .pdf, .docx 或 .txt 文件。"
	                print(f"🚨 {error}")

	            # A helper signals failure by returning None for the text.
	            if text is None and error is None:
	                error = f"未能从文件中提取文本内容 ({file_path})"
	                print(f"🚨 {error}")

	        except Exception as e:
	            # Top-level boundary: report instead of propagating to the caller.
	            error = f"解析文件时发生意外错误: {str(e)}"
	            print(f"🚨 {error}")
	            traceback.print_exc()

	        return {"text": text or "", "headings": headings or [], "error": error}

	    def _parse_pdf(self, path):
	        """Return the text of all pages joined by newlines, or None on failure."""
	        pages_text = []
	        try:
	            with open(path, 'rb') as f:
	                reader = PdfReader(f)
	                if reader.is_encrypted:
	                    # Try the empty password. decrypt() reports failure via a
	                    # falsy return value rather than by raising, so check the
	                    # result as well as guarding against exceptions.
	                    try:
	                        unlocked = reader.decrypt('')
	                    except Exception:
	                        unlocked = False
	                    if not unlocked:
	                        print(f"🚨 PDF '{path}' 需要密码。")
	                        return None
	                # Pages are read while the file is still open.
	                for page in reader.pages:
	                    extracted = page.extract_text()
	                    if extracted:
	                        pages_text.append(extracted)
	            return "\n".join(pages_text)
	        except Exception as e:
	            print(f"🚨 解析 PDF '{path}' 出错: {e}")
	            return None

	    def _parse_docx(self, path):
	        """Parse a DOCX file.

	        Returns:
	            ``(text, headings)``; ``(None, [])`` if the document cannot be read.
	            Paragraphs whose style name starts with "heading" are collected as
	            headings and rendered in the text as Markdown ``###`` lines.
	        """
	        paragraphs_text = []
	        headings_list = []
	        try:
	            doc = docx.Document(path)
	            for para in doc.paragraphs:
	                stripped = para.text.strip()
	                if not stripped:
	                    continue  # skip empty paragraphs
	                # Guard against a style without a name so one odd paragraph
	                # does not abort the whole document.
	                style_name = (para.style.name or "") if para.style else ""
	                if style_name.lower().startswith('heading'):
	                    headings_list.append(stripped)
	                    # Mark the heading in the text too, to keep context readable.
	                    paragraphs_text.append(f"\n### {stripped}\n")
	                else:
	                    paragraphs_text.append(para.text)
	            # Paragraphs are simply joined with newlines; no richer formatting
	            # is preserved.
	            return "\n".join(paragraphs_text), headings_list
	        except Exception as e:
	            print(f"🚨 解析 DOCX 文件 '{path}' 时出错: {e}")
	            return None, []

	    def _parse_txt(self, path):
	        """Parse a TXT file and try to recognise headings.

	        Returns:
	            ``(text, headings)``; ``(None, [])`` if the file cannot be read or
	            decoded with any of the candidate encodings.
	        """
	        content = None

	        # --- Read the file, trying each candidate encoding in turn ---
	        for enc in self._TXT_ENCODINGS:
	            try:
	                with open(path, 'r', encoding=enc) as f:
	                    content = f.read()
	            except UnicodeDecodeError:
	                print(f" 尝试编码 {enc} 失败...")
	            except (OSError, ValueError) as read_error:
	                # Not an encoding problem (missing file, permissions, bad
	                # path): another encoding cannot help, so stop retrying.
	                print(f" 使用编码 {enc} 读取时出错: {read_error}")
	                break
	            else:
	                print(f" 成功使用编码 {enc} 读取 TXT 文件。")
	                break
	        if content is None:
	            print(f"🚨 无法解码 TXT 文件 '{path}'。")
	            return None, []

	        # Drop a UTF-8 byte-order mark; otherwise it sticks to the first line
	        # and hides a heading there.
	        if content.startswith('\ufeff'):
	            content = content[1:]

	        # --- Recognise headings line by line ---
	        headings_list = []
	        full_text_lines = []
	        for line in content.splitlines():
	            line_stripped = line.strip()

	            # Rule 1: Markdown heading (#, ##, ###, ...) with a non-empty title.
	            md_match = self._MD_HEADING_RE.match(line_stripped)
	            heading_text = md_match.group(2).strip() if md_match else ""
	            if heading_text:
	                headings_list.append(heading_text)
	                full_text_lines.append(f"\n{line_stripped}\n")  # keep Markdown form
	                continue

	            # Rule 2: Chinese chapter heading (e.g. "第一章 xxx", "第十章 xxx").
	            if self._CN_CHAPTER_RE.match(line_stripped):
	                headings_list.append(line_stripped)
	                full_text_lines.append(f"\n### {line_stripped}\n")  # add Markdown mark
	                continue

	            # NOTE: a third heuristic (a short stand-alone line surrounded by
	            # blank lines) is deliberately not applied; it is too prone to
	            # false positives.

	            # Ordinary line: keep it verbatim (including blank lines).
	            full_text_lines.append(line)

	        return "\n".join(full_text_lines), headings_list

	# NOTE(review): indentation was lost in this copy of the file. This print
	# appears to be a top-level statement, in which case it runs every time the
	# module is imported — confirm whether it is leftover from a script/notebook
	# cell that generated the file.
	print("✅ `document_parser.py` (支持提取标题) 文件创建完成。")