|
|
import os
import re
import traceback

import docx
from PyPDF2 import PdfReader
|
|
|
|
|
class NovelOutlineParser:
    """Extract plain text and a list of headings from an outline file.

    Supported inputs are ``.pdf`` (text only, no heading extraction),
    ``.docx`` (paragraphs whose style name starts with "heading") and
    ``.txt`` (Markdown ``#`` headings and Chinese chapter markers such as
    ``第一章``). Progress and failures are reported with ``print``.
    """

    # Markdown ATX heading: one or more '#', whitespace, then the title.
    _MARKDOWN_HEADING_RE = re.compile(r"^(#+)\s+(.*)")

    # Chinese chapter marker ("第" + numeral + unit) with an optional title.
    # The `\s*` before the title group lets "第一章 标题" (space-separated,
    # the most common layout) match as well as "第一章标题".
    _CHINESE_CHAPTER_RE = re.compile(
        r"^\s*(第\s*[一二三四五六七八九十百千万零〇\d]+\s*[章回卷节部篇])\s*([^ ].*)?$"
    )

    # Tried in order. "utf-8-sig" also reads plain UTF-8 but drops a leading
    # BOM; "gb18030" is a superset of GBK/GB2312, so it is a useful fallback.
    _TXT_ENCODINGS = ("utf-8-sig", "gbk", "gb18030")

    def parse(self, file_path):
        """Parse a PDF, DOCX, or TXT file.

        Args:
            file_path: Path of the file to parse (``str`` or ``os.PathLike``).
                The type is chosen from the case-insensitive file extension.

        Returns:
            A dict ``{"text": ..., "headings": ..., "error": ...}`` where
            ``text`` is the extracted plain text ("" on failure),
            ``headings`` is the list of extracted headings, and ``error`` is
            ``None`` on success or a message describing the failure.
            Exceptions are caught and reported through ``error``.
        """
        text = None
        headings = []
        error = None
        try:
            if not file_path:
                return {"text": "", "headings": [], "error": "没有提供文件路径"}

            # Accept pathlib.Path and other path-like objects.
            file_path = os.fspath(file_path)
            print(f"ℹ️ 开始解析文件: {file_path}")
            file_path_lower = file_path.lower()

            if file_path_lower.endswith('.pdf'):
                text = self._parse_pdf(file_path)
                headings = ["(PDF暂不支持提取标题)"]
                # Only announce success when text was actually extracted.
                if text is not None:
                    print("✅ PDF 文件解析完成 (未提取标题)。")
            elif file_path_lower.endswith('.docx'):
                text, headings = self._parse_docx(file_path)
                if text is not None:
                    print(f"✅ DOCX 文件解析完成 (提取到 {len(headings)} 个标题)。")
            elif file_path_lower.endswith('.txt'):
                text, headings = self._parse_txt(file_path)
                if text is not None:
                    print(f"✅ TXT 文件解析完成 (尝试提取了 {len(headings)} 个标题)。")
            else:
                error = "不支持的文件类型,请上传 .pdf, .docx 或 .txt 文件。"
                print(f"🚨 {error}")

            # A helper returning None means it failed and already logged why.
            if text is None and error is None:
                error = f"未能从文件中提取文本内容 ({file_path})"
                print(f"🚨 {error}")

        except Exception as e:
            # Top-level boundary: report the failure in the result dict
            # instead of propagating it to the caller.
            error = f"解析文件时发生意外错误: {str(e)}"
            print(f"🚨 {error}")
            traceback.print_exc()

        return {"text": text or "", "headings": headings or [], "error": error}

    def _parse_pdf(self, path):
        """Return the text of all pages joined by newlines, or None on failure.

        Encrypted files are tried with an empty password; if that does not
        unlock the document, None is returned.
        """
        text_content = []
        try:
            with open(path, 'rb') as f:
                reader = PdfReader(f)
                if reader.is_encrypted:
                    # decrypt() reports a wrong password through a falsy
                    # return value, so the result must be checked; an
                    # exception is treated the same way.
                    try:
                        unlocked = reader.decrypt('')
                    except Exception:
                        unlocked = False
                    if not unlocked:
                        print(f"🚨 PDF '{path}' 需要密码。")
                        return None
                for page in reader.pages:
                    extracted = page.extract_text()
                    if extracted:
                        text_content.append(extracted)
            return "\n".join(text_content)
        except Exception as e:
            print(f"🚨 解析 PDF '{path}' 出错: {e}")
            return None

    def _parse_docx(self, path):
        """Parse a DOCX file into ``(text, headings)``.

        Paragraphs whose style name starts with "heading" (case-insensitive)
        are collected as headings and written into the text as
        ``### <heading>`` lines; other non-blank paragraphs are kept as-is.
        Returns ``(None, [])`` if the document cannot be read.
        """
        paragraphs_text = []
        headings_list = []
        try:
            doc = docx.Document(path)
            for para in doc.paragraphs:
                stripped = para.text.strip()
                if not stripped:
                    continue
                # A style, or its name, may be missing; treat that as
                # "not a heading" rather than failing the whole document.
                style_name = (para.style.name or "") if para.style else ""
                if style_name.lower().startswith('heading'):
                    headings_list.append(stripped)
                    paragraphs_text.append(f"\n### {stripped}\n")
                else:
                    paragraphs_text.append(para.text)

            full_text = "\n".join(paragraphs_text)
            return full_text, headings_list
        except Exception as e:
            print(f"🚨 解析 DOCX 文件 '{path}' 时出错: {e}")
            return None, []

    def _read_txt(self, path):
        """Return the decoded content of *path*, or None if it cannot be read."""
        for enc in self._TXT_ENCODINGS:
            try:
                with open(path, 'r', encoding=enc) as f:
                    content = f.read()
            except UnicodeDecodeError:
                print(f" 尝试编码 {enc} 失败...")
            except Exception as read_error:
                # Not a decoding problem (e.g. missing file), so trying
                # another encoding cannot help.
                print(f" 使用编码 {enc} 读取时出错: {read_error}")
                break
            else:
                print(f" 成功使用编码 {enc} 读取 TXT 文件。")
                return content
        return None

    def _parse_txt(self, path):
        """Parse a TXT file into ``(text, headings)``.

        A line is a heading when it is a Markdown heading (``# title``) or
        starts with a Chinese chapter marker (``第…章/回/卷/节/部/篇``).
        Markdown headings are kept verbatim in the text; chapter-marker
        lines are rewritten as ``### <line>``. Both are surrounded by blank
        lines. Returns ``(None, [])`` if the file cannot be decoded.
        """
        content = self._read_txt(path)
        if content is None:
            print(f"🚨 无法解码 TXT 文件 '{path}'。")
            return None, []

        headings_list = []
        full_text_lines = []
        for line in content.splitlines():
            line_stripped = line.strip()

            markdown_match = self._MARKDOWN_HEADING_RE.match(line_stripped)
            markdown_title = markdown_match.group(2).strip() if markdown_match else ""

            if markdown_title:
                headings_list.append(markdown_title)
                full_text_lines.append(f"\n{line_stripped}\n")
            elif self._CHINESE_CHAPTER_RE.match(line_stripped):
                headings_list.append(line_stripped)
                full_text_lines.append(f"\n### {line_stripped}\n")
            else:
                # Ordinary lines keep their original leading/trailing spaces.
                full_text_lines.append(line)

        full_text = "\n".join(full_text_lines)
        return full_text, headings_list
|
|
|
|
|
# Module-level statement: emits a completion notice every time this module
# is executed or imported.
# NOTE(review): looks like a leftover from the script that generated this
# file - confirm whether it should still run on import.
print(
    "✅ `document_parser.py` (支持提取标题) 文件创建完成。"
)
|
|
|