Spaces:

maickzhong
/

AI-Writing-Creation-Assistant

Sleeping

File size: 7,391 Bytes

73e6793

from PyPDF2 import PdfReader
import docx
import traceback
import re # 导入正则表达式库

class NovelOutlineParser:
    """Extract plain text and a list of headings from PDF, DOCX or TXT files.

    The public entry point is :meth:`parse`; the ``_parse_*`` helpers each
    handle one file format and signal failure by returning ``None`` as text.
    """

    # Encodings tried in order for TXT files. 'utf-8-sig' decodes plain UTF-8
    # as well as UTF-8 with a BOM (stripping the BOM); 'gb18030' is a superset
    # of 'gbk', so it is a meaningful last fallback.
    _TXT_ENCODINGS = ('utf-8-sig', 'gbk', 'gb18030')

    # Markdown heading: one or more '#', whitespace, then the heading text.
    _MD_HEADING_RE = re.compile(r"^#+\s+(.*)")

    # Chinese chapter marker at line start, e.g. "第一章", "第 12 回", optionally
    # followed by a title in any form ("第一章 开端", "第一章：开端", "第一章开端").
    _CHAPTER_RE = re.compile(
        r"^第\s*[一二三四五六七八九十百千万零〇\d]+\s*[章回卷节部篇]"
    )

    def parse(self, file_path):
        """Parse a PDF, DOCX or TXT file chosen by its extension.

        Args:
            file_path: Path string of the file to parse; the extension is
                matched case-insensitively.

        Returns:
            A dict ``{"text": str, "headings": list, "error": str or None}``.
            ``text`` is ``""`` and ``error`` is set when nothing could be
            extracted. This method never raises: unexpected exceptions are
            printed and reported through ``error``.
        """
        if not file_path:
            return {"text": "", "headings": [], "error": "没有提供文件路径"}

        text = None
        headings = []
        error = None
        try:
            print(f"ℹ️ 开始解析文件: {file_path}")
            file_path_lower = file_path.lower()

            if file_path_lower.endswith('.pdf'):
                # Heading extraction is not implemented for PDF; return a
                # placeholder entry so callers can show that to the user.
                text = self._parse_pdf(file_path)
                headings = ["(PDF暂不支持提取标题)"]
                print("✅ PDF 文件解析完成 (未提取标题)。")
            elif file_path_lower.endswith('.docx'):
                text, headings = self._parse_docx(file_path)
                print(f"✅ DOCX 文件解析完成 (提取到 {len(headings)} 个标题)。")
            elif file_path_lower.endswith('.txt'):
                text, headings = self._parse_txt(file_path)
                print(f"✅ TXT 文件解析完成 (尝试提取了 {len(headings)} 个标题)。")
            else:
                error = "不支持的文件类型，请上传 .pdf, .docx 或 .txt 文件。"
                print(f"🚨 {error}")

            # A helper returning None means it failed and already printed why.
            if text is None and error is None:
                error = f"未能从文件中提取文本内容 ({file_path})"
                print(f"🚨 {error}")

        except Exception as e:
            # Top-level boundary: report the failure instead of propagating.
            error = f"解析文件时发生意外错误: {str(e)}"
            print(f"🚨 {error}")
            traceback.print_exc()

        return {"text": text or "", "headings": headings or [], "error": error}

    def _parse_pdf(self, path):
        """Return the concatenated text of all pages of a PDF, or None on failure.

        Encrypted PDFs are tried with an empty password; if that does not
        unlock the document, None is returned.
        """
        text_content = []
        try:
            with open(path, 'rb') as f:
                reader = PdfReader(f)
                # decrypt() reports a wrong password through a falsy return
                # value rather than an exception, so the result must be checked.
                if reader.is_encrypted and not reader.decrypt(''):
                    print(f"🚨 PDF '{path}' 需要密码。")
                    return None
                for page in reader.pages:
                    extracted = page.extract_text()
                    if extracted:
                        text_content.append(extracted)
            return "\n".join(text_content)
        except Exception as e:
            print(f"🚨 解析 PDF '{path}' 出错: {e}")
            return None

    def _parse_docx(self, path):
        """Parse a DOCX file into text and headings.

        Non-empty paragraphs whose style name starts with "heading"
        (case-insensitive) are collected as headings and written into the text
        as Markdown ``###`` lines. Other non-empty paragraphs are kept as-is;
        empty paragraphs are dropped.

        Returns:
            ``(text, headings)``, or ``(None, [])`` if the file cannot be read.
        """
        paragraphs_text = []
        headings_list = []
        try:
            doc = docx.Document(path)
            for para in doc.paragraphs:
                stripped = para.text.strip()
                if not stripped:
                    continue
                # Guard against a missing style or a style without a name.
                style_name = (para.style.name if para.style else None) or ""
                if style_name.lower().startswith('heading'):
                    headings_list.append(stripped)
                    # Mark the heading in the text so context stays readable.
                    paragraphs_text.append(f"\n### {stripped}\n")
                else:
                    paragraphs_text.append(para.text)
            # Paragraphs are simply joined with newlines; no other formatting
            # from the document is preserved.
            return "\n".join(paragraphs_text), headings_list
        except Exception as e:
            print(f"🚨 解析 DOCX 文件 '{path}' 时出错: {e}")
            return None, []

    def _parse_txt(self, path):
        """Parse a TXT file into text and headings.

        The file is decoded with the first encoding in ``_TXT_ENCODINGS`` that
        works. A line counts as a heading when it is either a Markdown heading
        (``# Title``) or starts with a Chinese chapter marker (``第一章 ...``).
        Markdown headings are kept verbatim; chapter lines are rewritten as
        ``### <line>``. All other lines (including blank ones) are preserved.

        Returns:
            ``(text, headings)``, or ``(None, [])`` if the file cannot be read
            or decoded.
        """
        content = None
        for enc in self._TXT_ENCODINGS:
            try:
                with open(path, 'r', encoding=enc) as f:
                    content = f.read()
            except UnicodeDecodeError:
                print(f"  尝试编码 {enc} 失败...")
            except OSError as read_error:
                # Missing file, permissions, etc. are not encoding problems, so
                # trying the remaining encodings would fail the same way.
                print(f"  使用编码 {enc} 读取时出错: {read_error}")
                return None, []
            else:
                print(f"  成功使用编码 {enc} 读取 TXT 文件。")
                break
        if content is None:
            print(f"🚨 无法解码 TXT 文件 '{path}'。")
            return None, []

        headings_list = []
        full_text_lines = []
        for line in content.splitlines():
            line_stripped = line.strip()

            # Rule 1: Markdown heading (#, ##, ###, ...) with non-empty text.
            md_match = self._MD_HEADING_RE.match(line_stripped)
            if md_match and md_match.group(1).strip():
                headings_list.append(md_match.group(1).strip())
                full_text_lines.append(f"\n{line_stripped}\n")
                continue

            # Rule 2: Chinese chapter heading; the whole line is the heading.
            if self._CHAPTER_RE.match(line_stripped):
                headings_list.append(line_stripped)
                full_text_lines.append(f"\n### {line_stripped}\n")
                continue

            # NOTE: a "short stand-alone line surrounded by blank lines" rule
            # is deliberately not applied; it is too prone to false positives.

            # Ordinary line: keep the original text, blank lines included.
            full_text_lines.append(line)

        return "\n".join(full_text_lines), headings_list

# Module-level status message: this runs every time the module is imported.
# NOTE(review): looks like a leftover from generating this file (e.g. from a
# notebook cell) — confirm whether the import-time output is still wanted.
print("✅ `document_parser.py` (支持提取标题) 文件创建完成。")