Spaces:

dseditor
/

Docfixer

Sleeping

File size: 6,074 Bytes

accef7f
 
 
d035a8f
0b70552
 
 
accef7f
 
 
0b70552
 
 
 
 
 
 
d39278e
82e99be
d39278e
 
 
 
f06ed39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accef7f
 
 
 
af86850
accef7f
 
af86850
b14cac5
 
 
 
 
 
 
 
82e99be
 
 
accef7f
af86850
f06ed39
 
 
 
 
 
af86850
82e99be
 
 
af86850
82e99be
f06ed39
af86850
accef7f
 
f06ed39
 
 
accef7f
 
 
 
 
 
 
 
 
 
 
b14cac5
accef7f
 
82e99be
 
f06ed39
 
 
 
 
accef7f
 
82e99be
 
b14cac5
f06ed39
 
 
 
 
b14cac5
accef7f
 
 
 
b14cac5

import gradio as gr
import re
from docx import Document
from docx.shared import Cm, Pt
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
import tempfile
import os

def set_outline_level(paragraph, level: int = 0):
    """Set the outline level (``w:outlineLvl``) in a paragraph's properties.

    Args:
        paragraph: A python-docx paragraph; its underlying ``w:p`` element is
            modified in place.
        level: Value written (as a string) to the element's ``w:val``
            attribute. Defaults to 0.

    If the paragraph properties already contain a ``w:outlineLvl`` child, that
    element is updated instead of appending a second one, so repeated calls on
    the same paragraph leave exactly one element.
    """
    pPr = paragraph._p.get_or_add_pPr()
    outline = pPr.find(qn('w:outlineLvl'))
    if outline is None:
        outline = OxmlElement('w:outlineLvl')
        pPr.append(outline)
    outline.set(qn('w:val'), str(level))

def normalize_paragraph(text):
    """Flatten a paragraph's text onto a single, tidily spaced line.

    Every run of CR/LF characters becomes one space, any run of two or more
    whitespace characters is then collapsed to one space, and leading and
    trailing whitespace is removed from the result.
    """
    without_breaks = re.sub(r'[\r\n]+', ' ', text)
    single_spaced = re.sub(r'\s{2,}', ' ', without_breaks)
    return single_spaced.strip()

def process_paragraphs_with_cleanup(doc, combined_pattern):
    """Classify a document's paragraphs and squeeze runs of blank ones.

    Each paragraph's stripped text is labelled ``'heading'`` when
    ``combined_pattern`` matches anywhere in it, otherwise ``'paragraph'``.
    Blank paragraphs become ``('empty', '')``, but only the first of any
    consecutive run is kept (the "^p^p -> ^p" effect).

    Returns:
        A list of ``(kind, text)`` tuples in document order.
    """
    entries = []
    last_was_blank = False

    for paragraph in doc.paragraphs:
        stripped = paragraph.text.strip()

        if stripped == '':
            # Keep a blank paragraph only when the previous one was not blank.
            if not last_was_blank:
                entries.append(('empty', ''))
            last_was_blank = True
            continue

        last_was_blank = False
        label = 'heading' if re.search(combined_pattern, stripped) else 'paragraph'
        entries.append((label, stripped))

    return entries

_CJK_FONT_NAME = '新細明體'


def _set_run_font(run, font_name, size=None):
    """Apply ``font_name`` (and optionally ``size``) to a run, CJK text included."""
    run.font.name = font_name
    # Font.name fills only the Latin font slots of w:rFonts; Word picks the
    # font for Chinese characters from the w:eastAsia attribute, so set it too.
    rFonts = run._element.get_or_add_rPr().get_or_add_rFonts()
    rFonts.set(qn('w:eastAsia'), font_name)
    if size is not None:
        run.font.size = size


def rebuild_document(doc, content_list):
    """Append formatted paragraphs to ``doc`` from a classified content list.

    Args:
        doc: python-docx document that receives the new paragraphs.
        content_list: Iterable of ``(kind, text)`` tuples where ``kind`` is
            ``'heading'``, ``'paragraph'`` or ``'empty'``. Any other kind is
            ignored.

    Headings are added with the 'Heading 1' style, start on a new page, get
    outline level 0 and a 16 pt font. Body paragraphs are normalized with
    ``normalize_paragraph`` and given a 0.7 cm first-line indent. Both use
    single line spacing and the same CJK font.
    """
    for kind, text in content_list:
        if kind == 'heading':
            heading = doc.add_paragraph(text, style='Heading 1')
            fmt = heading.paragraph_format
            fmt.page_break_before = True
            fmt.space_before = Cm(0)
            fmt.space_after = Cm(0.3)
            fmt.line_spacing = 1.0
            fmt.left_indent = Cm(0)
            fmt.first_line_indent = Cm(0)
            set_outline_level(heading, 0)
            for run in heading.runs:
                _set_run_font(run, _CJK_FONT_NAME, Pt(16))
        elif kind == 'paragraph':
            para = doc.add_paragraph(normalize_paragraph(text))
            fmt = para.paragraph_format
            fmt.space_before = Cm(0)
            fmt.space_after = Cm(0)
            fmt.line_spacing = 1.0
            fmt.left_indent = Cm(0)
            fmt.first_line_indent = Cm(0.7)
            for run in para.runs:
                _set_run_font(run, _CJK_FONT_NAME)
        elif kind == 'empty':
            # Blank separator paragraph that the caller chose to keep.
            doc.add_paragraph('')

def format_docx(file, chapter_keywords, remove_empty_paragraphs=True):
    """Reformat an uploaded .docx file and save the result to a temp file.

    Paragraphs whose text contains "第<number><keyword>" are rebuilt as
    'Heading 1' chapter headings; all other text is rebuilt as body
    paragraphs. Consecutive blank paragraphs are collapsed to one, or dropped
    entirely when ``remove_empty_paragraphs`` is true.

    Args:
        file: The uploaded file: either an object whose ``name`` attribute is
            the path on disk, or the path itself.
        chapter_keywords: Chapter unit words separated by commas (half-width
            ``,`` or full-width ``，``), e.g. ``"章,節,話"``. Keywords are
            matched literally; blank entries are ignored.
        remove_empty_paragraphs: When true, every blank paragraph is removed;
            when false, a single blank paragraph is kept per run.

    Returns:
        ``(output_path, status_message)``. ``output_path`` is ``None`` when
        the input is missing or processing fails; the message then explains
        why.
    """
    if file is None:
        return None, "請上傳一個 Word 文件"
    if not chapter_keywords or not chapter_keywords.strip():
        return None, "請輸入章節分段方式（例如：章,節,話）"

    # Split on both comma forms and drop blanks: a blank keyword would yield
    # a pattern that treats any "第N" occurrence as a chapter heading.
    keywords = [k.strip() for k in re.split(r'[,，]', chapter_keywords)]
    keywords = [k for k in keywords if k]
    if not keywords:
        return None, "請輸入章節分段方式（例如：章,節,話）"

    # Chapter heading pattern. Keywords are user input, so escape them to
    # keep regex metacharacters from changing or breaking the pattern.
    numerals = '0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm'
    combined_pattern = '|'.join(
        rf'第\s*[{numerals}]+\s*{re.escape(k)}' for k in keywords
    )

    try:
        # NOTE(review): depending on the Gradio version/settings the File
        # component may pass a plain path string instead of an object with
        # ``.name`` - confirm; both forms are accepted here.
        source_path = getattr(file, 'name', file)
        doc = Document(source_path)

        # Make sure a 'Heading 1' style exists.
        styles = doc.styles
        if 'Heading 1' not in styles:
            heading_style = styles.add_style('Heading 1', WD_STYLE_TYPE.PARAGRAPH)
            heading_style.base_style = styles['Normal']
            heading_style.font.bold = True
            heading_style.font.size = Pt(16)

        # Classify paragraphs and collapse runs of blank ones.
        content_list = process_paragraphs_with_cleanup(doc, combined_pattern)

        # Optionally drop the remaining blank paragraphs as well.
        if remove_empty_paragraphs:
            content_list = [item for item in content_list if item[0] != 'empty']

        # Remove the original paragraphs before rebuilding.
        for p in doc.paragraphs:
            p._element.getparent().remove(p._element)

        rebuild_document(doc, content_list)

        # Reserve the output file atomically (mktemp only returns a name and
        # is deprecated), close it, then let python-docx write to that path.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            output_path = tmp.name
        doc.save(output_path)

        empty_status = "已移除所有空段落" if remove_empty_paragraphs else "保留單個空段落"
        return output_path, f"✅ 處理完成！找到章節關鍵字：{', '.join(keywords)}，{empty_status}"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"❌ 處理失敗：{str(e)}"

def create_interface():
    """Build and return the Gradio Blocks UI for the formatter.

    The layout is a centered HTML header followed by one row with two
    columns: the left column holds the file upload, keyword textbox,
    "remove empty paragraphs" checkbox and the process button; the right
    column holds the status textbox and the download slot. Clicking the
    button calls ``format_docx`` with the three inputs and routes its two
    return values to the download and status components.
    """
    header_markup = """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>📄 Word 文件格式化工具</h1>
            <p>自動格式化您的 Word 文件，設定章節樣式和分頁</p>
        </div>
        """

    with gr.Blocks(title="Word 文件格式化工具", theme=gr.themes.Soft()) as app:
        gr.HTML(header_markup)

        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column(scale=1):
                upload = gr.File(
                    label="上傳 Word 文件 (.docx)",
                    file_types=[".docx"],
                    file_count="single",
                )
                keywords_box = gr.Textbox(
                    label="章節分段方式",
                    placeholder="章,節,話",
                    value="章,節,話",
                )
                drop_blank_toggle = gr.Checkbox(
                    label="移除空段落",
                    value=True,
                    info="勾選時會移除所有空段落，取消勾選時會保留單個空段落（^p^p -> ^p）",
                )
                run_button = gr.Button("🔄 開始處理", variant="primary", size="lg")

            # Right column: results.
            with gr.Column(scale=1):
                status_box = gr.Textbox(label="處理狀態", interactive=False, lines=3)
                result_file = gr.File(label="下載處理後的文件", interactive=False)

        handler_inputs = [upload, keywords_box, drop_blank_toggle]
        handler_outputs = [result_file, status_box]
        run_button.click(fn=format_docx, inputs=handler_inputs, outputs=handler_outputs)

    return app

if __name__ == "__main__":
    # Script entry point: build the UI, then launch it bound to 0.0.0.0:7860
    # with Gradio's share option enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)