File size: 6,074 Bytes
accef7f d035a8f 0b70552 accef7f 0b70552 d39278e 82e99be d39278e f06ed39 accef7f af86850 accef7f af86850 b14cac5 82e99be accef7f af86850 f06ed39 af86850 82e99be af86850 82e99be f06ed39 af86850 accef7f f06ed39 accef7f b14cac5 accef7f 82e99be f06ed39 accef7f 82e99be b14cac5 f06ed39 b14cac5 accef7f b14cac5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import gradio as gr
import re
from docx import Document
from docx.shared import Cm, Pt
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
import tempfile
import os
def set_outline_level(paragraph, level: int = 0):
    """Give a paragraph exactly one ``w:outlineLvl`` element with the given level.

    Args:
        paragraph: Object exposing its underlying ``w:p`` element as ``_p``
            (a python-docx paragraph in this file's usage).
        level: Outline level written to the element's ``w:val`` attribute.
    """
    pPr = paragraph._p.get_or_add_pPr()

    # Remove any outline level that is already there so that calling this
    # twice (or on a paragraph that already carries one) never leaves
    # duplicate elements behind.
    for existing in pPr.findall(qn('w:outlineLvl')):
        pPr.remove(existing)

    outline = OxmlElement('w:outlineLvl')
    outline.set(qn('w:val'), str(level))

    # In the w:pPr child sequence, w:outlineLvl has to come before these
    # elements; insert in front of the first one present, otherwise append.
    later_tags = {
        qn(tag)
        for tag in ('w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', 'w:pPrChange')
    }
    successor = next((child for child in pPr if child.tag in later_tags), None)
    if successor is None:
        pPr.append(outline)
    else:
        successor.addprevious(outline)
# Compiled once at import time; both are used on every body paragraph.
_LINE_BREAK_RUN = re.compile(r'[\r\n]+')
_WHITESPACE_RUN = re.compile(r'\s{2,}')


def normalize_paragraph(text):
    """Collapse in-paragraph line breaks and repeated whitespace.

    Each run of CR/LF characters becomes a single space, then each run of two
    or more whitespace characters becomes a single space, and finally the
    result is stripped. A lone tab (or other single whitespace character)
    between words is left as it is.

    Args:
        text: Paragraph text; ``None`` or an empty string is accepted.

    Returns:
        The normalized text, or ``''`` when *text* is falsy.
    """
    if not text:
        return ''
    without_breaks = _LINE_BREAK_RUN.sub(' ', text)
    return _WHITESPACE_RUN.sub(' ', without_breaks).strip()
def process_paragraphs_with_cleanup(doc, combined_pattern):
    """Classify a document's paragraphs and collapse runs of empty ones.

    Consecutive empty paragraphs are reduced to a single entry (the
    "^p^p -> ^p" effect of a Word find/replace).

    Args:
        doc: Object whose ``paragraphs`` attribute is an iterable of items
            with a ``text`` attribute.
        combined_pattern: Regular expression identifying chapter headings; a
            paragraph is a heading when the pattern is found anywhere in its
            stripped text.

    Returns:
        A list of ``(kind, text)`` tuples in document order, where ``kind`` is
        ``'empty'``, ``'heading'`` or ``'paragraph'`` and ``text`` is the
        stripped paragraph text (``''`` for empty entries).
    """
    # Compile once instead of resolving the pattern for every paragraph.
    heading_re = re.compile(combined_pattern)

    content_list = []
    prev_empty = False
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            # Keep an empty paragraph only if the previous one was not empty.
            if not prev_empty:
                content_list.append(('empty', ''))
            prev_empty = True
            continue

        prev_empty = False
        kind = 'heading' if heading_re.search(text) else 'paragraph'
        content_list.append((kind, text))
    return content_list
def _set_run_font(run, font_name, size=None):
    """Apply a font name (and optionally a size) to a run, including CJK text."""
    run.font.name = font_name
    # Setting font.name alone does not fill the East Asian font slot of
    # w:rFonts, which is the one used for Chinese characters; set it
    # explicitly so the requested typeface is applied to the actual text.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
    if size is not None:
        run.font.size = size


def rebuild_document(doc, content_list):
    """Append formatted paragraphs to a document from classified content.

    Args:
        doc: Document-like object providing ``add_paragraph``.
        content_list: Iterable of ``(kind, text)`` tuples. ``'heading'``
            entries become 'Heading 1' paragraphs that start on a new page,
            ``'paragraph'`` entries become normalized body paragraphs with a
            first-line indent, and ``'empty'`` entries become empty
            paragraphs. Entries of any other kind are ignored.
    """
    font_name = '新細明體'

    for kind, text in content_list:
        if kind == 'heading':
            heading = doc.add_paragraph(text, style='Heading 1')
            fmt = heading.paragraph_format
            fmt.page_break_before = True
            fmt.space_before = Cm(0)
            fmt.space_after = Cm(0.3)
            fmt.line_spacing = 1.0
            fmt.left_indent = Cm(0)
            fmt.first_line_indent = Cm(0)
            set_outline_level(heading, 0)
            for run in heading.runs:
                _set_run_font(run, font_name, Pt(16))
        elif kind == 'paragraph':
            para = doc.add_paragraph(normalize_paragraph(text))
            fmt = para.paragraph_format
            fmt.space_before = Cm(0)
            fmt.space_after = Cm(0)
            fmt.line_spacing = 1.0
            fmt.left_indent = Cm(0)
            fmt.first_line_indent = Cm(0.7)
            for run in para.runs:
                _set_run_font(run, font_name)
        elif kind == 'empty':
            # Only reached when the caller chose to keep empty paragraphs.
            doc.add_paragraph('')
def _parse_chapter_keywords(chapter_keywords):
    """Split the user's keyword string into a list of non-empty keywords."""
    if not chapter_keywords:
        return []
    # Accept the ASCII comma as well as the full-width and enumeration commas
    # that a Chinese input method produces.
    parts = re.split(r'[,,、]', chapter_keywords)
    return [part.strip() for part in parts if part.strip()]


def _build_chapter_pattern(keywords):
    """Build one regex matching '第 <numeral> <keyword>' for any keyword."""
    # Arabic (half- and full-width), Chinese (including 零/〇/兩, needed for
    # numbers such as 一百零一) and Roman numerals.
    numerals = (
        '0-90-9零〇兩'
        '一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm'
    )
    # Keywords are escaped so that regex metacharacters typed by the user are
    # matched literally instead of breaking or altering the pattern.
    return '|'.join(
        rf'第\s*[{numerals}]+\s*{re.escape(keyword)}' for keyword in keywords
    )


def format_docx(file, chapter_keywords, remove_empty_paragraphs=True):
    """Reformat an uploaded .docx file and save the result to a temp file.

    Paragraphs matching a chapter pattern become 'Heading 1' paragraphs;
    everything else is rebuilt as body text. Any failure while processing is
    reported through the status message rather than raised.

    Args:
        file: The uploaded file, either a filesystem path or an object whose
            ``name`` attribute is the path; ``None`` when nothing was uploaded.
        chapter_keywords: Separator-delimited chapter keywords, e.g. "章,節,話".
        remove_empty_paragraphs: When true, every empty paragraph is dropped;
            otherwise runs of empty paragraphs are kept as a single one.

    Returns:
        A ``(output_path, status_message)`` tuple; ``output_path`` is ``None``
        when the input is invalid or processing failed.
    """
    if file is None:
        return None, "請上傳一個 Word 文件"

    keywords = _parse_chapter_keywords(chapter_keywords)
    if not keywords:
        return None, "請輸入章節分段方式(例如:章,節,話)"

    try:
        # The upload may be handed over as a plain path or as a file wrapper.
        source_path = file if isinstance(file, (str, os.PathLike)) else file.name
        doc = Document(source_path)

        # Make sure a 'Heading 1' style exists.
        styles = doc.styles
        if 'Heading 1' not in styles:
            heading_style = styles.add_style('Heading 1', WD_STYLE_TYPE.PARAGRAPH)
            heading_style.base_style = styles['Normal']
            heading_style.font.bold = True
            heading_style.font.size = Pt(16)

        combined_pattern = _build_chapter_pattern(keywords)

        # Classify paragraphs and collapse runs of empty ones.
        content_list = process_paragraphs_with_cleanup(doc, combined_pattern)
        if remove_empty_paragraphs:
            content_list = [item for item in content_list if item[0] != 'empty']

        # Remove the original paragraphs, then rebuild from the classified list.
        for p in doc.paragraphs:
            p._element.getparent().remove(p._element)
        rebuild_document(doc, content_list)

        # mkstemp creates the file atomically, unlike the deprecated mktemp;
        # the descriptor is closed right away because doc.save reopens by path.
        fd, output_path = tempfile.mkstemp(suffix='.docx')
        os.close(fd)
        doc.save(output_path)

        empty_status = "已移除所有空段落" if remove_empty_paragraphs else "保留單個空段落"
        return output_path, f"✅ 處理完成!找到章節關鍵字:{', '.join(keywords)},{empty_status}"
    except Exception as e:
        # UI boundary: surface the failure in the status box.
        return None, f"❌ 處理失敗:{e}"
def create_interface():
    """Build the Gradio Blocks UI for the Word formatting tool.

    The left column holds the inputs (file upload, chapter keywords, the
    empty-paragraph checkbox and the run button); the right column shows the
    status text and the downloadable result. The button is wired to
    ``format_docx``.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    header_html = (
        "\n"
        "<div style=\"text-align: center; margin-bottom: 20px;\">\n"
        "<h1>📄 Word 文件格式化工具</h1>\n"
        "<p>自動格式化您的 Word 文件,設定章節樣式和分頁</p>\n"
        "</div>\n"
    )

    with gr.Blocks(title="Word 文件格式化工具", theme=gr.themes.Soft()) as app:
        gr.HTML(header_html)

        with gr.Row():
            # Input side.
            with gr.Column(scale=1):
                upload_box = gr.File(
                    label="上傳 Word 文件 (.docx)",
                    file_types=[".docx"],
                    file_count="single",
                )
                keywords_box = gr.Textbox(
                    label="章節分段方式",
                    placeholder="章,節,話",
                    value="章,節,話",
                )
                drop_empty_box = gr.Checkbox(
                    label="移除空段落",
                    value=True,
                    info="勾選時會移除所有空段落,取消勾選時會保留單個空段落(^p^p -> ^p)",
                )
                run_button = gr.Button("🔄 開始處理", variant="primary", size="lg")

            # Output side.
            with gr.Column(scale=1):
                status_box = gr.Textbox(label="處理狀態", interactive=False, lines=3)
                result_file = gr.File(label="下載處理後的文件", interactive=False)

        # format_docx returns (path, message), hence this output order.
        handler_inputs = [upload_box, keywords_box, drop_empty_box]
        handler_outputs = [result_file, status_box]
        run_button.click(
            fn=format_docx,
            inputs=handler_inputs,
            outputs=handler_outputs,
        )

    return app
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces,
    # port 7860.
    # NOTE(review): share=True additionally requests a public share link;
    # confirm this exposure is intended for the deployment.
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)