|
|
|
|
|
import re, pathlib |
|
|
import docx |
|
|
|
|
|
# Project root: parent of the directory containing this script.
BASE = pathlib.Path(__file__).resolve().parent.parent

# Input directory holding the original Word documents.
RAW = BASE / "raw_docs"

# Output directory for the generated Markdown files.
OUT = BASE / "converted"

# Ensure the output directory exists (BASE itself must already exist,
# since parents=True is not passed).
OUT.mkdir(exist_ok=True)
|
|
|
|
|
def table_to_markdown(table) -> str:
    """Convert a python-docx table into a Markdown pipe table.

    Cell text is whitespace-normalized (newlines and runs of spaces
    collapse to single spaces) because Markdown table cells must be a
    single line.  Empty cells are rendered as a single space so the
    pipe layout stays readable.  Ragged rows are padded to the widest
    row.  The first row becomes the Markdown header row.

    Returns an empty string for a table with no rows.
    """
    rows = list(table.rows)
    if not rows:
        return ""

    data = []
    for row in rows:
        row_data = []
        for cell in row.cells:
            # Collapse ALL internal whitespace (including newlines)
            # in one pass; this also strips leading/trailing space.
            cell_text = ' '.join(cell.text.split())
            if not cell_text:
                # Placeholder must be applied AFTER normalization:
                # doing it before (as a naive version might) lets the
                # join/split collapse " " straight back to "".
                cell_text = " "
            row_data.append(cell_text)
        data.append(row_data)

    if not data:
        return ""

    # Pad ragged rows so every row has the same column count.
    max_cols = max(len(row) for row in data)
    for row in data:
        row.extend([" "] * (max_cols - len(row)))

    header = "| " + " | ".join(data[0]) + " |"
    sep = "| " + " | ".join(["---"] * len(data[0])) + " |"
    body = ["| " + " | ".join(row) + " |" for row in data[1:]]

    return "\n".join([header, sep] + body)
|
|
|
|
|
def get_paragraph_formatting(paragraph):
    """Extract text plus bold/italic flags from a python-docx paragraph.

    Returns None for paragraphs with no visible text; otherwise a dict
    with keys 'text' (stripped), 'bold', and 'italic'.  A paragraph
    counts as bold or italic when any of its non-blank runs carries
    that flag.
    """
    stripped = paragraph.text.strip()
    if not stripped:
        return None

    # Only runs containing visible text may contribute formatting;
    # whitespace-only runs are ignored.
    visible_runs = [run for run in paragraph.runs if run.text.strip()]

    return {
        'text': stripped,
        'bold': any(run.bold for run in visible_runs),
        'italic': any(run.italic for run in visible_runs),
    }
|
|
|
|
|
def format_paragraph(para_info):
    """Render extracted paragraph info as inline Markdown.

    Bold text becomes __text__, italic becomes *text*, and bold+italic
    uses the combined __*text*__ form.  Falsy input yields "".
    """
    if not para_info:
        return ""

    text = para_info['text']
    bold = para_info['bold']
    italic = para_info['italic']

    if bold and italic:
        return f"__*{text}*__"
    if bold:
        return f"__{text}__"
    if italic:
        return f"*{text}*"
    return text
|
|
|
|
|
def clean_and_normalize(text: str) -> str:
    """Normalize Vietnamese legal document structure with proper hierarchy.

    Rewrites the intermediate Markdown (produced line-by-line by
    format_paragraph) into a heading hierarchy:

        #    CHƯƠNG (chapter, Roman numeral)
        ##   Điều   (article, "Điều N ...")
        ###  Khoản  (clause, "N. ...")
        #### Điểm   (point, "N.N. ...", "a) ...", "* ...")

    Rules are tried in order for each line; the first matching rule
    appends its output and moves on.  Lines matching no rule pass
    through unchanged.  NOTE(review): rule order matters — several
    later patterns overlap earlier ones and act as fallbacks.
    """
    lines = text.split('\n')
    processed_lines = []

    for i, line in enumerate(lines):
        original_line = line
        line = line.strip()
        if not line:
            # Preserve blank lines verbatim — they separate paragraphs
            # and drive the is_standalone detection below.
            processed_lines.append(original_line)
            continue

        # Bold chapter heading: "__CHƯƠNG II ...__" -> "# CHƯƠNG II ..."
        if re.match(r"^__CHƯƠNG\s+[IVXLC]+", line):
            line = "# " + re.sub(r"__", "", line)
            processed_lines.append(line)
            continue

        # Bold article heading: "__Điều 5 ...__" -> "## Điều 5 ..."
        if re.match(r"^__Điều\s+\d+", line):
            line = "## " + re.sub(r"__", "", line)
            processed_lines.append(line)
            continue

        # Bold+italic prefix followed by optional trailing content:
        # "__*header*__ rest-of-line".
        bold_italic_match = re.match(r"^__\*(.*?)\*__(.*)$", line)
        if bold_italic_match:
            header_text = bold_italic_match.group(1).strip()
            content_text = bold_italic_match.group(2).strip()

            # Treat as a "Điểm" only when the header starts with a
            # numbering token: 1., 1.1., 1.1.1., a), or a literal *.
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+", header_text):
                if ':' in header_text:
                    # Split "header: body" — text after the colon is
                    # demoted to a plain content line under the heading.
                    parts = header_text.split(':', 1)
                    header = parts[0].strip() + ':'
                    header_content = parts[1].strip()
                    processed_lines.append("#### Điểm " + header)
                    if header_content:
                        processed_lines.append(header_content)
                    if content_text:
                        processed_lines.append(content_text)
                else:
                    processed_lines.append("#### Điểm " + header_text)
                    if content_text:
                        processed_lines.append(content_text)
                continue

        # A line is "standalone" when the previous line is empty, is a
        # heading, or is ordinary content (bullet or a line starting
        # with an uppercase Vietnamese letter).  Used to avoid turning
        # mid-sentence fragments into headings.
        prev_line_empty = (i == 0 or not lines[i-1].strip())
        prev_line_is_header = (i > 0 and lines[i-1].strip() and
                               (re.match(r'^__Điều\s+\d+', lines[i-1].strip()) or
                                lines[i-1].strip().startswith(('##', '###', '####'))))
        prev_line_is_content = (i > 0 and lines[i-1].strip() and
                                (lines[i-1].strip().startswith(('-', '+', '*')) or
                                 re.match(r'^[A-ZÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴ]', lines[i-1].strip())))
        is_standalone = prev_line_empty or prev_line_is_header or prev_line_is_content

        # Fully-bold numbered clause: "__N. ...__" -> "### Khoản N. ..."
        if re.match(r"^__\d+\.\s+.*__$", line) and is_standalone:
            clean_text = re.sub(r"^__(.*)__$", r"\1", line)
            khoan_match = re.match(r"^(\d+)\.\s+(.*)", clean_text)
            if khoan_match:
                number = khoan_match.group(1)
                content = khoan_match.group(2)
                line = f"### Khoản {number}. {content}"
            else:
                line = "### Khoản " + clean_text
            processed_lines.append(line)
            continue

        # Unformatted short numbered line ("N. Title...") directly after
        # an article heading is promoted to a Khoản heading.  The <= 8
        # word cap keeps long body sentences from becoming headings.
        if (re.match(r"^\d+\.\s+[A-ZÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴ]", line) and
                is_standalone and len(line.split()) <= 8):
            # Walk back over blank lines to the previous content line.
            prev_content_idx = i - 1
            while prev_content_idx >= 0 and not lines[prev_content_idx].strip():
                prev_content_idx -= 1
            if (prev_content_idx >= 0 and
                    (re.match(r'^__Điều\s+\d+', lines[prev_content_idx].strip()) or
                     lines[prev_content_idx].strip().startswith('## Điều'))):
                khoan_match = re.match(r"^(\d+)\.\s+(.*)", line)
                if khoan_match:
                    number = khoan_match.group(1)
                    content = khoan_match.group(2)
                    line = f"### Khoản {number}. {content}"
                    processed_lines.append(line)
                    continue

        # Standalone fully bold+italic line with a numbering token:
        # fallback for "__*...*__" forms not consumed above.
        if re.match(r"^__\*.*\*__$", line) and is_standalone:
            clean_text = re.sub(r"^__\*(.*)\*__$", r"\1", line)
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+", clean_text):
                if ':' in clean_text:
                    parts = clean_text.split(':', 1)
                    header = parts[0].strip() + ':'
                    content = parts[1].strip()
                    processed_lines.append("#### Điểm " + header)
                    if content:
                        processed_lines.append(content)
                else:
                    processed_lines.append("#### Điểm " + clean_text)
                continue

        # Standalone fully italic line with a numbering token.
        if re.match(r"^\*.*\*$", line) and is_standalone:
            clean_text = re.sub(r"^\*(.*)\*$", r"\1", line)
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+", clean_text):
                if ':' in clean_text:
                    parts = clean_text.split(':', 1)
                    header = parts[0].strip() + ':'
                    content = parts[1].strip()
                    processed_lines.append("#### Điểm " + header)
                    if content:
                        processed_lines.append(content)
                else:
                    processed_lines.append("#### Điểm " + clean_text)
                continue

        # Bold sub-numbered point: "__N.N. ...__" -> "#### Điểm N.N. ..."
        # (no is_standalone requirement).
        if re.match(r"^__\d+\.\d+\.\s+.*__$", line):
            clean_text = re.sub(r"^__(.*)__$", r"\1", line)
            line = "#### Điểm " + clean_text
            processed_lines.append(line)
            continue

        # Italic sub-numbered point: "*N.N. ...*" -> "#### Điểm N.N. ..."
        if re.match(r"^\*\d+\.\d+\.\s+.*\*$", line):
            clean_text = re.sub(r"^\*(.*)\*$", r"\1", line)
            line = "#### Điểm " + clean_text
            processed_lines.append(line)
            continue

        # Bold+italic bullet point: "__**text*__" — the extra inner *
        # is the bullet marker itself, re-emitted after "Điểm ".
        if re.match(r"^__\*\*.*\*__$", line) and is_standalone:
            clean_text = re.sub(r"^__\*\*(.*)\*__$", r"\1", line)
            line = "#### Điểm *" + clean_text
            processed_lines.append(line)
            continue

        # Bold+italic lettered point "__*a) ...*__" — handled even when
        # not standalone.
        if re.match(r"^__\*[a-z]\)\s+.*\*__$", line):
            clean_text = re.sub(r"^__\*(.*)\*__$", r"\1", line)
            if ':' in clean_text:
                parts = clean_text.split(':', 1)
                header = parts[0].strip() + ':'
                content = parts[1].strip()
                processed_lines.append("#### Điểm " + header)
                if content:
                    processed_lines.append(content)
            else:
                processed_lines.append("#### Điểm " + clean_text)
            continue

        # Generic bold prefix fallback: "__header__ trailing content".
        bold_match = re.match(r"^(__.*?__)\s*(.*)$", line)
        if bold_match:
            header_part = bold_match.group(1)
            content_part = bold_match.group(2).strip()

            clean_header = re.sub(r"^__(.*)__$", r"\1", header_part)
            # Top-level number -> Khoản; deeper/lettered number -> Điểm.
            if re.match(r"^\d+\.\s+", clean_header):
                khoan_match = re.match(r"^(\d+)\.\s+(.*)", clean_header)
                if khoan_match:
                    number = khoan_match.group(1)
                    header_content = khoan_match.group(2)
                    processed_lines.append(f"### Khoản {number}. {header_content}")
                    if content_part:
                        processed_lines.append(content_part)
                    continue
            elif re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\))\s+", clean_header):
                processed_lines.append("#### Điểm " + clean_header)
                if content_part:
                    processed_lines.append(content_part)
                continue

        # Generic italic prefix fallback: "*header* trailing content".
        italic_match = re.match(r"^(\*.*?\*)\s*(.*)$", line)
        if italic_match:
            header_part = italic_match.group(1)
            content_part = italic_match.group(2).strip()

            clean_header = re.sub(r"^\*(.*)\*$", r"\1", header_part)
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\))\s+", clean_header):
                processed_lines.append("#### Điểm " + clean_header)
                if content_part:
                    processed_lines.append(content_part)
                continue

        # No rule matched: keep the (stripped) line as-is.
        processed_lines.append(line)

    return '\n'.join(processed_lines).strip()
|
|
|
|
|
def convert_doc_to_md(doc_path, md_path):
    """Convert a Word document (paragraphs + tables) to normalized Markdown.

    Walks the raw XML body so tables stay interleaved with the
    surrounding paragraphs in document order, converts each element,
    then pipes the combined text through clean_and_normalize() and
    writes the result to md_path as UTF-8.

    Returns md_path.
    """
    document = docx.Document(doc_path)
    pieces = []

    # Iterate the underlying body elements directly: the high-level
    # doc.paragraphs / doc.tables accessors would lose the ordering
    # between paragraphs and tables.
    for child in document.element.body:
        if child.tag.endswith('tbl'):
            rendered = table_to_markdown(docx.table.Table(child, document))
            # Blank line before/after the table so Markdown parses it
            # as a block.
            if pieces and pieces[-1].strip():
                pieces.append("")
            pieces.append(rendered)
            pieces.append("")
        elif child.tag.endswith('p'):
            para = docx.text.paragraph.Paragraph(child, document)
            info = get_paragraph_formatting(para)
            if info and info['text']:
                pieces.append(format_paragraph(info))

    normalized = clean_and_normalize('\n'.join(pieces))
    md_path.write_text(normalized, encoding="utf-8")
    return md_path
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): python-docx is documented for .docx only; legacy
    # binary .doc files accepted here will raise inside Document() —
    # confirm inputs are really .docx.
    for doc in RAW.iterdir():
        if doc.suffix.lower() not in [".doc", ".docx"]:
            print("Skipping:", doc)
            continue
        target = OUT / (doc.stem + ".md")
        convert_doc_to_md(doc, target)
        print("Converted:", target)
|
|
|