snote / scripts /document_parser.py
xuanbao01's picture
Upload folder using huggingface_hub
44c5827 verified
#!/usr/bin/env python3
import re, pathlib
import docx # from python-docx
# Directory layout, resolved relative to this script's own location:
# BASE is the directory that contains the script's directory, RAW holds the
# input documents and OUT receives the generated Markdown files.
_SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
BASE = _SCRIPT_DIR.parent
RAW = BASE.joinpath("raw_docs")
OUT = BASE.joinpath("converted")
# NOTE: runs at import time; creates the output directory if it is missing.
OUT.mkdir(exist_ok=True)
def _markdown_cell(raw: str) -> str:
    """Return *raw* cell text as a single-line, pipe-safe Markdown table cell."""
    # str.split() with no argument splits on any whitespace run (including
    # newlines) and drops leading/trailing whitespace, so this both trims the
    # text and flattens multi-line cells onto one line.
    flat = " ".join(raw.split())
    # A literal "|" would be read as a column separator and corrupt the row.
    flat = flat.replace("|", "\\|")
    # The placeholder is applied last so that whitespace normalisation cannot
    # collapse it back to an empty string.
    return flat or " "


def table_to_markdown(table) -> str:
    """Convert a python-docx table into a Markdown pipe table.

    The first row is emitted as the header row, followed by a ``---``
    separator row and then the remaining rows. Rows that have fewer cells
    than the widest row are padded with blank cells so every line has the
    same number of columns.

    Args:
        table: An object exposing ``rows``; each row exposes ``cells`` and
            each cell exposes ``text`` (the shape of a python-docx table).

    Returns:
        The Markdown table as a single string with lines joined by ``"\\n"``,
        or ``""`` when the table has no rows.
    """
    data = [[_markdown_cell(cell.text) for cell in row.cells] for row in table.rows]
    if not data:
        return ""

    # Pad short rows so that all rows have the same number of columns.
    max_cols = max(len(row) for row in data)
    for row in data:
        row.extend([" "] * (max_cols - len(row)))

    header = "| " + " | ".join(data[0]) + " |"
    sep = "| " + " | ".join(["---"] * max_cols) + " |"
    body = ["| " + " | ".join(row) + " |" for row in data[1:]]
    return "\n".join([header, sep] + body)
def get_paragraph_formatting(paragraph):
    """Summarise a paragraph as its trimmed text plus bold/italic flags.

    A flag is set when at least one run with non-whitespace text has the
    corresponding attribute set to a truthy value; runs that contain only
    whitespace are ignored.

    Returns:
        ``None`` when the paragraph text is empty after stripping, otherwise
        a dict with the keys ``'text'``, ``'bold'`` and ``'italic'``.
    """
    content = paragraph.text.strip()
    if not content:
        return None

    # Only runs that actually carry visible text take part in the decision.
    visible_runs = [run for run in paragraph.runs if run.text.strip()]
    return dict(
        text=content,
        bold=any(run.bold for run in visible_runs),
        italic=any(run.italic for run in visible_runs),
    )
def format_paragraph(para_info):
    """Wrap paragraph text in Markdown emphasis markers.

    ``para_info`` is a dict with the keys ``'text'``, ``'bold'`` and
    ``'italic'``. Bold text is wrapped in ``__``, italic text in ``*`` and
    bold+italic text in ``__*...*__``. A falsy ``para_info`` yields ``""``.
    """
    if not para_info:
        return ""

    content = para_info['text']
    bold = para_info['bold']
    italic = para_info['italic']

    if bold:
        return f"__*{content}*__" if italic else f"__{content}__"
    return f"*{content}*" if italic else content
def clean_and_normalize(text: str) -> str:
    """Turn emphasis-marked lines of a Vietnamese legal document into Markdown headings.

    The input is processed line by line. Each line is stripped and tested
    against the rules below in order; the first rule that consumes a line
    wins, and a line that no rule consumes is emitted stripped but otherwise
    unchanged. Blank lines are passed through as they were.

    Heading levels produced:
        * ``__CHƯƠNG <roman numeral>...`` (chapter)  -> ``# ...``
        * ``__Điều <number>...`` (article)            -> ``## ...``
        * bold numbered item ``__1. ...__`` (clause)  -> ``### Khoản 1. ...``
        * bold+italic / italic / sub-numbered / lettered items (point)
                                                      -> ``#### Điểm ...``

    When a heading is followed on the same line by unformatted text, or a
    "Điểm" heading contains a colon, the trailing text is emitted as a
    separate line after the heading.

    Args:
        text: Newline-separated text in which bold is marked as ``__...__``,
            italic as ``*...*`` and bold+italic as ``__*...*__``.

    Returns:
        The processed lines joined with ``"\\n"``, with leading and trailing
        whitespace of the whole result stripped.
    """
    lines = text.split('\n')
    processed_lines = []
    # Single pass over the input. Every rule that consumes the line ends with
    # `continue`, so rule order matters.
    for i, line in enumerate(lines):
        original_line = line
        line = line.strip()
        # Blank or whitespace-only lines are kept exactly as they came in.
        if not line:
            processed_lines.append(original_line)
            continue
        # Chapter heading (CHƯƠNG + roman numeral) -> "# ...".
        # Every "__" in the line is removed, not only the outer pair.
        if re.match(r"^__CHƯƠNG\s+[IVXLC]+", line):
            line = "# " + re.sub(r"__", "", line)
            processed_lines.append(line)
            continue
        # Article heading (Điều + number) -> "## ...", "__" removed as above.
        if re.match(r"^__Điều\s+\d+", line):
            line = "## " + re.sub(r"__", "", line)
            processed_lines.append(line)
            continue
        # Mixed formatting is handled before the other rules: a bold+italic
        # header optionally followed by plain text, i.e. "__*header*__ content".
        # The non-greedy group stops at the first "*__" in the line.
        bold_italic_match = re.match(r"^__\*(.*?)\*__(.*)$", line)
        if bold_italic_match:
            header_text = bold_italic_match.group(1).strip()
            content_text = bold_italic_match.group(2).strip()
            # Only headers that start with a list marker are converted:
            # "1.", "1.1.", "1.1.1.", "a)" or "*", followed by whitespace.
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+", header_text):
                # Split at the first colon: the part up to and including the
                # colon is the heading, the rest becomes its own line.
                if ':' in header_text:
                    parts = header_text.split(':', 1)
                    header = parts[0].strip() + ':'
                    header_content = parts[1].strip()
                    processed_lines.append("#### Điểm " + header)
                    if header_content:
                        processed_lines.append(header_content)
                    if content_text:
                        processed_lines.append(content_text)
                else:
                    processed_lines.append("#### Điểm " + header_text)
                    if content_text:
                        processed_lines.append(content_text)
                continue
        # Decide whether this line is "standalone" by inspecting the previous
        # *input* line (lines[i-1], not the already processed output).
        # Standalone if it is the first line or follows a blank line ...
        prev_line_empty = (i == 0 or not lines[i-1].strip())
        # ... or follows an article line / a line already starting with "##" ...
        prev_line_is_header = (i > 0 and lines[i-1].strip() and
                               (re.match(r'^__Điều\s+\d+', lines[i-1].strip()) or
                                lines[i-1].strip().startswith(('##', '###', '####'))))
        # ... or follows a bullet-like line ("-", "+", "*") or a line that
        # starts with an uppercase (ASCII or Vietnamese) letter.
        prev_line_is_content = (i > 0 and lines[i-1].strip() and
                                (lines[i-1].strip().startswith(('-', '+', '*')) or
                                 re.match(r'^[A-ZÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴ]', lines[i-1].strip())))
        is_standalone = prev_line_empty or prev_line_is_header or prev_line_is_content
        # Clause (Khoản): fully bold line "__N. text__" that is standalone.
        if re.match(r"^__\d+\.\s+.*__$", line) and is_standalone:
            clean_text = re.sub(r"^__(.*)__$", r"\1", line)
            khoan_match = re.match(r"^(\d+)\.\s+(.*)", clean_text)
            if khoan_match:
                number = khoan_match.group(1)
                content = khoan_match.group(2)
                line = f"### Khoản {number}. {content}"
            else:
                line = "### Khoản " + clean_text
            processed_lines.append(line)
            continue
        # Plain (unformatted) numbered item that looks like a section header:
        # starts with "N. " + uppercase letter, is standalone and has at most
        # 8 whitespace-separated words.
        if (re.match(r"^\d+\.\s+[A-ZÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴ]", line) and
                is_standalone and len(line.split()) <= 8):  # Short enough to be a header
            # Walk back to the previous non-empty input line.
            prev_content_idx = i - 1
            while prev_content_idx >= 0 and not lines[prev_content_idx].strip():
                prev_content_idx -= 1
            # Convert only when that line is an article (Điều) heading;
            # otherwise the line falls through to the remaining rules.
            if (prev_content_idx >= 0 and
                    (re.match(r'^__Điều\s+\d+', lines[prev_content_idx].strip()) or
                     lines[prev_content_idx].strip().startswith('## Điều'))):
                khoan_match = re.match(r"^(\d+)\.\s+(.*)", line)
                if khoan_match:
                    number = khoan_match.group(1)
                    content = khoan_match.group(2)
                    line = f"### Khoản {number}. {content}"
                    processed_lines.append(line)
                    continue
        # Apart from the rule above, plain numbered items are left as regular
        # numbered lists; only formatted items become headings.
        # Point (Điểm): fully bold+italic line "__*text*__", standalone, that
        # starts with a list marker.
        # NOTE(review): lines of this shape that start with a marker look like
        # they are already consumed by the mixed-formatting rule above —
        # confirm whether this branch is still reachable.
        if re.match(r"^__\*.*\*__$", line) and is_standalone:
            clean_text = re.sub(r"^__\*(.*)\*__$", r"\1", line)
            # Marker check: "1.", "1.1.", "1.1.1.", "a)" or "*" + whitespace.
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+", clean_text):
                # Text after the first colon is emitted on its own line.
                if ':' in clean_text:
                    parts = clean_text.split(':', 1)
                    header = parts[0].strip() + ':'
                    content = parts[1].strip()
                    processed_lines.append("#### Điểm " + header)
                    if content:
                        processed_lines.append(content)
                else:
                    processed_lines.append("#### Điểm " + clean_text)
                continue
        # Point (Điểm): fully italic line "*text*", standalone, that starts
        # with a list marker.
        if re.match(r"^\*.*\*$", line) and is_standalone:
            clean_text = re.sub(r"^\*(.*)\*$", r"\1", line)
            # Marker check: "1.", "1.1.", "1.1.1.", "a)" or "*" + whitespace.
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+", clean_text):
                # Text after the first colon is emitted on its own line.
                if ':' in clean_text:
                    parts = clean_text.split(':', 1)
                    header = parts[0].strip() + ':'
                    content = parts[1].strip()
                    processed_lines.append("#### Điểm " + header)
                    if content:
                        processed_lines.append(content)
                else:
                    processed_lines.append("#### Điểm " + clean_text)
                continue
        # Two-level numbered sub-items ("1.1.", "1.2.", ...) in bold. Unlike
        # the rules above, this one does not require the line to be standalone.
        if re.match(r"^__\d+\.\d+\.\s+.*__$", line):
            clean_text = re.sub(r"^__(.*)__$", r"\1", line)
            line = "#### Điểm " + clean_text
            processed_lines.append(line)
            continue
        # Two-level numbered sub-items in italic (no standalone requirement).
        if re.match(r"^\*\d+\.\d+\.\s+.*\*$", line):
            clean_text = re.sub(r"^\*(.*)\*$", r"\1", line)
            line = "#### Điểm " + clean_text
            processed_lines.append(line)
            continue
        # Asterisk items that are bold+italic: "__** text*__". The leading
        # "*" of the item is re-added after the "Điểm" prefix.
        if re.match(r"^__\*\*.*\*__$", line) and is_standalone:
            clean_text = re.sub(r"^__\*\*(.*)\*__$", r"\1", line)
            line = "#### Điểm *" + clean_text
            processed_lines.append(line)
            continue
        # Plain numbered sub-items stay regular text; only bold or italic
        # items are converted to Điểm.
        # Lettered items ("a)", "b)", ...) that are bold+italic.
        if re.match(r"^__\*[a-z]\)\s+.*\*__$", line):
            clean_text = re.sub(r"^__\*(.*)\*__$", r"\1", line)
            # Text after the first colon is emitted on its own line.
            if ':' in clean_text:
                parts = clean_text.split(':', 1)
                header = parts[0].strip() + ':'
                content = parts[1].strip()
                processed_lines.append("#### Điểm " + header)
                if content:
                    processed_lines.append(content)
            else:
                processed_lines.append("#### Điểm " + clean_text)
            continue
        # Mixed formatting: a bold or italic header followed by plain text on
        # the same line. The header becomes a heading and the plain text is
        # emitted on the following line.
        # Bold header: "__header text__ remaining plain text".
        bold_match = re.match(r"^(__.*?__)\s*(.*)$", line)
        if bold_match:
            header_part = bold_match.group(1)
            content_part = bold_match.group(2).strip()
            clean_header = re.sub(r"^__(.*)__$", r"\1", header_part)
            # Single-level number ("N. ") -> clause (Khoản). This rule has no
            # standalone requirement.
            if re.match(r"^\d+\.\s+", clean_header):
                khoan_match = re.match(r"^(\d+)\.\s+(.*)", clean_header)
                if khoan_match:
                    number = khoan_match.group(1)
                    header_content = khoan_match.group(2)
                    processed_lines.append(f"### Khoản {number}. {header_content}")
                    if content_part:
                        processed_lines.append(content_part)
                    continue
            # Any other list marker ("1.1.", "1.1.1.", "a)") -> point (Điểm).
            elif re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\))\s+", clean_header):
                processed_lines.append("#### Điểm " + clean_header)
                if content_part:
                    processed_lines.append(content_part)
                continue
        # Italic header: "*header text* remaining plain text".
        italic_match = re.match(r"^(\*.*?\*)\s*(.*)$", line)
        if italic_match:
            header_part = italic_match.group(1)
            content_part = italic_match.group(2).strip()
            clean_header = re.sub(r"^\*(.*)\*$", r"\1", header_part)
            # Any numbered or lettered marker -> point (Điểm).
            if re.match(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\))\s+", clean_header):
                processed_lines.append("#### Điểm " + clean_header)
                if content_part:
                    processed_lines.append(content_part)
                continue
        # No rule consumed the line: emit it stripped, otherwise unchanged.
        processed_lines.append(line)
    # Join the processed lines and trim the whole document.
    return '\n'.join(processed_lines).strip()
def convert_doc_to_md(doc_path, md_path):
    """Render one Word document as normalized Markdown and write it to disk.

    Walks the children of the document body in order so that paragraphs and
    tables keep their relative position, converts each to Markdown, runs the
    joined text through ``clean_and_normalize`` and writes the result to
    ``md_path`` as UTF-8.

    Args:
        doc_path: Document to open with ``docx.Document``.
        md_path: Destination; must provide ``write_text`` (e.g. a
            ``pathlib.Path``).

    Returns:
        ``md_path``, unchanged.
    """
    document = docx.Document(doc_path)
    chunks = []

    for child in document.element.body:
        tag = child.tag

        if tag.endswith('tbl'):  # Table
            rendered = table_to_markdown(docx.table.Table(child, document))
            # Separate the table from preceding text with one blank line,
            # unless the previous chunk is already blank.
            if chunks and chunks[-1].strip():
                chunks.append("")
            chunks.extend([rendered, ""])
            continue

        if not tag.endswith('p'):  # Neither table nor paragraph: ignore
            continue

        info = get_paragraph_formatting(docx.text.paragraph.Paragraph(child, document))
        # Paragraphs without text are dropped.
        if info and info['text']:
            chunks.append(format_paragraph(info))

    # Join + normalize
    md_path.write_text(clean_and_normalize('\n'.join(chunks)), encoding="utf-8")
    return md_path
if __name__ == "__main__":
    # Batch entry point: convert every Word document found directly in RAW
    # into a Markdown file of the same stem in OUT.
    import sys  # local import: only the script entry point needs it

    if not RAW.is_dir():
        sys.exit(f"Input directory not found: {RAW}")

    failures = 0
    # Sorted so that the processing order (and the log) is deterministic.
    for doc in sorted(RAW.iterdir()):
        wanted = (
            doc.suffix.lower() in (".doc", ".docx")
            and doc.is_file()
            # "~$name.docx" files are lock files Word leaves next to an open
            # document; they are not readable documents.
            and not doc.name.startswith("~$")
        )
        if not wanted:
            print("Skipping:", doc)
            continue

        out = OUT / (doc.stem + ".md")
        try:
            convert_doc_to_md(doc, out)
        except Exception as exc:
            # Top-level boundary: one unreadable input (for example a legacy
            # binary .doc, which python-docx cannot open) must not abort the
            # rest of the batch. Report it and carry on.
            failures += 1
            print(f"Failed: {doc} ({type(exc).__name__}: {exc})", file=sys.stderr)
            continue
        print("Converted:", out)

    # Keep a non-zero exit status when anything failed, as a crash used to give.
    if failures:
        sys.exit(1)