Spaces:

HaiDoVASCHOOLS
/

massivemark-clone

Sleeping

App Files Files Community

unknown committed on Oct 23, 2025

Commit

8dbfbab

1 Parent(s): de7fe63

updated

Browse files

Files changed (1) hide show

utils/markdown_sanitizer.py +42 -6

utils/markdown_sanitizer.py CHANGED Viewed

@@ -1,12 +1,17 @@
-# utils/markdown_sanitizer.py
 import re
 _WS_WEIRD = re.compile(r"[\u00A0\u2000-\u200B\u2060\u3000]")
 ROW_RE = re.compile(r"^\s*\|.*\|\s*$")
 SEP_RE = re.compile(r"^\s*\|?\s*:?-{2,}\s*(\|\s*:?-{2,}\s*)+\|?\s*$")
 FENCE_RE = re.compile(r"^\s*(```|~~~)")
 def _split_cells(line: str) -> list[str]:
     """Split one pipe-delimited table row into a list of trimmed cell strings.

     Surrounding whitespace and any leading/trailing ``|`` characters are
     removed from the row first; the remainder is split on every ``|`` and each
     cell is stripped. Escaped pipes (``\\|``) are not treated specially here.
     """
     return [c.strip() for c in line.strip().strip("|").split("|")]
@@ -31,6 +36,10 @@ def _balance_columns(block: list[str]) -> list[str]:
         fixed.append(_join_cells(cells))
     return fixed
 def _unindent_headings(line: str) -> str:
     m = re.match(r"^(\s{4,})(#{1,6}\s+.*)$", line)
     if m:
@@ -52,8 +61,12 @@ def _escape_leading_pipe(line: str) -> str:
         return line[:i] + r"\|" + line[i+1:]
     return line
 def _normalize_math_brackets(lines: list[str]) -> list[str]:
-    """Chuyển [ ... ] → $$ … $$ ; và $$…$$-trên-1-dòng → khối $$ trên dòng riêng."""
     out = []
     i, n = 0, len(lines)
     in_fence = False
@@ -68,7 +81,7 @@ def _normalize_math_brackets(lines: list[str]) -> list[str]:
         if not in_fence:
             t = ln.strip()
-            # $$ ... $$ trên một dòng → khối
             m = re.match(r"^\s*\$\$(.+)\$\$\s*$", ln)
             if m:
                 content = m.group(1).strip()
@@ -100,15 +113,20 @@ def _normalize_math_brackets(lines: list[str]) -> list[str]:
     return out
 def normalize_markdown_for_pandoc(md: str) -> str:
     md = md.replace("\r\n", "\n").replace("\r", "\n")
     md = _WS_WEIRD.sub(" ", md)
-    # Pass 1: chuẩn hoá khối toán học
     lines = md.split("\n")
     lines = _normalize_math_brackets(lines)
-    # Pass 2: bảng + escape pipes
     out = []
     i, n = 0, len(lines)
     in_fence = False
@@ -149,4 +167,22 @@ def normalize_markdown_for_pandoc(md: str) -> str:
         out.append(ln); i += 1
-    return "\n".join(out)

 import re
+# Xử lý ký tự khoảng trắng đặc biệt
 _WS_WEIRD = re.compile(r"[\u00A0\u2000-\u200B\u2060\u3000]")
+# Nhận diện bảng, code block, toán học
 ROW_RE = re.compile(r"^\s*\|.*\|\s*$")
 SEP_RE = re.compile(r"^\s*\|?\s*:?-{2,}\s*(\|\s*:?-{2,}\s*)+\|?\s*$")
 FENCE_RE = re.compile(r"^\s*(```|~~~)")
+# ------------------------- #
+#   HÀM HỖ TRỢ XỬ LÝ BẢNG   #
+# ------------------------- #
 def _split_cells(line: str) -> list[str]:
     """Split one pipe-delimited table row into a list of trimmed cell strings.

     Surrounding whitespace and any leading/trailing ``|`` characters are
     removed from the row first; the remainder is split on every ``|`` and each
     cell is stripped. Escaped pipes (``\\|``) are not treated specially here.
     """
     return [c.strip() for c in line.strip().strip("|").split("|")]
         fixed.append(_join_cells(cells))
     return fixed
+# ------------------------- #
+#     HỖ TRỢ XỬ LÝ KHÁC     #
+# ------------------------- #
 def _unindent_headings(line: str) -> str:
     m = re.match(r"^(\s{4,})(#{1,6}\s+.*)$", line)
     if m:
         return line[:i] + r"\|" + line[i+1:]
     return line
+# ------------------------- #
+#    XỬ LÝ CÔNG THỨC TOÁN   #
+# ------------------------- #
 def _normalize_math_brackets(lines: list[str]) -> list[str]:
+    """Chuyển [ ... ] → $$ … $$ ; và $$…$$-trên-1-dòng → khối $$ riêng."""
     out = []
     i, n = 0, len(lines)
     in_fence = False
         if not in_fence:
             t = ln.strip()
+            # $$ ... $$ trên 1 dòng → khối riêng
             m = re.match(r"^\s*\$\$(.+)\$\$\s*$", ln)
             if m:
                 content = m.group(1).strip()
     return out
+# ------------------------- #
+#  CHUẨN HÓA TOÀN BỘ MARKDOWN #
+# ------------------------- #
 def normalize_markdown_for_pandoc(md: str) -> str:
+    """Chuẩn hóa Markdown trước khi đưa vào Pandoc."""
     md = md.replace("\r\n", "\n").replace("\r", "\n")
     md = _WS_WEIRD.sub(" ", md)
+    # Pass 1: Chuẩn hóa toán học
     lines = md.split("\n")
     lines = _normalize_math_brackets(lines)
+    # Pass 2: Bảng + escape ký tự
     out = []
     i, n = 0, len(lines)
     in_fence = False
         out.append(ln); i += 1
+    # Pass 3: Ép xuống dòng hợp lý (fix Word export)
+    final_lines = []
+    for j, ln in enumerate(out):
+        final_lines.append(ln)
+        # Nếu dòng hiện tại và dòng kế tiếp đều không trống và không thuộc bảng / code / heading
+        if (
+            j + 1 < len(out)
+            and out[j].strip() != ""
+            and out[j + 1].strip() != ""
+            and not ROW_RE.match(out[j])
+            and not FENCE_RE.match(out[j])
+            and not out[j].strip().startswith("|")
+            and not out[j + 1].strip().startswith("|")
+            and not out[j].strip().startswith("#")
+            and not out[j + 1].strip().startswith("#")
+        ):
+            final_lines[-1] = final_lines[-1].rstrip() + "  "  # ép xuống dòng Markdown
+    return "\n".join(final_lines)