# beta-NORM / utils / pdf_md.py
# GitHub Actions
# Sync from GitHub master
# 92145af
import os
import re
from markitdown import MarkItDown
def clean_markdown_wrap_none(text: str) -> str:
    """
    Joins all lines of each paragraph into a single line.

    Structural Markdown lines are emitted unchanged and never merged into a
    paragraph: ATX headings, list items, fenced code blocks (including their
    content), table rows (lines starting with ``|``) and thematic breaks /
    setext underlines (``---``, ``***``, ``___``). Blank lines are kept as
    paragraph separators.

    A word split across two lines with a trailing hyphen ("exam-" / "ple") is
    re-joined without the hyphen. This only happens when the hyphen directly
    follows a letter and the next line starts with a letter, so a dash used
    as punctuation ("foo -") is left alone.

    Parameters:
        text (str): Markdown text to unwrap.

    Returns:
        str: The text with every paragraph on one line, lines joined by "\\n".
    """
    bullet = re.compile(r"^(\s*[-*+]\s+|\s*\d+\.\s+)")
    heading = re.compile(r"^\s{0,3}#{1,6}\s")
    codefence = re.compile(r"^\s*```")
    table_row = re.compile(r"^\s*\|")
    # Three or more of the same marker (-, * or _), optionally space-separated.
    rule = re.compile(r"^\s{0,3}([-*_])(?:\s*\1){2,}\s*$")

    out = []
    buf = ""
    in_code = False

    def flush():
        # Emit the paragraph accumulated so far (if any) and reset the buffer.
        nonlocal buf
        if buf.strip():
            out.append(buf.strip())
        buf = ""

    # splitlines() already drops the line terminators.
    for line in text.splitlines():
        if codefence.match(line):
            in_code = not in_code
            flush()
            out.append(line)
            continue

        if in_code:
            # Code is copied verbatim, including indentation and blank lines.
            out.append(line)
            continue

        stripped = line.strip()
        if not stripped:
            flush()
            out.append("")
            continue

        if (
            heading.match(line)
            or bullet.match(line)
            or table_row.match(line)
            or rule.match(line)
        ):
            flush()
            out.append(line)
            continue

        if not buf:
            buf = stripped
            continue

        # Remove the hyphen only when a word was really split across lines:
        # a letter before the hyphen and a letter starting the next line.
        word_split = (
            len(buf) >= 2
            and buf[-1] == "-"
            and buf[-2].isalpha()
            and stripped[0].isalpha()
        )
        if word_split:
            buf = buf[:-1] + stripped
        else:
            buf = buf + " " + stripped

    flush()
    return "\n".join(out)
def convert_document_to_markdown(
    file_path: str,
    output_path: str = None,
    return_text: bool = True,
):
    """
    Converts a single document (PDF/DOCX and other MarkItDown-supported types) to Markdown.

    The converted text is passed through clean_markdown_wrap_none before it is
    saved and/or returned.

    Parameters:
        file_path (str): Path to input file
        output_path (str, optional): Where to save the .md file. Missing parent
            directories are created. A bare filename (no directory part) is
            written to the current working directory.
        return_text (bool): If True, returns markdown text

    Returns:
        str | None: Markdown content (if return_text=True), otherwise None

    Raises:
        FileNotFoundError: If file_path is not an existing regular file.
        RuntimeError: If conversion, cleaning or writing the output fails; the
            original exception is attached as __cause__.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    md = MarkItDown()
    try:
        result = md.convert(file_path)
        clean_text = clean_markdown_wrap_none(result.text_content)

        # Save file if requested
        if output_path:
            # dirname is "" for a bare filename, and os.makedirs("") raises,
            # so only create the directory when there is one to create.
            out_dir = os.path.dirname(output_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(clean_text)
    except Exception as e:
        raise RuntimeError(f"Error converting file to Markdown: {e}") from e

    return clean_text if return_text else None
def convert_pdf_to_markdown(
    pdf_path: str,
    output_path: str = None,
    return_text: bool = True,
):
    """
    Legacy entry point kept for callers that still use the PDF-specific name.

    Forwards all arguments unchanged to convert_document_to_markdown and
    returns whatever that call returns.

    Parameters:
        pdf_path (str): Path to input file (passed on as file_path)
        output_path (str, optional): Where to save the .md file
        return_text (bool): If True, returns markdown text
    """
    forwarded = {
        "file_path": pdf_path,
        "output_path": output_path,
        "return_text": return_text,
    }
    return convert_document_to_markdown(**forwarded)