import os
import re
from typing import Optional

from markitdown import MarkItDown

# Structural-line patterns used by clean_markdown_wrap_none; compiled once
# at import time instead of on every call.
_BULLET_RE = re.compile(r"^(\s*[-*+]\s+|\s*\d+\.\s+)")
_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s")
_CODE_FENCE_RE = re.compile(r"^\s*```")


def clean_markdown_wrap_none(text: str) -> str:
    """Join the hard-wrapped lines of each paragraph into a single line.

    Structural lines are emitted unchanged and always end the paragraph
    being accumulated:

    * code fences (```) and every line between an opening and closing fence,
    * ATX headings (``#`` .. ``######``),
    * list items (``-``, ``*``, ``+`` or ``1.`` markers),
    * table rows (lines whose first non-blank character is ``|``),
    * blank lines (emitted as empty strings).

    Every other line is stripped and appended to the current paragraph,
    separated by a single space. If the paragraph so far ends with ``-`` and
    the next line starts with a letter, the hyphen is dropped and the two
    pieces are glued together (a word split across lines).

    Parameters:
        text: Markdown text to clean.

    Returns:
        The cleaned Markdown, with lines joined by ``"\\n"``.
    """
    out = []
    buf = ""
    in_code = False

    def flush() -> None:
        # Emit the paragraph accumulated so far (if any) and reset it.
        nonlocal buf
        if buf:
            out.append(buf)
        buf = ""

    # str.splitlines() already removes line terminators.
    for line in text.splitlines():
        if _CODE_FENCE_RE.match(line):
            # A fence toggles code mode and is itself kept verbatim.
            in_code = not in_code
            flush()
            out.append(line)
            continue

        if in_code:
            out.append(line)
            continue

        stripped = line.strip()
        if not stripped:
            flush()
            out.append("")
            continue

        if (
            _HEADING_RE.match(line)
            or _BULLET_RE.match(line)
            # Table rows must stay one per line, otherwise the table
            # collapses into a single unreadable paragraph.
            or stripped.startswith("|")
        ):
            flush()
            out.append(line)
            continue

        if not buf:
            buf = stripped
            continue

        # Remove the hyphen when a word was split across two lines.
        if buf.endswith("-") and stripped[0].isalpha():
            buf = buf[:-1] + stripped
        else:
            buf = buf + " " + stripped

    flush()
    return "\n".join(out)


def convert_document_to_markdown(
    file_path: str,
    output_path: Optional[str] = None,
    return_text: bool = True,
) -> Optional[str]:
    """Convert a single document to Markdown.

    The file is converted with ``MarkItDown().convert`` (PDF/DOCX and other
    MarkItDown-supported types) and the resulting text is passed through
    ``clean_markdown_wrap_none``.

    Parameters:
        file_path: Path to the input file.
        output_path: Where to save the ``.md`` file. When falsy, nothing is
            written. Missing parent directories are created; a bare file
            name (no directory part) is written to the current directory.
        return_text: If True, return the Markdown text.

    Returns:
        The Markdown content if ``return_text`` is True, otherwise ``None``.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing file.
        RuntimeError: If conversion or writing the output fails; the
            original exception is attached as ``__cause__``.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    md = MarkItDown()

    try:
        result = md.convert(file_path)
        clean_text = clean_markdown_wrap_none(result.text_content)

        # Save file if requested.
        if output_path:
            parent_dir = os.path.dirname(output_path)
            # dirname() is "" for a bare file name and os.makedirs("")
            # raises, so only create the directory when there is one.
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(clean_text)
    except Exception as e:
        raise RuntimeError(f"Error converting file to Markdown: {e}") from e

    return clean_text if return_text else None


def convert_pdf_to_markdown(
    pdf_path: str,
    output_path: Optional[str] = None,
    return_text: bool = True,
) -> Optional[str]:
    """Backward-compatible wrapper for PDF conversion.

    Forwards all arguments to ``convert_document_to_markdown`` with
    ``pdf_path`` passed as ``file_path`` and returns its result.
    """
    return convert_document_to_markdown(
        file_path=pdf_path,
        output_path=output_path,
        return_text=return_text,
    )