| import os |
| import re |
| from markitdown import MarkItDown |
|
|
|
|
def clean_markdown_wrap_none(text: str) -> str:
    """
    Unwrap hard-wrapped paragraphs so each paragraph occupies a single line.

    Lines are handled in order:

    * Fenced code blocks (delimited by lines starting with three backticks)
      are copied through verbatim, fence lines included.
    * Blank lines, headings, list items and table rows (lines whose first
      non-space character is a pipe) are emitted unchanged on their own
      line and end the paragraph that precedes them.
    * All other consecutive lines are stripped and joined with single
      spaces into one paragraph line. If the paragraph so far ends with a
      hyphen attached to a letter or digit and the next line starts with a
      letter, the hyphen is dropped and the pieces are joined directly
      (a word split across lines).

    Parameters:
        text (str): Markdown text to clean

    Returns:
        str: Cleaned text, newline-separated, without a trailing newline
    """
    bullet = re.compile(r"^(\s*[-*+]\s+|\s*\d+\.\s+)")
    heading = re.compile(r"^\s{0,3}#{1,6}\s")
    codefence = re.compile(r"^\s*```")
    # Table rows must stay one-per-line; joining them destroys the table.
    table_row = re.compile(r"^\s*\|")

    out = []
    buf = ""
    in_code = False

    def flush():
        """Emit the paragraph accumulated in ``buf`` (if any) and reset it."""
        nonlocal buf
        if buf.strip():
            out.append(buf.strip())
        buf = ""

    # splitlines() already drops line terminators, so no extra stripping
    # of "\n" is needed here.
    for line in text.splitlines():
        if codefence.match(line):
            # A fence toggles code mode and is itself kept untouched.
            in_code = not in_code
            flush()
            out.append(line)
            continue

        if in_code:
            out.append(line)
            continue

        stripped = line.strip()

        if not stripped:
            flush()
            out.append("")
            continue

        if heading.match(line) or bullet.match(line) or table_row.match(line):
            flush()
            out.append(line)
            continue

        if not buf:
            buf = stripped
            continue

        # Only treat the trailing "-" as a line-break hyphen when it is
        # glued to a word; a standalone dash ("foo -") or a double dash is
        # punctuation and must not be deleted.
        word_hyphen = buf.endswith("-") and buf[-2:-1].isalnum()
        if word_hyphen and stripped[0].isalpha():
            buf = buf[:-1] + stripped
        else:
            buf = buf + " " + stripped

    flush()
    return "\n".join(out)
|
|
|
|
def convert_document_to_markdown(
    file_path: str,
    output_path: str = None,
    return_text: bool = True,
):
    """
    Converts a single document (PDF/DOCX and other MarkItDown-supported types) to Markdown.

    The converted text is passed through clean_markdown_wrap_none before
    being written and/or returned.

    Parameters:
        file_path (str): Path to input file
        output_path (str, optional): Where to save the .md file. Missing
            parent directories are created. A bare filename is written to
            the current working directory.
        return_text (bool): If True, returns markdown text

    Returns:
        str | None: Markdown content (if return_text=True), otherwise None

    Raises:
        FileNotFoundError: If file_path is not an existing file
        RuntimeError: If conversion, cleaning or writing the output fails;
            the original exception is attached as the cause
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    md = MarkItDown()

    try:
        result = md.convert(file_path)
        clean_text = clean_markdown_wrap_none(result.text_content)

        if output_path:
            # dirname is "" for a bare filename, and os.makedirs("") raises,
            # so only create the parent when there is one.
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(clean_text)
    except Exception as e:
        # Chain explicitly so the underlying failure stays in the traceback.
        raise RuntimeError(f"Error converting file to Markdown: {e}") from e

    return clean_text if return_text else None
|
|
|
|
def convert_pdf_to_markdown(
    pdf_path: str,
    output_path: str = None,
    return_text: bool = True,
):
    """
    Backward-compatible entry point for PDF conversion.

    Forwards all three arguments unchanged to convert_document_to_markdown
    and returns whatever that call returns.
    """
    return convert_document_to_markdown(pdf_path, output_path, return_text)