""" Convert source documents (PDF, HTML) to Markdown using MarkItDown. Saves converted files to sources/converted/. """ import os import sys from markitdown import MarkItDown RAW_DIR = os.path.join(os.path.dirname(__file__), "..", "sources", "raw") CONVERTED_DIR = os.path.join(os.path.dirname(__file__), "..", "sources", "converted") os.makedirs(CONVERTED_DIR, exist_ok=True) md = MarkItDown() # List all files to convert files = sorted(os.listdir(RAW_DIR)) print(f"Found {len(files)} files in {RAW_DIR}\n") for filename in files: filepath = os.path.join(RAW_DIR, filename) if not os.path.isfile(filepath): continue base_name = os.path.splitext(filename)[0] output_path = os.path.join(CONVERTED_DIR, f"{base_name}.md") print(f"Converting: {filename}") try: result = md.convert(filepath) text = result.text_content # Write to file with open(output_path, "w", encoding="utf-8") as f: f.write(text) # Stats lines = text.count("\n") + 1 size_kb = len(text.encode("utf-8")) / 1024 print(f" -> {output_path}") print(f" {lines} lines, {size_kb:.1f} KB\n") except Exception as e: print(f" FAILED: {e}\n")