File size: 1,242 Bytes
bad8b6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
Convert source documents (PDF, HTML) to Markdown using MarkItDown.
Saves converted files to sources/converted/.
"""

import os
import sys
from markitdown import MarkItDown

# Input and output folders both live under ../sources, resolved relative to
# the directory containing this script.
_SOURCES_DIR = os.path.join(os.path.dirname(__file__), "..", "sources")
RAW_DIR = os.path.join(_SOURCES_DIR, "raw")
CONVERTED_DIR = os.path.join(_SOURCES_DIR, "converted")

# Make sure the output folder exists before anything is written to it;
# exist_ok=True makes this a no-op when it is already there.
os.makedirs(CONVERTED_DIR, exist_ok=True)

# One converter instance, created once at start-up.
md = MarkItDown()

# Collect the regular files in RAW_DIR up front (sub-directories are ignored)
# so that the count printed below matches what is actually converted.
files = sorted(
    name
    for name in os.listdir(RAW_DIR)
    if os.path.isfile(os.path.join(RAW_DIR, name))
)
print(f"Found {len(files)} files in {RAW_DIR}\n")

# Output names already handed out during this run. Two sources that share a
# stem (e.g. "report.pdf" and "report.html") would otherwise both be written
# to "report.md", the later one silently overwriting the earlier one.
claimed_outputs = set()
converted = 0
failed = 0

for filename in files:
    filepath = os.path.join(RAW_DIR, filename)

    print(f"Converting: {filename}")

    base_name = os.path.splitext(filename)[0]
    output_name = f"{base_name}.md"
    attempt = 0
    # os.path.normcase lowercases on Windows and leaves the name unchanged
    # elsewhere, so names differing only by case also count as a clash there.
    while os.path.normcase(output_name) in claimed_outputs:
        attempt += 1
        # First retry keeps the source extension ("report.pdf.md"); any
        # further retry adds a counter.
        if attempt == 1:
            output_name = f"{filename}.md"
        else:
            output_name = f"{filename}.{attempt}.md"
    if attempt:
        print(f"  NOTE: {base_name}.md is already taken in this run; "
              f"writing {output_name} instead")
    # Claim the name even if the conversion below fails, so the name a source
    # gets does not depend on whether an earlier source converted cleanly.
    claimed_outputs.add(os.path.normcase(output_name))
    output_path = os.path.join(CONVERTED_DIR, output_name)

    try:
        result = md.convert(filepath)
        text = result.text_content

        # Write to file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)

        # Stats
        lines = text.count("\n") + 1
        size_kb = len(text.encode("utf-8")) / 1024
        print(f"  -> {output_path}")
        print(f"     {lines} lines, {size_kb:.1f} KB\n")
        converted += 1
    except Exception as e:
        # Best-effort batch: report the failure and carry on with the next
        # file rather than aborting the whole run.
        failed += 1
        print(f"  FAILED: {e}\n")

# Final tally so failures are not lost in a long log.
print(f"Done: {converted} converted, {failed} failed")