hmc-rag / scripts /convert_sources.py
webmuppet
Initial commit — health marketing compliance RAG
bad8b6c
"""
Convert source documents (PDF, HTML) to Markdown using MarkItDown.
Saves converted files to sources/converted/.
"""
import os
import sys
from markitdown import MarkItDown
# Input and output directories, resolved relative to this script's location
# rather than the current working directory.
RAW_DIR = os.path.join(os.path.dirname(__file__), "..", "sources", "raw")
CONVERTED_DIR = os.path.join(os.path.dirname(__file__), "..", "sources", "converted")

os.makedirs(CONVERTED_DIR, exist_ok=True)

md = MarkItDown()

# Fail with a readable message (and a non-zero exit code) instead of a raw
# traceback when the input directory does not exist.
if not os.path.isdir(RAW_DIR):
    print(f"Source directory not found: {RAW_DIR}", file=sys.stderr)
    sys.exit(1)

# List all files to convert. Only regular files are kept, so the count printed
# below matches the number of conversions actually attempted.
files = sorted(
    name
    for name in os.listdir(RAW_DIR)
    if os.path.isfile(os.path.join(RAW_DIR, name))
)
print(f"Found {len(files)} files in {RAW_DIR}\n")

failed = []   # input filenames whose conversion or write raised
written = {}  # output path -> input filename that produced it

for filename in files:
    filepath = os.path.join(RAW_DIR, filename)
    base_name = os.path.splitext(filename)[0]
    output_path = os.path.join(CONVERTED_DIR, f"{base_name}.md")

    print(f"Converting: {filename}")

    # Inputs that share a stem (e.g. report.pdf and report.html) map to the
    # same .md file. The later one still wins, as before, but the overwrite
    # is no longer silent.
    if output_path in written:
        print(
            f" WARNING: {filename} overwrites output of {written[output_path]}",
            file=sys.stderr,
        )

    try:
        result = md.convert(filepath)
        text = result.text_content

        # Write to file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
    except Exception as e:
        # Best-effort batch: one bad document must not stop the others, but
        # the failure is recorded so the run can report it at the end.
        print(f" FAILED: {e}\n")
        failed.append(filename)
    else:
        written[output_path] = filename

        # Stats
        lines = text.count("\n") + 1
        size_kb = len(text.encode("utf-8")) / 1024
        print(f" -> {output_path}")
        print(f" {lines} lines, {size_kb:.1f} KB\n")

# Surface partial failure through the exit status so callers (CI, make, shell
# scripts) can tell that the converted set is incomplete.
if failed:
    print(
        f"{len(failed)} of {len(files)} files failed to convert: {', '.join(failed)}",
        file=sys.stderr,
    )
    sys.exit(1)