# NOTE(review): the four lines below are GitHub web-UI paste residue
# (repo path, author, commit message, commit hash) — not Python source.
# Commented out so the module parses; they should be removed entirely,
# since they also displace the shebang from line 1.
# phdm-21d-embedding / scripts /markdown_to_jsonl.py
# issdandavis
# feat: add markdown export to jsonl dataset pipeline
# 15611e8
#!/usr/bin/env python
"""Convert exported Markdown folders into JSONL dataset splits."""
from __future__ import annotations
import argparse
import hashlib
import json
import random
from pathlib import Path
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the Markdown-to-JSONL build.

    Returns:
        argparse.Namespace with: input_dir (required), output_dir,
        train_ratio, validation_ratio, seed, and min_chars.
    """
    parser = argparse.ArgumentParser(
        description="Build train/validation/test JSONL files from recursive Markdown exports."
    )
    parser.add_argument(
        "--input-dir",
        required=True,
        help="Directory containing exported Markdown files (searched recursively).",
    )
    parser.add_argument(
        "--output-dir",
        default="data",
        help="Directory where split JSONL files are written.",
    )
    parser.add_argument(
        "--train-ratio",
        type=float,
        default=0.9,
        help="Train split ratio.",
    )
    parser.add_argument(
        "--validation-ratio",
        type=float,
        default=0.1,
        help="Validation split ratio.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for deterministic splitting.",
    )
    parser.add_argument(
        "--min-chars",
        type=int,
        default=20,
        help="Minimum cleaned text length to keep a record.",
    )
    return parser.parse_args()
def split_markdown_front_matter(text: str) -> tuple[dict[str, str], str]:
    """Split a leading ``---``-delimited front-matter block off *text*.

    Returns a ``(metadata, body)`` pair.  When the document does not open
    with a well-formed front-matter block (missing opener or closer), the
    metadata dict is empty and the body is the full original text.
    Front-matter lines without a colon are ignored.
    """
    opens_with_marker = text.startswith("---\n")
    close_at = text.find("\n---\n", 4) if opens_with_marker else -1
    if not opens_with_marker or close_at == -1:
        return {}, text

    meta: dict[str, str] = {}
    for raw_line in text[4:close_at].splitlines():
        key, sep, value = raw_line.partition(":")
        if sep:  # keep only "key: value" style lines
            meta[key.strip()] = value.strip()
    # +5 skips past the "\n---\n" closing delimiter.
    return meta, text[close_at + 5 :]
def extract_title(text: str, fallback: str) -> str:
    """Return the first ``# ``-prefixed (level-1) heading in *text*.

    Falls back to *fallback* when no such heading exists.
    """
    headings = (
        line.strip()[2:].strip()
        for line in text.splitlines()
        if line.strip().startswith("# ")
    )
    return next(headings, fallback)
def determine_space(rel_path: Path) -> str:
    """Map a relative path to its top-level folder name (the "space").

    Files sitting directly in the export root get the space ``"default"``.
    """
    parts = rel_path.parts
    return parts[0] if len(parts) > 1 else "default"
def iter_markdown_records(input_dir: Path, min_chars: int) -> list[dict[str, object]]:
    """Collect one JSONL-ready record per Markdown file under *input_dir*.

    Files are discovered with a recursive ``*.md`` glob in sorted order for
    determinism.  A file is skipped when its stripped body (front matter
    removed) is shorter than *min_chars*, or when its content digest —
    sha256 over ``"<relative path>|<text>"``, truncated to 16 hex chars —
    was already emitted.
    """
    records: list[dict[str, object]] = []
    emitted_ids: set[str] = set()
    for md_file in sorted(input_dir.rglob("*.md")):
        rel = md_file.relative_to(input_dir)
        front_matter, body = split_markdown_front_matter(
            md_file.read_text(encoding="utf-8", errors="replace")
        )
        cleaned = body.strip()
        if len(cleaned) < min_chars:
            continue
        digest = hashlib.sha256(f"{rel}|{cleaned}".encode("utf-8")).hexdigest()[:16]
        if digest in emitted_ids:
            continue
        emitted_ids.add(digest)
        # Key order matters: json.dumps preserves insertion order in output.
        records.append(
            {
                "id": digest,
                "source": "perplexity_space_export",
                "space": determine_space(rel),
                "relative_path": rel.as_posix(),
                "title": extract_title(cleaned, md_file.stem),
                "text": cleaned,
                "meta": front_matter,
            }
        )
    return records
def validate_ratios(train_ratio: float, validation_ratio: float) -> None:
    """Validate split ratios; raise ValueError on an impossible split.

    The implied test ratio is ``1 - train_ratio - validation_ratio``.

    Bug fix: the check must tolerate binary floating-point rounding.  With
    the script's own defaults (0.9, 0.1), ``1.0 - 0.9 - 0.1`` evaluates to
    roughly ``-2.8e-17``, so a strict ``test_ratio < 0`` comparison rejected
    the default arguments.  A tiny epsilon absorbs that error while still
    rejecting genuinely over-allocated splits.

    Raises:
        ValueError: if ``train_ratio <= 0``, ``validation_ratio < 0``, or
            the ratios sum to more than 1 (beyond float noise).
    """
    eps = 1e-9  # absorbs float rounding, e.g. 1.0 - 0.9 - 0.1 ≈ -2.8e-17
    test_ratio = 1.0 - train_ratio - validation_ratio
    if train_ratio <= 0 or validation_ratio < 0 or test_ratio < -eps:
        raise ValueError(
            "Invalid split ratios. Require train_ratio > 0 and train_ratio + validation_ratio <= 1."
        )
def split_records(
    records: list[dict[str, object]], train_ratio: float, validation_ratio: float, seed: int
) -> dict[str, list[dict[str, object]]]:
    """Shuffle *records* deterministically and carve train/validation/test.

    The same *seed* always yields the same assignment.  The test split
    receives whatever remains after train and validation are taken.
    """
    pool = list(records)
    rng = random.Random(seed)
    rng.shuffle(pool)

    total = len(pool)
    n_train = int(total * train_ratio)
    n_val = int(total * validation_ratio)
    # Guarantee at least one training row whenever any records exist.
    if total and not n_train:
        n_train = 1
    # Clamp validation so train + validation never exceeds the pool.
    if n_train + n_val > total:
        n_val = max(0, total - n_train)

    cut_a = n_train
    cut_b = n_train + n_val
    return {
        "train": pool[:cut_a],
        "validation": pool[cut_a:cut_b],
        "test": pool[cut_b:],
    }
def write_jsonl(path: Path, records: list[dict[str, object]]) -> None:
    """Serialize *records* as one JSON object per line to *path*.

    Parent directories are created as needed.  Non-ASCII characters are
    written literally (``ensure_ascii=False``) and line endings are forced
    to ``\\n`` regardless of platform.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json.dumps(record, ensure_ascii=False) for record in records]
    with path.open("w", encoding="utf-8", newline="\n") as sink:
        sink.writelines(line + "\n" for line in lines)
def main() -> None:
    """CLI entry point: validate args, gather records, write split files.

    Raises FileNotFoundError when the input directory is missing and exits
    via SystemExit when no usable Markdown records are found.  Empty splits
    produce no output file.
    """
    args = parse_args()
    validate_ratios(args.train_ratio, args.validation_ratio)

    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve()
    if not input_dir.exists():
        raise FileNotFoundError(f"Input directory not found: {input_dir}")

    records = iter_markdown_records(input_dir=input_dir, min_chars=args.min_chars)
    if not records:
        raise SystemExit(f"No valid Markdown records found in: {input_dir}")

    splits = split_records(
        records=records,
        train_ratio=args.train_ratio,
        validation_ratio=args.validation_ratio,
        seed=args.seed,
    )
    for split_name, rows in splits.items():
        if not rows:
            continue  # skip writing empty split files
        out_file = output_dir / f"{split_name}.jsonl"
        write_jsonl(out_file, rows)
        print(f"Wrote {split_name}: {len(rows)} rows -> {out_file}")
    print(f"Total records processed: {len(records)}")
# Script entry point: run the conversion only when executed directly.
if __name__ == "__main__":
    main()