# NOTE(review): the four lines below are GitHub web-UI paste residue
# (repo path, author, commit message, commit hash) — not Python source.
# Commented out so the module parses; they should be removed entirely,
# since they also displace the shebang from line 1.
# phdm-21d-embedding / scripts /markdown_to_jsonl.py
# issdandavis
# feat: add markdown export to jsonl dataset pipeline
# 15611e8
#!/usr/bin/env python
"""Convert exported Markdown folders into JSONL dataset splits."""
from __future__ import annotations
import argparse
import hashlib
import json
import random
from pathlib import Path
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the Markdown-to-JSONL build.

    Returns:
        argparse.Namespace with: input_dir (required), output_dir,
        train_ratio, validation_ratio, seed, and min_chars.
    """
    parser = argparse.ArgumentParser(
        description="Build train/validation/test JSONL files from recursive Markdown exports."
    )
    parser.add_argument(
        "--input-dir",
        required=True,
        help="Directory containing exported Markdown files (searched recursively).",
    )
    parser.add_argument(
        "--output-dir",
        default="data",
        help="Directory where split JSONL files are written.",
    )
    parser.add_argument(
        "--train-ratio",
        type=float,
        default=0.9,
        help="Train split ratio.",
    )
    parser.add_argument(
        "--validation-ratio",
        type=float,
        default=0.1,
        help="Validation split ratio.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for deterministic splitting.",
    )
    parser.add_argument(
        "--min-chars",
        type=int,
        default=20,
        help="Minimum cleaned text length to keep a record.",
    )
    return parser.parse_args()
def split_markdown_front_matter(text: str) -> tuple[dict[str, str], str]:
    """Split a leading ``---``-delimited front-matter block off *text*.

    Returns a ``(metadata, body)`` pair.  When the document does not open
    with a well-formed front-matter block (missing opener or closer), the
    metadata dict is empty and the body is the full original text.
    Front-matter lines without a colon are ignored.
    """
    opens_with_marker = text.startswith("---\n")
    close_at = text.find("\n---\n", 4) if opens_with_marker else -1
    if not opens_with_marker or close_at == -1:
        return {}, text

    meta: dict[str, str] = {}
    for raw_line in text[4:close_at].splitlines():
        key, sep, value = raw_line.partition(":")
        if sep:  # keep only "key: value" style lines
            meta[key.strip()] = value.strip()
    # +5 skips past the "\n---\n" closing delimiter.
    return meta, text[close_at + 5 :]
def extract_title(text: str, fallback: str) -> str:
    """Return the first ``# ``-prefixed (level-1) heading in *text*.

    Falls back to *fallback* when no such heading exists.
    """
    headings = (
        line.strip()[2:].strip()
        for line in text.splitlines()
        if line.strip().startswith("# ")
    )
    return next(headings, fallback)
def determine_space(rel_path: Path) -> str:
    """Map a relative path to its top-level folder name (the "space").

    Files sitting directly in the export root get the space ``"default"``.
    """
    parts = rel_path.parts
    return parts[0] if len(parts) > 1 else "default"
def iter_markdown_records(input_dir: Path, min_chars: int) -> list[dict[str, object]]:
    """Collect one JSONL-ready record per Markdown file under *input_dir*.

    Files are discovered with a recursive ``*.md`` glob in sorted order for
    determinism.  A file is skipped when its stripped body (front matter
    removed) is shorter than *min_chars*, or when its content digest —
    sha256 over ``"<relative path>|<text>"``, truncated to 16 hex chars —
    was already emitted.
    """
    records: list[dict[str, object]] = []
    emitted_ids: set[str] = set()
    for md_file in sorted(input_dir.rglob("*.md")):
        rel = md_file.relative_to(input_dir)
        front_matter, body = split_markdown_front_matter(
            md_file.read_text(encoding="utf-8", errors="replace")
        )
        cleaned = body.strip()
        if len(cleaned) < min_chars:
            continue
        digest = hashlib.sha256(f"{rel}|{cleaned}".encode("utf-8")).hexdigest()[:16]
        if digest in emitted_ids:
            continue
        emitted_ids.add(digest)
        # Key order matters: json.dumps preserves insertion order in output.
        records.append(
            {
                "id": digest,
                "source": "perplexity_space_export",
                "space": determine_space(rel),
                "relative_path": rel.as_posix(),
                "title": extract_title(cleaned, md_file.stem),
                "text": cleaned,
                "meta": front_matter,
            }
        )
    return records
def validate_ratios(train_ratio: float, validation_ratio: float) -> None:
    """Validate split ratios; raise ValueError on an impossible split.

    The implied test ratio is ``1 - train_ratio - validation_ratio``.

    Bug fix: the check must tolerate binary floating-point rounding.  With
    the script's own defaults (0.9, 0.1), ``1.0 - 0.9 - 0.1`` evaluates to
    roughly ``-2.8e-17``, so a strict ``test_ratio < 0`` comparison rejected
    the default arguments.  A tiny epsilon absorbs that error while still
    rejecting genuinely over-allocated splits.

    Raises:
        ValueError: if ``train_ratio <= 0``, ``validation_ratio < 0``, or
            the ratios sum to more than 1 (beyond float noise).
    """
    eps = 1e-9  # absorbs float rounding, e.g. 1.0 - 0.9 - 0.1 ≈ -2.8e-17
    test_ratio = 1.0 - train_ratio - validation_ratio
    if train_ratio <= 0 or validation_ratio < 0 or test_ratio < -eps:
        raise ValueError(
            "Invalid split ratios. Require train_ratio > 0 and train_ratio + validation_ratio <= 1."
        )
def split_records(
    records: list[dict[str, object]], train_ratio: float, validation_ratio: float, seed: int
) -> dict[str, list[dict[str, object]]]:
    """Shuffle *records* deterministically and carve train/validation/test.

    The same *seed* always yields the same assignment.  The test split
    receives whatever remains after train and validation are taken.
    """
    pool = list(records)
    rng = random.Random(seed)
    rng.shuffle(pool)

    total = len(pool)
    n_train = int(total * train_ratio)
    n_val = int(total * validation_ratio)
    # Guarantee at least one training row whenever any records exist.
    if total and not n_train:
        n_train = 1
    # Clamp validation so train + validation never exceeds the pool.
    if n_train + n_val > total:
        n_val = max(0, total - n_train)

    cut_a = n_train
    cut_b = n_train + n_val
    return {
        "train": pool[:cut_a],
        "validation": pool[cut_a:cut_b],
        "test": pool[cut_b:],
    }
def write_jsonl(path: Path, records: list[dict[str, object]]) -> None:
    """Serialize *records* as one JSON object per line to *path*.

    Parent directories are created as needed.  Non-ASCII characters are
    written literally (``ensure_ascii=False``) and line endings are forced
    to ``\\n`` regardless of platform.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json.dumps(record, ensure_ascii=False) for record in records]
    with path.open("w", encoding="utf-8", newline="\n") as sink:
        sink.writelines(line + "\n" for line in lines)
def main() -> None:
    """CLI entry point: validate args, gather records, write split files.

    Raises FileNotFoundError when the input directory is missing and exits
    via SystemExit when no usable Markdown records are found.  Empty splits
    produce no output file.
    """
    args = parse_args()
    validate_ratios(args.train_ratio, args.validation_ratio)

    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve()
    if not input_dir.exists():
        raise FileNotFoundError(f"Input directory not found: {input_dir}")

    records = iter_markdown_records(input_dir=input_dir, min_chars=args.min_chars)
    if not records:
        raise SystemExit(f"No valid Markdown records found in: {input_dir}")

    splits = split_records(
        records=records,
        train_ratio=args.train_ratio,
        validation_ratio=args.validation_ratio,
        seed=args.seed,
    )
    for split_name, rows in splits.items():
        if not rows:
            continue  # skip writing empty split files
        out_file = output_dir / f"{split_name}.jsonl"
        write_jsonl(out_file, rows)
        print(f"Wrote {split_name}: {len(rows)} rows -> {out_file}")
    print(f"Total records processed: {len(records)}")
# Script entry point: run the conversion only when executed directly.
if __name__ == "__main__":
    main()