#!/usr/bin/env python
"""Convert exported Markdown folders into JSONL dataset splits."""

from __future__ import annotations

import argparse
import hashlib
import json
import random
from pathlib import Path

# Slack allowed when checking that the split ratios do not exceed 1.0.
# Needed because e.g. ``1.0 - 0.9 - 0.1`` is a tiny negative number in
# binary floating point, which would otherwise reject the default ratios.
_RATIO_TOLERANCE = 1e-9


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the script.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read the arguments from ``sys.argv``.

    Returns:
        Namespace with ``input_dir``, ``output_dir``, ``train_ratio``,
        ``validation_ratio``, ``seed`` and ``min_chars``.
    """
    parser = argparse.ArgumentParser(
        description="Build train/validation/test JSONL files from recursive Markdown exports."
    )
    parser.add_argument(
        "--input-dir",
        required=True,
        help="Directory containing exported Markdown files (searched recursively).",
    )
    parser.add_argument(
        "--output-dir",
        default="data",
        help="Directory where split JSONL files are written.",
    )
    parser.add_argument(
        "--train-ratio",
        type=float,
        default=0.9,
        help="Train split ratio.",
    )
    parser.add_argument(
        "--validation-ratio",
        type=float,
        default=0.1,
        help="Validation split ratio.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for deterministic splitting.",
    )
    parser.add_argument(
        "--min-chars",
        type=int,
        default=20,
        help="Minimum cleaned text length to keep a record.",
    )
    return parser.parse_args(argv)


def split_markdown_front_matter(text: str) -> tuple[dict[str, str], str]:
    """Separate a leading ``---`` delimited front-matter block from the body.

    The block is parsed as simple ``key: value`` lines (split on the first
    colon, both sides stripped); lines without a colon are ignored.

    Args:
        text: Full file content.

    Returns:
        ``(meta, body)``. When ``text`` does not start with ``---`` on its own
        line, or no closing ``---`` line is found, ``meta`` is empty and
        ``body`` is ``text`` unchanged.
    """
    opening = "---\n"
    closing = "\n---\n"
    if not text.startswith(opening):
        return {}, text

    # Start at the newline that ends the opening delimiter so that an empty
    # front-matter block ("---\n---\n") is recognised as well.
    end_idx = text.find(closing, len(opening) - 1)
    if end_idx == -1:
        return {}, text

    block = text[len(opening):end_idx]
    body = text[end_idx + len(closing):]

    meta: dict[str, str] = {}
    for line in block.splitlines():
        key, sep, value = line.partition(":")
        if not sep:
            continue
        meta[key.strip()] = value.strip()
    return meta, body


def extract_title(text: str, fallback: str) -> str:
    """Return the text of the first ``# `` heading line, or ``fallback``.

    Args:
        text: Markdown body to scan line by line.
        fallback: Value returned when no line (after stripping whitespace)
            starts with ``"# "``.
    """
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith("# "):
            return stripped[2:].strip()
    return fallback


def determine_space(rel_path: Path) -> str:
    """Return the first path component of ``rel_path`` as the space name.

    A path with a single component (a file directly in the input directory)
    maps to ``"default"``.
    """
    parts = rel_path.parts
    return parts[0] if len(parts) > 1 else "default"


def iter_markdown_records(input_dir: Path, min_chars: int) -> list[dict[str, object]]:
    """Collect one record per usable ``*.md`` file below ``input_dir``.

    Files are visited in sorted path order and read as UTF-8 with undecodable
    bytes replaced. Front matter is split off, the body is stripped, and
    bodies shorter than ``min_chars`` are skipped. The record id is the first
    16 hex digits of the SHA-256 of ``"<relative path>|<text>"``; a record
    whose id was already seen is skipped.

    Args:
        input_dir: Root directory searched recursively.
        min_chars: Minimum stripped body length for a file to be kept.

    Returns:
        List of JSON-serialisable record dictionaries.
    """
    records: list[dict[str, object]] = []
    seen_ids: set[str] = set()

    for file_path in sorted(input_dir.rglob("*.md")):
        # rglob matches on the name only; a directory called "x.md" would
        # make read_text() fail, so skip anything that is not a file.
        if not file_path.is_file():
            continue

        rel_path = file_path.relative_to(input_dir)
        raw = file_path.read_text(encoding="utf-8", errors="replace")
        front_matter, body = split_markdown_front_matter(raw)
        text = body.strip()
        if len(text) < min_chars:
            continue

        record_id = hashlib.sha256(f"{rel_path}|{text}".encode("utf-8")).hexdigest()[:16]
        if record_id in seen_ids:
            continue
        seen_ids.add(record_id)

        records.append(
            {
                "id": record_id,
                "source": "perplexity_space_export",
                "space": determine_space(rel_path),
                "relative_path": rel_path.as_posix(),
                "title": extract_title(text, file_path.stem),
                "text": text,
                "meta": front_matter,
            }
        )
    return records


def validate_ratios(train_ratio: float, validation_ratio: float) -> None:
    """Check that the split ratios describe a valid partition.

    The test ratio is implied as ``1 - train_ratio - validation_ratio``.

    Args:
        train_ratio: Fraction of records for the train split; must be > 0.
        validation_ratio: Fraction for the validation split; must be >= 0.

    Raises:
        ValueError: If a ratio is out of range (or NaN), or the two ratios
            sum to more than 1 beyond floating-point rounding error.
    """
    test_ratio = 1.0 - train_ratio - validation_ratio
    # Written as positive conditions so that NaN (for which every comparison
    # is False) is rejected rather than silently accepted.
    ratios_ok = (
        train_ratio > 0
        and validation_ratio >= 0
        and test_ratio >= -_RATIO_TOLERANCE
    )
    if not ratios_ok:
        raise ValueError(
            "Invalid split ratios. Require train_ratio > 0 and train_ratio + validation_ratio <= 1."
        )


def split_records(
    records: list[dict[str, object]], train_ratio: float, validation_ratio: float, seed: int
) -> dict[str, list[dict[str, object]]]:
    """Shuffle ``records`` deterministically and cut them into three splits.

    Split sizes are the ratios times the record count, truncated to integers.
    A non-empty input always yields at least one train record, validation is
    clamped so both sizes fit, and every remaining record (including those
    left over by truncation) goes to the test split. The input list is not
    modified.

    Args:
        records: Records to distribute.
        train_ratio: Fraction of records for the train split.
        validation_ratio: Fraction of records for the validation split.
        seed: Seed for the shuffle.

    Returns:
        Mapping with the keys ``"train"``, ``"validation"`` and ``"test"``.
    """
    shuffled = list(records)
    random.Random(seed).shuffle(shuffled)

    n_total = len(shuffled)
    n_train = int(n_total * train_ratio)
    n_validation = int(n_total * validation_ratio)

    if n_total > 0 and n_train == 0:
        n_train = 1
    if n_train + n_validation > n_total:
        n_validation = max(0, n_total - n_train)

    validation_end = n_train + n_validation
    return {
        "train": shuffled[:n_train],
        "validation": shuffled[n_train:validation_end],
        "test": shuffled[validation_end:],
    }


def write_jsonl(path: Path, records: list[dict[str, object]]) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line.

    Missing parent directories are created. Non-ASCII characters are written
    as-is and lines always end with ``\\n``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="\n") as handle:
        for record in records:
            handle.write(json.dumps(record, ensure_ascii=False) + "\n")


def main(argv: list[str] | None = None) -> None:
    """Run the conversion: parse options, collect records, write the splits.

    Args:
        argv: Optional argument list forwarded to :func:`parse_args`.

    Raises:
        ValueError: If the split ratios are invalid.
        FileNotFoundError: If the input directory does not exist.
        NotADirectoryError: If the input path exists but is not a directory.
        SystemExit: If no usable Markdown record is found.
    """
    args = parse_args(argv)
    validate_ratios(args.train_ratio, args.validation_ratio)

    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve()

    if not input_dir.exists():
        raise FileNotFoundError(f"Input directory not found: {input_dir}")
    if not input_dir.is_dir():
        raise NotADirectoryError(f"Input path is not a directory: {input_dir}")

    records = iter_markdown_records(input_dir=input_dir, min_chars=args.min_chars)
    if not records:
        raise SystemExit(f"No valid Markdown records found in: {input_dir}")

    splits = split_records(
        records=records,
        train_ratio=args.train_ratio,
        validation_ratio=args.validation_ratio,
        seed=args.seed,
    )

    for split_name, split_records_data in splits.items():
        # Empty splits produce no file at all.
        if not split_records_data:
            continue
        out_file = output_dir / f"{split_name}.jsonl"
        write_jsonl(out_file, split_records_data)
        print(f"Wrote {split_name}: {len(split_records_data)} rows -> {out_file}")

    print(f"Total records processed: {len(records)}")


if __name__ == "__main__":
    main()