|
|
|
|
|
"""Convert exported Markdown folders into JSONL dataset splits.""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import hashlib |
|
|
import json |
|
|
import random |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the dataset builder.

    Args:
        argv: Optional explicit argument list (useful for testing). When
            ``None`` (the default, preserving the original call contract),
            arguments are read from ``sys.argv[1:]``.

    Returns:
        Parsed namespace with ``input_dir``, ``output_dir``, ``train_ratio``,
        ``validation_ratio``, ``seed``, and ``min_chars`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Build train/validation/test JSONL files from recursive Markdown exports."
    )
    parser.add_argument(
        "--input-dir",
        required=True,
        help="Directory containing exported Markdown files (searched recursively).",
    )
    parser.add_argument(
        "--output-dir",
        default="data",
        help="Directory where split JSONL files are written.",
    )
    parser.add_argument(
        "--train-ratio",
        type=float,
        default=0.9,
        help="Train split ratio.",
    )
    parser.add_argument(
        "--validation-ratio",
        type=float,
        default=0.1,
        help="Validation split ratio.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for deterministic splitting.",
    )
    parser.add_argument(
        "--min-chars",
        type=int,
        default=20,
        help="Minimum cleaned text length to keep a record.",
    )
    # Passing argv through (None -> sys.argv[1:]) keeps behavior identical
    # for existing callers while allowing deterministic tests.
    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
def split_markdown_front_matter(text: str) -> tuple[dict[str, str], str]:
    """Split an optional front-matter block off the top of a Markdown document.

    Front matter is delimited by a leading ``---`` line and a closing ``---``
    line; only simple ``key: value`` lines inside it are collected.

    Returns:
        A ``(metadata, body)`` pair. When no well-formed front-matter block is
        present, metadata is empty and the body is the unmodified input.
    """
    if not text.startswith("---\n"):
        return {}, text

    closing = text.find("\n---\n", 4)
    if closing == -1:
        # Opening fence without a closing fence: treat everything as body.
        return {}, text

    metadata: dict[str, str] = {}
    for raw_line in text[4:closing].splitlines():
        key, sep, value = raw_line.partition(":")
        if sep:  # lines without a colon are silently skipped
            metadata[key.strip()] = value.strip()

    return metadata, text[closing + 5 :]
|
|
|
|
|
|
|
|
def extract_title(text: str, fallback: str) -> str:
    """Return the first level-1 Markdown heading in *text*, or *fallback*."""
    h1_titles = (
        candidate.strip()[2:].strip()
        for candidate in text.splitlines()
        if candidate.strip().startswith("# ")
    )
    return next(h1_titles, fallback)
|
|
|
|
|
|
|
|
def determine_space(rel_path: Path) -> str:
    """Name the 'space' a record belongs to: its top-level folder, else 'default'."""
    parts = rel_path.parts
    return parts[0] if len(parts) > 1 else "default"
|
|
|
|
|
|
|
|
def iter_markdown_records(input_dir: Path, min_chars: int) -> list[dict[str, object]]:
    """Collect one JSON-ready record per Markdown file under *input_dir*.

    Files are visited in sorted order for determinism. A file is dropped when
    its cleaned body is shorter than *min_chars*, or when an identical
    (relative path, text) pair was already seen (content-hash deduplication).
    """
    records: list[dict[str, object]] = []
    known_ids: set[str] = set()

    for md_file in sorted(input_dir.rglob("*.md")):
        relative = md_file.relative_to(input_dir)
        contents = md_file.read_text(encoding="utf-8", errors="replace")
        front_matter, body = split_markdown_front_matter(contents)
        cleaned = body.strip()

        if len(cleaned) < min_chars:
            continue

        # First 16 hex chars of SHA-256 over path+text: short but stable id.
        digest = hashlib.sha256(f"{relative}|{cleaned}".encode("utf-8")).hexdigest()
        record_id = digest[:16]
        if record_id in known_ids:
            continue
        known_ids.add(record_id)

        records.append(
            {
                "id": record_id,
                "source": "perplexity_space_export",
                "space": determine_space(relative),
                "relative_path": relative.as_posix(),
                "title": extract_title(cleaned, md_file.stem),
                "text": cleaned,
                "meta": front_matter,
            }
        )

    return records
|
|
|
|
|
|
|
|
def validate_ratios(train_ratio: float, validation_ratio: float) -> None:
    """Validate that the requested ratios form a legal 3-way split.

    The test split is implicitly ``1 - train_ratio - validation_ratio``.

    A small epsilon absorbs binary floating-point error: with the script's
    own defaults (0.9, 0.1) the expression ``1.0 - 0.9 - 0.1`` evaluates to
    roughly -2.8e-17, which a strict ``test_ratio < 0`` check wrongly
    rejects — the previous implementation raised ValueError on its defaults.

    Raises:
        ValueError: if ``train_ratio <= 0``, ``validation_ratio < 0``, or the
            two ratios sum to meaningfully more than 1.
    """
    epsilon = 1e-9  # tolerance for float rounding in the ratio sum
    test_ratio = 1.0 - train_ratio - validation_ratio
    if train_ratio <= 0 or validation_ratio < 0 or test_ratio < -epsilon:
        raise ValueError(
            "Invalid split ratios. Require train_ratio > 0 and train_ratio + validation_ratio <= 1."
        )
|
|
|
|
|
|
|
|
def split_records(
    records: list[dict[str, object]], train_ratio: float, validation_ratio: float, seed: int
) -> dict[str, list[dict[str, object]]]:
    """Shuffle *records* deterministically and cut them into three splits.

    Guarantees at least one training record whenever any records exist, and
    shrinks the validation slice if the requested counts would overflow the
    total. The remainder after train and validation becomes the test split.
    """
    pool = records[:]
    random.Random(seed).shuffle(pool)

    total = len(pool)
    train_count = int(total * train_ratio)
    validation_count = int(total * validation_ratio)

    # A non-empty dataset always contributes at least one training row.
    if total and not train_count:
        train_count = 1
    # Clamp validation so the first two slices never exceed the pool.
    if train_count + validation_count > total:
        validation_count = max(0, total - train_count)

    boundary = train_count + validation_count
    return {
        "train": pool[:train_count],
        "validation": pool[train_count:boundary],
        "test": pool[boundary:],
    }
|
|
|
|
|
|
|
|
def write_jsonl(path: Path, records: list[dict[str, object]]) -> None:
    """Serialize *records* to *path* as UTF-8 JSON Lines, creating parent dirs.

    Uses ``ensure_ascii=False`` so non-ASCII text is written verbatim, and
    ``newline="\\n"`` for platform-independent line endings.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = (json.dumps(record, ensure_ascii=False) for record in records)
    with path.open("w", encoding="utf-8", newline="\n") as sink:
        sink.writelines(line + "\n" for line in serialized)
|
|
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: collect Markdown records, split them, write JSONL files.

    Raises:
        FileNotFoundError: when the input directory does not exist.
        SystemExit: when no Markdown file yields a valid record.
    """
    args = parse_args()
    validate_ratios(args.train_ratio, args.validation_ratio)

    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve()
    if not input_dir.exists():
        raise FileNotFoundError(f"Input directory not found: {input_dir}")

    records = iter_markdown_records(input_dir=input_dir, min_chars=args.min_chars)
    if not records:
        raise SystemExit(f"No valid Markdown records found in: {input_dir}")

    splits = split_records(
        records=records,
        train_ratio=args.train_ratio,
        validation_ratio=args.validation_ratio,
        seed=args.seed,
    )

    # An empty split produces no file at all rather than an empty one.
    for split_name, split_records_data in splits.items():
        if split_records_data:
            out_file = output_dir / f"{split_name}.jsonl"
            write_jsonl(out_file, split_records_data)
            print(f"Wrote {split_name}: {len(split_records_data)} rows -> {out_file}")

    print(f"Total records processed: {len(records)}")
|
|
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|