issdandavis
/

phdm-21d-embedding

@@ -75,10 +75,23 @@ Push local JSONL files to a dataset repo:
 python scripts/push_jsonl_dataset.py --dataset-id issdandavis/scbe-aethermoore-knowledge-base --train .\data\train.jsonl --validation .\data\validation.jsonl
 ```
 Expected JSONL row format example:
 ```json
-{"text":"Example source content","source":"notion","category":"policy"}
 ```
 ## Related

 python scripts/push_jsonl_dataset.py --dataset-id issdandavis/scbe-aethermoore-knowledge-base --train .\data\train.jsonl --validation .\data\validation.jsonl
 ```
+Convert Perplexity/Markdown exports into JSONL splits:
+```powershell
+python scripts/markdown_to_jsonl.py --input-dir C:\path\to\perplexity-export --output-dir .\data --train-ratio 0.9 --validation-ratio 0.1
+```
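+One-shot flow (convert, then push; pass `--test` only when the converter reports writing `test.jsonl`, because empty splits are skipped):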
+```powershell
+python scripts/markdown_to_jsonl.py --input-dir C:\path\to\perplexity-export --output-dir .\data
+python scripts/push_jsonl_dataset.py --dataset-id issdandavis/your-central-knowledge-base --train .\data\train.jsonl --validation .\data\validation.jsonl --test .\data\test.jsonl
+```
 Expected JSONL row format example:
 ```json
+{"id":"6e4fcd3f34f5b021","source":"perplexity_space_export","space":"SCBE GitHub Deployment","relative_path":"SCBE GitHub Deployment/notes.md","title":"Deployment Notes","text":"Example source content","meta":{"author":"issdandavis"}}
 ```
 ## Related

scripts/markdown_to_jsonl.py ADDED Viewed

	@@ -0,0 +1,191 @@

+#!/usr/bin/env python
+"""Convert exported Markdown folders into JSONL dataset splits."""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import random
+from pathlib import Path
def parse_args() -> argparse.Namespace:
    """Parse the converter's command-line options.

    Returns:
        Namespace with ``input_dir``, ``output_dir``, ``train_ratio``,
        ``validation_ratio``, ``seed`` and ``min_chars``.
    """
    cli = argparse.ArgumentParser(
        description="Build train/validation/test JSONL files from recursive Markdown exports."
    )
    # Options are declared as data so each flag's settings sit side by side.
    # The tuple order is the order the flags appear in ``--help``.
    option_table = (
        (
            "--input-dir",
            {
                "required": True,
                "help": "Directory containing exported Markdown files (searched recursively).",
            },
        ),
        (
            "--output-dir",
            {
                "default": "data",
                "help": "Directory where split JSONL files are written.",
            },
        ),
        (
            "--train-ratio",
            {"type": float, "default": 0.9, "help": "Train split ratio."},
        ),
        (
            "--validation-ratio",
            {"type": float, "default": 0.1, "help": "Validation split ratio."},
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 42,
                "help": "Random seed for deterministic splitting.",
            },
        ),
        (
            "--min-chars",
            {
                "type": int,
                "default": 20,
                "help": "Minimum cleaned text length to keep a record.",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def split_markdown_front_matter(text: str) -> tuple[dict[str, str], str]:
    """Separate a leading ``---`` front-matter block from the Markdown body.

    The block is read as flat ``key: value`` lines: each line is split at its
    first colon, lines without a colon are ignored, and keys and values are
    whitespace-stripped. It is not parsed as YAML. Line endings are expected
    to be ``\\n``.

    Args:
        text: Full contents of a Markdown document.

    Returns:
        ``(meta, body)``. When no complete front-matter block is found,
        ``meta`` is empty and ``body`` is ``text`` unchanged.
    """
    delimiter = "---"
    opener = delimiter + "\n"
    closer = "\n" + delimiter + "\n"

    # A UTF-8 byte-order mark in front of the opener would otherwise hide it.
    candidate = text[1:] if text.startswith("\ufeff") else text
    if not candidate.startswith(opener):
        return {}, text

    # Start the search on the opener's own newline so that an empty block
    # ("---\n---\n") is matched at its real closer rather than at some later
    # horizontal rule in the body.
    end_idx = candidate.find(closer, len(opener) - 1)
    if end_idx != -1:
        body = candidate[end_idx + len(closer):]
    elif candidate.endswith("\n" + delimiter):
        # The closing delimiter is the last line and has no trailing newline.
        end_idx = len(candidate) - len(delimiter) - 1
        body = ""
    else:
        return {}, text

    meta: dict[str, str] = {}
    for line in candidate[len(opener):end_idx].splitlines():
        key, colon, value = line.partition(":")
        if colon:
            meta[key.strip()] = value.strip()
    return meta, body
def extract_title(text: str, fallback: str) -> str:
    """Return the first level-one Markdown heading in *text*, else *fallback*.

    A heading is any line that, once stripped of surrounding whitespace,
    starts with ``"# "``. The returned title has the marker removed and is
    stripped again.
    """
    marker = "# "
    headings = (
        candidate[len(marker):].strip()
        for candidate in map(str.strip, text.splitlines())
        if candidate.startswith(marker)
    )
    return next(headings, fallback)
def determine_space(rel_path: Path) -> str:
    """Name the space a file belongs to, based on its relative path.

    Returns the first path component when the file sits inside at least one
    directory, and ``"default"`` for a file directly at the root.
    """
    segments = rel_path.parts
    return segments[0] if len(segments) >= 2 else "default"
def iter_markdown_records(input_dir: Path, min_chars: int) -> list[dict[str, object]]:
    """Collect one dataset record per Markdown file found under *input_dir*.

    Files are visited in sorted path order. Each file's front matter is split
    off, the remaining body is stripped, and files whose stripped body is
    shorter than *min_chars* are skipped.

    Args:
        input_dir: Root directory searched recursively for ``*.md`` files.
        min_chars: Minimum stripped body length for a file to be kept.

    Returns:
        Records with the keys ``id``, ``source``, ``space``,
        ``relative_path``, ``title``, ``text`` and ``meta``.
    """
    records: list[dict[str, object]] = []
    seen_ids: set[str] = set()
    for file_path in sorted(input_dir.rglob("*.md")):
        # The glob also matches directories whose name ends in ".md";
        # reading one of those would raise.
        if not file_path.is_file():
            continue
        rel_path = file_path.relative_to(input_dir)
        # Use the POSIX spelling for both the stored path and the hash input,
        # so the same export yields the same ids on Windows and POSIX.
        rel_posix = rel_path.as_posix()
        raw = file_path.read_text(encoding="utf-8", errors="replace")
        front_matter, body = split_markdown_front_matter(raw)
        text = body.strip()
        if len(text) < min_chars:
            continue
        digest = hashlib.sha256(f"{rel_posix}|{text}".encode("utf-8"))
        record_id = digest.hexdigest()[:16]
        # The path is part of the hash input, so this guard only triggers on
        # a collision of the truncated digest.
        if record_id in seen_ids:
            continue
        seen_ids.add(record_id)
        records.append(
            {
                "id": record_id,
                "source": "perplexity_space_export",
                "space": determine_space(rel_path),
                "relative_path": rel_posix,
                "title": extract_title(text, file_path.stem),
                "text": text,
                "meta": front_matter,
            }
        )
    return records
def validate_ratios(train_ratio: float, validation_ratio: float) -> None:
    """Check that the split ratios describe a usable partition.

    The test ratio is implied as whatever remains after train and validation.

    Args:
        train_ratio: Fraction of records for the train split; must be > 0.
        validation_ratio: Fraction for the validation split; must be >= 0.

    Raises:
        ValueError: If a ratio is out of range (or NaN), or the two ratios
            sum to more than 1.
    """
    # Compare the sum against 1 with a small tolerance instead of testing
    # ``1.0 - train - validation < 0``. In binary floating point that
    # subtraction is about -2.8e-17 for the default 0.9 / 0.1, which wrongly
    # rejected the script's own defaults.
    tolerance = 1e-9
    ratios_ok = (
        train_ratio > 0
        and validation_ratio >= 0
        and train_ratio + validation_ratio <= 1.0 + tolerance
    )
    # Phrased as "not ok" so NaN, which fails every comparison, is rejected.
    if not ratios_ok:
        raise ValueError(
            "Invalid split ratios. Require train_ratio > 0 and train_ratio + validation_ratio <= 1."
        )
def split_records(
    records: list[dict[str, object]], train_ratio: float, validation_ratio: float, seed: int
) -> dict[str, list[dict[str, object]]]:
    """Shuffle a copy of *records* and cut it into train/validation/test.

    The shuffle is seeded, so the same inputs give the same partition. Split
    sizes are the ratios times the record count, truncated to integers;
    whatever is left after train and validation becomes the test split.

    Args:
        records: Records to partition; the list itself is not modified.
        train_ratio: Fraction of records for the train split.
        validation_ratio: Fraction of records for the validation split.
        seed: Seed for the shuffle.

    Returns:
        Mapping with the keys ``"train"``, ``"validation"`` and ``"test"``,
        in that order.
    """
    pool = [*records]
    rng = random.Random(seed)
    rng.shuffle(pool)

    total = len(pool)
    train_count = int(total * train_ratio)
    validation_count = int(total * validation_ratio)

    # A non-empty dataset always contributes at least one training record.
    if train_count == 0 and total > 0:
        train_count = 1

    # Shrink validation when the two counts together overshoot the pool.
    overshoot = train_count + validation_count - total
    if overshoot > 0:
        validation_count = max(0, total - train_count)

    validation_end = train_count + validation_count
    return {
        "train": pool[:train_count],
        "validation": pool[train_count:validation_end],
        "test": pool[validation_end:],
    }
def write_jsonl(path: Path, records: list[dict[str, object]]) -> None:
    """Write *records* to *path* as JSON Lines, one object per line.

    Missing parent directories are created and an existing file is
    overwritten. Output is UTF-8 with ``\\n`` line endings, and non-ASCII
    characters are written as-is rather than escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="\n") as sink:
        sink.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in records
        )
def main() -> None:
    """Run the conversion: parse options, read Markdown, split, write JSONL.

    One ``<split>.jsonl`` file is written per non-empty split into the output
    directory, followed by a summary line on stdout.

    Raises:
        ValueError: If the split ratios are invalid.
        FileNotFoundError: If the input directory does not exist.
        SystemExit: If no Markdown file produced a usable record.
    """
    options = parse_args()
    # Ratios are checked first so a bad value fails before any filesystem work.
    validate_ratios(options.train_ratio, options.validation_ratio)

    source_root = Path(options.input_dir).expanduser().resolve()
    target_root = Path(options.output_dir).expanduser().resolve()
    if not source_root.exists():
        raise FileNotFoundError(f"Input directory not found: {source_root}")

    collected = iter_markdown_records(input_dir=source_root, min_chars=options.min_chars)
    if not collected:
        raise SystemExit(f"No valid Markdown records found in: {source_root}")

    partitions = split_records(
        records=collected,
        train_ratio=options.train_ratio,
        validation_ratio=options.validation_ratio,
        seed=options.seed,
    )

    # Empty splits produce no file at all.
    populated = ((name, rows) for name, rows in partitions.items() if rows)
    for name, rows in populated:
        destination = target_root / f"{name}.jsonl"
        write_jsonl(destination, rows)
        print(f"Wrote {name}: {len(rows)} rows -> {destination}")

    print(f"Total records processed: {len(collected)}")


if __name__ == "__main__":
    main()