issdandavis committed on
Commit
15611e8
·
1 Parent(s): f47473f

feat: add markdown export to jsonl dataset pipeline

Browse files
Files changed (2) hide show
  1. README.md +14 -1
  2. scripts/markdown_to_jsonl.py +191 -0
README.md CHANGED
@@ -75,10 +75,23 @@ Push local JSONL files to a dataset repo:
75
  python scripts/push_jsonl_dataset.py --dataset-id issdandavis/scbe-aethermoore-knowledge-base --train .\data\train.jsonl --validation .\data\validation.jsonl
76
  ```
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  Expected JSONL row format example:
79
 
80
  ```json
81
- {"text":"Example source content","source":"notion","category":"policy"}
82
  ```
83
 
84
  ## Related
 
75
  python scripts/push_jsonl_dataset.py --dataset-id issdandavis/scbe-aethermoore-knowledge-base --train .\data\train.jsonl --validation .\data\validation.jsonl
76
  ```
77
 
78
+ Convert Perplexity/Markdown exports into JSONL splits:
79
+
80
+ ```powershell
81
+ python scripts/markdown_to_jsonl.py --input-dir C:\path\to\perplexity-export --output-dir .\data --train-ratio 0.9 --validation-ratio 0.1
82
+ ```
83
+
84
+ One-shot flow (convert then push):
85
+
86
+ ```powershell
87
+ python scripts/markdown_to_jsonl.py --input-dir C:\path\to\perplexity-export --output-dir .\data
88
+ python scripts/push_jsonl_dataset.py --dataset-id issdandavis/your-central-knowledge-base --train .\data\train.jsonl --validation .\data\validation.jsonl --test .\data\test.jsonl
89
+ ```
90
+
91
  Expected JSONL row format example:
92
 
93
  ```json
94
+ {"id":"6e4fcd3f34f5b021","source":"perplexity_space_export","space":"SCBE GitHub Deployment","relative_path":"SCBE GitHub Deployment/notes.md","title":"Deployment Notes","text":"Example source content","meta":{"author":"issdandavis"}}
95
  ```
96
 
97
  ## Related
scripts/markdown_to_jsonl.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """Convert exported Markdown folders into JSONL dataset splits."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import hashlib
8
+ import json
9
+ import random
10
+ from pathlib import Path
11
+
12
+
13
+ def parse_args() -> argparse.Namespace:
14
+ parser = argparse.ArgumentParser(
15
+ description="Build train/validation/test JSONL files from recursive Markdown exports."
16
+ )
17
+ parser.add_argument(
18
+ "--input-dir",
19
+ required=True,
20
+ help="Directory containing exported Markdown files (searched recursively).",
21
+ )
22
+ parser.add_argument(
23
+ "--output-dir",
24
+ default="data",
25
+ help="Directory where split JSONL files are written.",
26
+ )
27
+ parser.add_argument(
28
+ "--train-ratio",
29
+ type=float,
30
+ default=0.9,
31
+ help="Train split ratio.",
32
+ )
33
+ parser.add_argument(
34
+ "--validation-ratio",
35
+ type=float,
36
+ default=0.1,
37
+ help="Validation split ratio.",
38
+ )
39
+ parser.add_argument(
40
+ "--seed",
41
+ type=int,
42
+ default=42,
43
+ help="Random seed for deterministic splitting.",
44
+ )
45
+ parser.add_argument(
46
+ "--min-chars",
47
+ type=int,
48
+ default=20,
49
+ help="Minimum cleaned text length to keep a record.",
50
+ )
51
+ return parser.parse_args()
52
+
53
+
54
def split_markdown_front_matter(text: str) -> tuple[dict[str, str], str]:
    """Split an optional front-matter block from *text*.

    The block must open with a ``---`` line at the very start of the document
    and close with another ``---`` line; simple ``key: value`` pairs inside it
    become the metadata dict. Anything else returns an empty dict and the
    original text untouched.
    """
    has_opener = text.startswith("---\n")
    close_at = text.find("\n---\n", 4) if has_opener else -1
    if close_at == -1:
        # No well-formed front matter: the whole input is the body.
        return {}, text

    meta: dict[str, str] = {}
    for raw_line in text[4:close_at].splitlines():
        key, sep, value = raw_line.partition(":")
        if sep:  # lines without a colon are silently skipped
            meta[key.strip()] = value.strip()

    return meta, text[close_at + 5 :]
73
+
74
+
75
def extract_title(text: str, fallback: str) -> str:
    """Return the first level-1 Markdown heading in *text*, else *fallback*."""
    headings = (
        candidate.strip()[2:].strip()
        for candidate in text.splitlines()
        if candidate.strip().startswith("# ")
    )
    return next(headings, fallback)
81
+
82
+
83
def determine_space(rel_path: Path) -> str:
    """Return the top-level folder of *rel_path* as the "space" name.

    Files sitting directly in the export root (no parent folder) fall back
    to the literal space name "default".
    """
    parts = rel_path.parts
    return parts[0] if len(parts) > 1 else "default"
87
+
88
+
89
def iter_markdown_records(input_dir: Path, min_chars: int) -> list[dict[str, object]]:
    """Walk *input_dir* recursively and build one record per usable .md file.

    Files whose stripped body is shorter than *min_chars* are skipped, and
    duplicate (path, text) combinations are emitted only once. Sorting the
    discovered paths keeps output order deterministic across runs.
    """
    emitted: set[str] = set()
    results: list[dict[str, object]] = []

    for md_file in sorted(input_dir.rglob("*.md")):
        rel = md_file.relative_to(input_dir)
        front_matter, body = split_markdown_front_matter(
            md_file.read_text(encoding="utf-8", errors="replace")
        )
        cleaned = body.strip()
        if len(cleaned) < min_chars:
            continue

        # Short content-addressed id: hashing path + text keeps identical
        # notes in different folders distinct while deduping true copies.
        digest = hashlib.sha256(f"{rel}|{cleaned}".encode("utf-8")).hexdigest()[:16]
        if digest in emitted:
            continue
        emitted.add(digest)

        results.append(
            {
                "id": digest,
                "source": "perplexity_space_export",
                "space": determine_space(rel),
                "relative_path": rel.as_posix(),
                "title": extract_title(cleaned, md_file.stem),
                "text": cleaned,
                "meta": front_matter,
            }
        )

    return results
121
+
122
+
123
def validate_ratios(train_ratio: float, validation_ratio: float) -> None:
    """Validate split ratios, raising ValueError when they cannot form splits.

    Requires ``train_ratio > 0`` and ``train_ratio + validation_ratio <= 1``
    (the remainder becomes the test split).

    A small tolerance absorbs binary floating-point error: with the documented
    defaults, ``1.0 - 0.9 - 0.1`` evaluates to roughly ``-1.4e-17``, so the
    previous exact ``test_ratio < 0`` check wrongly rejected valid input.

    Raises:
        ValueError: If the ratios are non-positive / negative or sum past 1.
    """
    eps = 1e-9  # tolerance for float rounding in user-supplied ratios
    test_ratio = 1.0 - train_ratio - validation_ratio
    if train_ratio <= 0 or validation_ratio < 0 or test_ratio < -eps:
        raise ValueError(
            "Invalid split ratios. Require train_ratio > 0 and train_ratio + validation_ratio <= 1."
        )
129
+
130
+
131
def split_records(
    records: list[dict[str, object]], train_ratio: float, validation_ratio: float, seed: int
) -> dict[str, list[dict[str, object]]]:
    """Shuffle *records* deterministically and cut them into three splits.

    Guarantees at least one training row whenever any records exist, and
    shrinks the validation slice if the rounded counts would overflow the
    total. Whatever remains after train + validation becomes the test split.
    """
    pool = list(records)
    random.Random(seed).shuffle(pool)

    total = len(pool)
    train_count = int(total * train_ratio)
    val_count = int(total * validation_ratio)

    # Never emit an empty train split for a non-empty dataset.
    if total and not train_count:
        train_count = 1
    # Integer truncation can still push train + validation past the total;
    # clamp validation rather than dropping records.
    if train_count + val_count > total:
        val_count = max(0, total - train_count)

    cut = train_count + val_count
    return {
        "train": pool[:train_count],
        "validation": pool[train_count:cut],
        "test": pool[cut:],
    }
150
+
151
+
152
def write_jsonl(path: Path, records: list[dict[str, object]]) -> None:
    """Serialize *records* to *path* as UTF-8 JSON Lines, creating parent dirs.

    ``ensure_ascii=False`` keeps non-ASCII text readable; the explicit
    ``newline`` argument pins Unix line endings regardless of platform.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = "".join(json.dumps(record, ensure_ascii=False) + "\n" for record in records)
    with path.open("w", encoding="utf-8", newline="\n") as sink:
        sink.write(payload)
157
+
158
+
159
def main() -> None:
    """CLI entry point: convert a Markdown export tree into JSONL splits."""
    args = parse_args()
    validate_ratios(args.train_ratio, args.validation_ratio)

    source_root = Path(args.input_dir).expanduser().resolve()
    target_root = Path(args.output_dir).expanduser().resolve()
    if not source_root.exists():
        raise FileNotFoundError(f"Input directory not found: {source_root}")

    records = iter_markdown_records(input_dir=source_root, min_chars=args.min_chars)
    if not records:
        raise SystemExit(f"No valid Markdown records found in: {source_root}")

    splits = split_records(
        records=records,
        train_ratio=args.train_ratio,
        validation_ratio=args.validation_ratio,
        seed=args.seed,
    )

    # Write only non-empty splits so an all-train run doesn't create empty files.
    for name, rows in splits.items():
        if not rows:
            continue
        destination = target_root / f"{name}.jsonl"
        write_jsonl(destination, rows)
        print(f"Wrote {name}: {len(rows)} rows -> {destination}")

    print(f"Total records processed: {len(records)}")


if __name__ == "__main__":
    main()