# retro/scripts/merge_datasets.py
# Last commit by sankalphs (122cc3c):
#   Phase 1: project setup, MiniMax-M3 synthetic dataset generation, 1446-row clean dataset
# Size: 1.97 kB
"""Append extra mentor rows to clean dataset and re-validate."""
import json
import re
import sys
from pathlib import Path
# Project root: the directory above the one that contains this script.
ROOT = Path(__file__).resolve().parent.parent

# Base rows that are copied to the output unchanged.
CLEAN_FILE = ROOT.joinpath("data", "retro-alpha-clean.jsonl")
# Candidate mentor rows; only those that parse successfully are appended.
EXTRA_FILE = ROOT.joinpath("data", "retro-alpha-mentor-extra.jsonl")
# Destination for the merged dataset.
FINAL_FILE = ROOT.joinpath("data", "retro-alpha-final.jsonl")
def parse_mentor(response: str) -> dict | None:
try:
roast = re.search(r"roast:\s*(.+)", response).group(1).strip()
sharpe = float(re.search(r"sharpe_ratio:\s*([-\d.]+)", response).group(1))
lesson = re.search(r"lesson:\s*(.+)", response).group(1).strip()
suggestion = re.search(r"suggestion:\s*(.+)", response).group(1).strip()
return {"roast": roast, "sharpe_ratio": sharpe, "lesson": lesson, "suggestion": suggestion}
except Exception:
return None
def main():
    """Merge validated extra mentor rows into the clean dataset.

    Every non-blank line of ``CLEAN_FILE`` is kept as-is. Each non-blank
    line of ``EXTRA_FILE`` is appended only if it is a JSON object whose
    ``response`` field passes ``parse_mentor``; all other extra lines are
    counted as invalid. The result is written to ``FINAL_FILE`` with one row
    per line, and a summary is printed.

    Exits with status 1 if either input file does not exist.
    """
    if not CLEAN_FILE.exists():
        print(f"Clean file not found: {CLEAN_FILE}")
        sys.exit(1)
    if not EXTRA_FILE.exists():
        print(f"Extra file not found: {EXTRA_FILE}")
        sys.exit(1)

    # Strip each kept line: iterating a file yields lines that still carry
    # their trailing newline, and one is added back on write. Keeping the
    # original newline would leave a blank line after every clean row.
    with open(CLEAN_FILE, "r", encoding="utf-8") as f:
        merged_rows = [stripped for line in f if (stripped := line.strip())]

    extra_valid = 0
    extra_invalid = 0
    with open(EXTRA_FILE, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # One malformed line should not abort the whole merge;
                # count it as invalid so it shows up in the summary.
                row = None
            # Only a JSON object can carry a "response" field.
            response = row.get("response", "") if isinstance(row, dict) else None
            if isinstance(response, str) and parse_mentor(response):
                merged_rows.append(line)
                extra_valid += 1
            else:
                extra_invalid += 1

    with open(FINAL_FILE, "w", encoding="utf-8") as f:
        f.writelines(f"{row}\n" for row in merged_rows)

    print(f"Extra mentor valid: {extra_valid}, invalid: {extra_invalid}")
    print(f"Final dataset: {FINAL_FILE} ({len(merged_rows)} rows)")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()