"""Append extra mentor rows to clean dataset and re-validate.""" import json import re import sys from pathlib import Path ROOT = Path(__file__).resolve().parent.parent CLEAN_FILE = ROOT / "data" / "retro-alpha-clean.jsonl" EXTRA_FILE = ROOT / "data" / "retro-alpha-mentor-extra.jsonl" FINAL_FILE = ROOT / "data" / "retro-alpha-final.jsonl" def parse_mentor(response: str) -> dict | None: try: roast = re.search(r"roast:\s*(.+)", response).group(1).strip() sharpe = float(re.search(r"sharpe_ratio:\s*([-\d.]+)", response).group(1)) lesson = re.search(r"lesson:\s*(.+)", response).group(1).strip() suggestion = re.search(r"suggestion:\s*(.+)", response).group(1).strip() return {"roast": roast, "sharpe_ratio": sharpe, "lesson": lesson, "suggestion": suggestion} except Exception: return None def main(): if not CLEAN_FILE.exists(): print(f"Clean file not found: {CLEAN_FILE}") sys.exit(1) if not EXTRA_FILE.exists(): print(f"Extra file not found: {EXTRA_FILE}") sys.exit(1) # Copy clean file with open(CLEAN_FILE, "r", encoding="utf-8") as f: clean_rows = [line for line in f if line.strip()] # Parse and append extra mentor rows extra_valid = 0 extra_invalid = 0 with open(EXTRA_FILE, "r", encoding="utf-8") as f: for line in f: line = line.strip() if not line: continue row = json.loads(line) if parse_mentor(row.get("response", "")): clean_rows.append(line) extra_valid += 1 else: extra_invalid += 1 with open(FINAL_FILE, "w", encoding="utf-8") as f: for line in clean_rows: f.write(line + "\n") print(f"Extra mentor valid: {extra_valid}, invalid: {extra_invalid}") print(f"Final dataset: {FINAL_FILE} ({len(clean_rows)} rows)") if __name__ == "__main__": main()