Spaces:
Running
Running
File size: 1,974 Bytes
"""Append extra mentor rows to the clean dataset and re-validate."""
import json
import re
import sys
from pathlib import Path
# Base directory: the grandparent of this file's resolved location.
ROOT = Path(__file__).resolve().parent.parent

# Input and output JSONL files, all located under ROOT/data.
CLEAN_FILE = ROOT.joinpath("data", "retro-alpha-clean.jsonl")
EXTRA_FILE = ROOT.joinpath("data", "retro-alpha-mentor-extra.jsonl")
FINAL_FILE = ROOT.joinpath("data", "retro-alpha-final.jsonl")
def parse_mentor(response: str) -> dict | None:
try:
roast = re.search(r"roast:\s*(.+)", response).group(1).strip()
sharpe = float(re.search(r"sharpe_ratio:\s*([-\d.]+)", response).group(1))
lesson = re.search(r"lesson:\s*(.+)", response).group(1).strip()
suggestion = re.search(r"suggestion:\s*(.+)", response).group(1).strip()
return {"roast": roast, "sharpe_ratio": sharpe, "lesson": lesson, "suggestion": suggestion}
except Exception:
return None
def main():
    """Merge validated extra mentor rows into the clean dataset.

    Reads every non-blank line of ``CLEAN_FILE``, appends each non-blank line
    of ``EXTRA_FILE`` whose ``response`` field is accepted by
    ``parse_mentor``, and writes the combined rows to ``FINAL_FILE``, one row
    per line. Prints the valid/invalid counts for the extra rows and the
    final row count.

    Exits with status 1 if either input file does not exist.
    """
    for label, path in (("Clean", CLEAN_FILE), ("Extra", EXTRA_FILE)):
        if not path.exists():
            print(f"{label} file not found: {path}")
            sys.exit(1)

    # Strip each clean row: the writer below appends its own newline, so
    # keeping the original line ending would leave a blank line after every
    # clean row in the output file.
    with open(CLEAN_FILE, "r", encoding="utf-8") as f:
        clean_rows = [stripped for line in f if (stripped := line.strip())]

    # Parse and append extra mentor rows
    extra_valid = 0
    extra_invalid = 0
    with open(EXTRA_FILE, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # A malformed line is one bad row, not a reason to abort the
                # whole merge; count it as invalid and keep going.
                extra_invalid += 1
                continue
            # Only JSON objects can carry a "response" field; anything else
            # (list, number, string, null) is treated as having none.
            response = row.get("response", "") if isinstance(row, dict) else ""
            if parse_mentor(response):
                clean_rows.append(line)
                extra_valid += 1
            else:
                extra_invalid += 1

    with open(FINAL_FILE, "w", encoding="utf-8") as f:
        f.writelines(f"{row}\n" for row in clean_rows)

    print(f"Extra mentor valid: {extra_valid}, invalid: {extra_invalid}")
    print(f"Final dataset: {FINAL_FILE} ({len(clean_rows)} rows)")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|