Spaces:

build-small-hackathon
/

retro

Running

App Files Files Community

retro / scripts /merge_datasets.py

sankalphs

Phase 1: project setup, MiniMax-M3 synthetic dataset generation, 1446-row clean dataset

122cc3c 4 days ago

Raw

History Blame Contribute Delete

1.97 kB

	"""Append extra mentor rows to clean dataset and re-validate."""

	import json
	import re
	import sys
	from pathlib import Path

	# Directory two levels above this file's resolved location.
	ROOT = (
	    Path(__file__)
	    .resolve()
	    .parent
	    .parent
	)
	# Base dataset that main() copies into the final file.
	CLEAN_FILE = ROOT.joinpath("data", "retro-alpha-clean.jsonl")
	# Additional mentor rows; main() keeps only those that parse_mentor accepts.
	EXTRA_FILE = ROOT.joinpath("data", "retro-alpha-mentor-extra.jsonl")
	# Output written by main(): clean rows followed by the accepted extra rows.
	FINAL_FILE = ROOT.joinpath("data", "retro-alpha-final.jsonl")


	def parse_mentor(response: str) -> dict \| None:
	try:
	roast = re.search(r"roast:\s*(.+)", response).group(1).strip()
	sharpe = float(re.search(r"sharpe_ratio:\s*([-\d.]+)", response).group(1))
	lesson = re.search(r"lesson:\s*(.+)", response).group(1).strip()
	suggestion = re.search(r"suggestion:\s*(.+)", response).group(1).strip()
	return {"roast": roast, "sharpe_ratio": sharpe, "lesson": lesson, "suggestion": suggestion}
	except Exception:
	return None


	def main():
	    """Merge the clean dataset with the valid extra mentor rows.

	    Reads CLEAN_FILE and EXTRA_FILE, keeps every non-blank clean row,
	    appends each extra row whose ``response`` field is accepted by
	    ``parse_mentor``, and writes the result to FINAL_FILE with one row per
	    line. Prints a message and exits with status 1 if either input file is
	    missing. Extra lines that are not valid JSON objects are counted as
	    invalid rather than aborting the merge.
	    """
	    for label, path in (("Clean", CLEAN_FILE), ("Extra", EXTRA_FILE)):
	        if not path.exists():
	            print(f"{label} file not found: {path}")
	            sys.exit(1)

	    # Strip each clean row: the writer below appends the newline itself, so
	    # keeping the one read from disk would leave a blank line after every row.
	    with open(CLEAN_FILE, "r", encoding="utf-8") as f:
	        clean_rows = [stripped for line in f if (stripped := line.strip())]

	    # Parse and append extra mentor rows
	    extra_valid = 0
	    extra_invalid = 0
	    with open(EXTRA_FILE, "r", encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                row = json.loads(line)
	            except json.JSONDecodeError:
	                # Malformed line: falls through to the invalid counter.
	                row = None
	            # Only JSON objects have a "response" field to validate.
	            if isinstance(row, dict) and parse_mentor(row.get("response", "")):
	                clean_rows.append(line)
	                extra_valid += 1
	            else:
	                extra_invalid += 1

	    with open(FINAL_FILE, "w", encoding="utf-8") as f:
	        f.writelines(f"{row}\n" for row in clean_rows)

	    print(f"Extra mentor valid: {extra_valid}, invalid: {extra_invalid}")
	    print(f"Final dataset: {FINAL_FILE} ({len(clean_rows)} rows)")


	# Run the merge only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()