import pandas as pd import json import sqlite3 from pathlib import Path from typing import Optional def load_programs_to_df(db_path_str: str) -> Optional[pd.DataFrame]: """ Loads the 'programs' table from an SQLite database into a pandas DataFrame. Args: db_path_str: The path to the SQLite database file. Returns: A pandas DataFrame containing program data, or None if an error occurs or no data is found. The 'metrics' JSON string is parsed and its key-value pairs are added as columns to the DataFrame. """ db_file = Path(db_path_str) if not db_file.exists(): print(f"Error: Database file not found at {db_path_str}") return None conn = None try: conn = sqlite3.connect(str(db_file)) cursor = conn.cursor() cursor.execute("SELECT * FROM programs") # Fetch all columns all_program_rows = cursor.fetchall() if not all_program_rows: print(f"No programs found in the database: {db_path_str}") return pd.DataFrame() # Return empty DataFrame if no programs # Get column names from cursor.description column_names = [description[0] for description in cursor.description] # print(column_names) programs_data = [] for row_tuple in all_program_rows: # Convert row tuple to dict p_dict = dict(zip(column_names, row_tuple)) # Metrics and metadata are stored as JSON strings metrics_json = p_dict.get("metrics", "{}") metrics_dict = json.loads(metrics_json) if metrics_json else {} # Parse inspiration_ids JSON archive_insp_ids_json = p_dict.get("archive_inspiration_ids", "[]") archive_insp_ids = ( json.loads(archive_insp_ids_json) if archive_insp_ids_json else [] ) top_k_insp_ids_json = p_dict.get("top_k_inspiration_ids", "[]") top_k_insp_ids = ( json.loads(top_k_insp_ids_json) if top_k_insp_ids_json else [] ) metadata_json = p_dict.get("metadata", "{}") metadata_dict = json.loads(metadata_json) if metadata_json else {} # Parse public_metrics and private_metrics public_metrics_raw = p_dict.get("public_metrics", "{}") if isinstance(public_metrics_raw, str): public_metrics_dict = ( json.loads(public_metrics_raw) if public_metrics_raw else {} ) else: public_metrics_dict = public_metrics_raw or {} private_metrics_raw = p_dict.get("private_metrics", "{}") if isinstance(private_metrics_raw, str): private_metrics_dict = ( json.loads(private_metrics_raw) if private_metrics_raw else {} ) else: private_metrics_dict = private_metrics_raw or {} embedding = p_dict.get("embedding", []) if isinstance(embedding, str): embedding = json.loads(embedding) # Create a flat dictionary for the DataFrame try: timestamp = pd.to_datetime(p_dict.get("timestamp"), unit="s") except Exception: timestamp = None flat_data = { "id": p_dict.get("id"), "code": p_dict.get("code"), "language": p_dict.get("language"), "parent_id": p_dict.get("parent_id"), "archive_inspiration_ids": archive_insp_ids, "top_k_inspiration_ids": top_k_insp_ids, "generation": p_dict.get("generation"), "timestamp": timestamp, "complexity": p_dict.get("complexity"), "embedding": embedding, "code_diff": p_dict.get("code_diff"), "correct": bool(p_dict.get("correct", False)), "combined_score": p_dict.get("combined_score"), **metadata_dict, **public_metrics_dict, **private_metrics_dict, "text_feedback": p_dict.get("text_feedback", ""), } flat_data.update(metrics_dict) programs_data.append(flat_data) return pd.DataFrame(programs_data) except sqlite3.Error as e: print(f"SQLite error while loading {db_path_str}: {e}") return None except json.JSONDecodeError as e: db_path = db_path_str print(f"JSON decoding error for metrics/metadata in {db_path}: {e}") return None finally: if conn: conn.close() def get_path_to_best_node( df: pd.DataFrame, score_column: str = "combined_score" ) -> pd.DataFrame: """ Finds the chronological path to the node with the highest score. Args: df: DataFrame containing program data score_column: The column name to use for finding the best node (default: "combined_score") Returns: A DataFrame representing the chronological path to the best node, starting from the earliest ancestor and ending with the best node. """ if df.empty: return pd.DataFrame() if score_column not in df.columns: raise ValueError(f"Column '{score_column}' not found in DataFrame") # Create a dictionary mapping id to row for quick lookups id_to_row = {row["id"]: row for _, row in df.iterrows()} print(f"Total rows: {len(df)}") # Only correct rows correct_df = df[df["correct"]] print(f"Correct rows: {len(correct_df)}") # Find the node with the maximum score best_node_row = correct_df.loc[correct_df[score_column].idxmax()] # Start building the path with the best node path = [best_node_row.to_dict()] current_id = best_node_row["parent_id"] # Trace back through parent_ids to construct the path while current_id is not None and current_id in id_to_row: parent_row = id_to_row[current_id] path.append(parent_row.to_dict()) current_id = parent_row["parent_id"] # Reverse to get chronological order (oldest first) return pd.DataFrame(path[::-1]) def store_best_path(df: pd.DataFrame, results_dir: str): best_path = get_path_to_best_node(df) path_dir = Path(f"{results_dir}/best_path") path_dir.mkdir(exist_ok=True) patch_dir = Path(f"{path_dir}/patches") patch_dir.mkdir(exist_ok=True) code_dir = Path(f"{path_dir}/code") code_dir.mkdir(exist_ok=True) meta_dir = Path(f"{path_dir}/meta") meta_dir.mkdir(exist_ok=True) i = 0 for _, row in best_path.iterrows(): print(f"\nGeneration {row['generation']} - Score: {row['combined_score']:.2f}") if row["code_diff"] is not None: patch_path = patch_dir / f"patch_{i}.patch" patch_path.write_text(str(row["code_diff"])) print(f"Saved patch to {patch_path}") base_path = code_dir / f"main_{i}.py" base_path.write_text(str(row["code"])) # store row data as json, handle non-serializable types import datetime def default_serializer(obj): if isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() try: import pandas as pd if isinstance(obj, pd.Timestamp): return obj.isoformat() except ImportError: pass return str(obj) row_data_path = meta_dir / f"meta_{i}.json" row_data_path.write_text(json.dumps(row.to_dict(), default=default_serializer)) print(f"Saved meta data to {row_data_path}") print(f"Saved base code to {base_path}") print(row["patch_name"]) i += 1