{ "phase_results": [ { "phase": "simple_multiturn_transcript", "best_selection_metric_value": 6.045462267592054, "best_metrics": { "validation/loss_total": 1.7929038028086592, "validation/loss_response": 2.106443265529265, "validation/loss_current_user_reconstruction": 1.6467093680787979, "validation/loss_probe": 1.3120193120973094, "validation/response_first_token_exact_match": 0.1518987341772152, "validation/current-user_reconstruction_first_token_exact_match": 0.22784810126582278, "validation/probe_first_token_exact_match": 1.0, "validation/supervised_turn_count": 474.0, "validation/response_exact_match": 0.007936507936507936, "validation/response_similarity": 0.0982966461414312, "validation/response_token_f1": 0.18865678434635447, "validation/response_line_recall": 0.007936507936507936, "validation/response_reconstruction_similarity": 0.2672819842355101, "validation/response_reconstruction_exact_match": 0.0, "validation/response_reconstruction_token_f1": 0.2672819842355101, "validation/probe_exact_match": 0.0, "validation/probe_transcript_similarity": 0.23611152152229414, "validation/probe_token_f1": 0.4722230430445883, "validation/probe_line_recall": 0.0, "validation/response_similarity_by_turn/turn_2": 0.1278913959770529, "validation/response_similarity_by_turn/turn_3": 0.09627094558882597, "validation/response_similarity_by_turn/turn_4": 0.07133303521592968, "validation/response_similarity_by_turn/turn_5": 0.08668160657233656, "validation/response_similarity_by_turn/turn_6": 0.060287911253460294, "validation/response_reconstruction_similarity_by_turn/turn_2": 0.4131726085584623, "validation/response_reconstruction_similarity_by_turn/turn_3": 0.2114348366719477, "validation/response_reconstruction_similarity_by_turn/turn_4": 0.20395226248593826, "validation/response_reconstruction_similarity_by_turn/turn_5": 0.16776266235940282, "validation/response_reconstruction_similarity_by_turn/turn_6": 0.1779640527539687, "validation/probe_transcript_similarity_by_turn/turn_2": 0.2543098189406736, "validation/probe_transcript_similarity_by_turn/turn_3": 0.22976204949484447, "validation/probe_transcript_similarity_by_turn/turn_4": 0.22848382794491162, "validation/probe_transcript_similarity_by_turn/turn_5": 0.22902170973243605, "validation/probe_transcript_similarity_by_turn/turn_6": 0.20982533421592103, "validation/goal_loss": 6.045462267592054 }, "global_step": 386, "train_dataset": { "example_count": 7696, "pair_count_mean": 4.2548076923076925, "pair_count_max": 6, "response_target_tokens_mean_turn2_plus": 126.23921517671518, "response_target_tokens_max_turn2_plus": 1643, "probe_target_tokens_mean_turn2_plus": 471.31613825363826, "probe_target_tokens_max_turn2_plus": 3285, "dataset_counts": { "chatalpaca_multiturn_enriched": 7696 }, "probe_question_text": "What is everything we have talked about so far? Give exact conversation transcript verbatim in following format: [User 1]: X [Assistant 1]: Y [User 2]: A etc" }, "validation_dataset": { "example_count": 151, "pair_count_mean": 4.139072847682119, "pair_count_max": 6, "response_target_tokens_mean_turn2_plus": 123.21854304635761, "response_target_tokens_max_turn2_plus": 889, "probe_target_tokens_mean_turn2_plus": 448.5298013245033, "probe_target_tokens_max_turn2_plus": 1545, "dataset_counts": { "chatalpaca_multiturn_enriched": 151 }, "probe_question_text": "What is everything we have talked about so far? Give exact conversation transcript verbatim in following format: [User 1]: X [Assistant 1]: Y [User 2]: A etc" }, "wandb_enabled": true } ] }