Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 {'ego4d/narration_val_L4096_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}]}} Evaluation datasets: * ego4d/narration_val | num samples: 65 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.2 ER@0.05: 0.863 (S=0, C=1462, M=7129, R=283) ER@0.10: 0.863 (S=0, C=1462, M=7129, R=283) ER@0.15: 0.863 (S=0, C=1462, M=7129, R=283) ER@0.20: 0.863 (S=0, C=1462, M=7129, R=283) ER@0.25: 0.863 (S=0, C=1462, M=7129, R=283) ER@0.30: 0.863 (S=1, C=1461, M=7129, R=283) ER@0.35: 0.863 (S=3, C=1459, M=7129, R=283) ER@0.40: 0.865 (S=15, C=1447, M=7129, R=283) ER@0.45: 0.867 (S=34, C=1428, M=7129, R=283) ER@0.50: 0.870 (S=65, C=1397, M=7129, R=283) ER@0.55: 0.882 (S=165, C=1297, M=7129, R=283) ER@0.60: 0.898 (S=299, C=1163, M=7129, R=283) ER@0.65: 0.915 (S=448, C=1014, M=7129, R=283) ER@0.70: 0.936 (S=626, C=836, M=7129, R=283) ER@0.75: 0.955 (S=790, C=672, M=7129, R=283) ER@0.80: 0.978 (S=994, C=468, M=7129, R=283) ER@0.85: 0.998 (S=1166, C=296, M=7129, R=283) ER@0.90: 1.014 (S=1303, C=159, M=7129, R=283) ER@0.95: 1.024 (S=1381, C=81, M=7129, R=283) ER@1.00: 1.027 (S=1411, C=51, M=7129, R=283) Evalulation: ego4d-narration_val_L4096_I1/stream/notalk0.2-maxlen_4k Metrics: missing_rate: 0.8298 redundant_rate: 0.1622 match_cost: 0.3210 semantic_score: 0.7264 jaccard_index: 0.1648 Bleu_1: 0.4403 Bleu_1_w: 0.0725 Bleu_2: 0.2713 Bleu_2_w: 0.0447 Bleu_3: 0.1742 Bleu_3_w: 0.0287 Bleu_4: 0.1155 Bleu_4_w: 0.0190 CIDEr: 1.0878 CIDEr_w: 0.1792 METEOR: 0.2163 METEOR_w: 0.0356 mean_error_rate: 0.9134 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 ER@0.05: 1.026 (S=0, C=5981, M=2610, R=6203) ER@0.10: 1.026 (S=0, C=5981, M=2610, R=6203) ER@0.15: 1.026 (S=0, C=5981, M=2610, R=6203) ER@0.20: 1.026 (S=0, C=5981, M=2610, R=6203) ER@0.25: 1.026 (S=0, C=5981, M=2610, R=6203) ER@0.30: 1.026 (S=5, C=5976, M=2610, R=6203) ER@0.35: 1.028 (S=22, C=5959, M=2610, R=6203) ER@0.40: 1.035 (S=77, C=5904, M=2610, R=6203) ER@0.45: 1.054 (S=246, C=5735, M=2610, R=6203) ER@0.50: 1.090 (S=554, C=5427, M=2610, R=6203) ER@0.55: 1.143 (S=1004, C=4977, M=2610, R=6203) ER@0.60: 1.215 (S=1628, C=4353, M=2610, R=6203) ER@0.65: 1.291 (S=2279, C=3702, M=2610, R=6203) ER@0.70: 1.367 (S=2931, C=3050, M=2610, R=6203) ER@0.75: 1.446 (S=3613, C=2368, M=2610, R=6203) ER@0.80: 1.535 (S=4376, C=1605, M=2610, R=6203) ER@0.85: 1.606 (S=4981, C=1000, M=2610, R=6203) ER@0.90: 1.655 (S=5407, C=574, M=2610, R=6203) ER@0.95: 1.683 (S=5644, C=337, M=2610, R=6203) ER@1.00: 1.699 (S=5786, C=195, M=2610, R=6203) Evalulation: ego4d-narration_val_L4096_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.3038 redundant_rate: 0.5091 match_cost: 0.3448 semantic_score: 0.7023 jaccard_index: 0.4043 Bleu_1: 0.4528 Bleu_1_w: 0.1830 Bleu_2: 0.2831 Bleu_2_w: 0.1145 Bleu_3: 0.1875 Bleu_3_w: 0.0758 Bleu_4: 0.1271 Bleu_4_w: 0.0514 CIDEr: 1.1200 CIDEr_w: 0.4528 METEOR: 0.2080 METEOR_w: 0.0841 mean_error_rate: 1.2502 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 ER@0.05: 2.927 (S=0, C=8019, M=572, R=24571) ER@0.10: 2.927 (S=0, C=8019, M=572, R=24571) ER@0.15: 2.927 (S=0, C=8019, M=572, R=24571) ER@0.20: 2.927 (S=0, C=8019, M=572, R=24571) ER@0.25: 2.927 (S=4, C=8015, M=572, R=24571) ER@0.30: 2.929 (S=20, C=7999, M=572, R=24571) ER@0.35: 2.934 (S=62, C=7957, M=572, R=24571) ER@0.40: 2.947 (S=177, C=7842, M=572, R=24571) ER@0.45: 2.977 (S=433, C=7586, M=572, R=24571) ER@0.50: 3.029 (S=876, C=7143, M=572, R=24571) ER@0.55: 3.105 (S=1528, C=6491, M=572, R=24571) ER@0.60: 3.198 (S=2329, C=5690, M=572, R=24571) ER@0.65: 3.297 (S=3181, C=4838, M=572, R=24571) ER@0.70: 3.402 (S=4083, C=3936, M=572, R=24571) ER@0.75: 3.517 (S=5068, C=2951, M=572, R=24571) ER@0.80: 3.624 (S=5992, C=2027, M=572, R=24571) ER@0.85: 3.718 (S=6797, C=1222, M=572, R=24571) ER@0.90: 3.776 (S=7295, C=724, M=572, R=24571) ER@0.95: 3.809 (S=7578, C=441, M=572, R=24571) ER@1.00: 3.831 (S=7772, C=247, M=572, R=24571) Evalulation: ego4d-narration_val_L4096_I1/stream/notalk0.4-maxlen_4k Metrics: missing_rate: 0.0666 redundant_rate: 0.7539 match_cost: 0.3386 semantic_score: 0.6923 jaccard_index: 0.2418 Bleu_1: 0.4251 Bleu_1_w: 0.1028 Bleu_2: 0.2628 Bleu_2_w: 0.0635 Bleu_3: 0.1732 Bleu_3_w: 0.0419 Bleu_4: 0.1153 Bleu_4_w: 0.0279 CIDEr: 1.0783 CIDEr_w: 0.2608 METEOR: 0.1964 METEOR_w: 0.0475 mean_error_rate: 3.2363 All Finished! Time: 0.18 minutes Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 {'assembly101/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}]}, 'ego4d/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}]}, 'egoexolearn/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}]}, 'epickitchens/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}]}, 'holoassist/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}]}, 'wtag/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}]}} Evaluation datasets: * ego4d/dialog_val | num samples: 96 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.3 ER@0.05: 0.874 (S=0, C=1598, M=3216, R=991) ER@0.10: 0.874 (S=0, C=1598, M=3216, R=991) ER@0.15: 0.875 (S=3, C=1595, M=3216, R=991) ER@0.20: 0.877 (S=13, C=1585, M=3216, R=991) ER@0.25: 0.880 (S=28, C=1570, M=3216, R=991) ER@0.30: 0.888 (S=67, C=1531, M=3216, R=991) ER@0.35: 0.898 (S=117, C=1481, M=3216, R=991) ER@0.40: 0.917 (S=208, C=1390, M=3216, R=991) ER@0.45: 0.942 (S=327, C=1271, M=3216, R=991) ER@0.50: 0.974 (S=482, C=1116, M=3216, R=991) ER@0.55: 1.005 (S=632, C=966, M=3216, R=991) ER@0.60: 1.044 (S=820, C=778, M=3216, R=991) ER@0.65: 1.080 (S=994, C=604, M=3216, R=991) ER@0.70: 1.112 (S=1147, C=451, M=3216, R=991) ER@0.75: 1.139 (S=1278, C=320, M=3216, R=991) ER@0.80: 1.163 (S=1390, C=208, M=3216, R=991) ER@0.85: 1.180 (S=1473, C=125, M=3216, R=991) ER@0.90: 1.194 (S=1539, C=59, M=3216, R=991) ER@0.95: 1.201 (S=1576, C=22, M=3216, R=991) ER@1.00: 1.206 (S=1598, C=0, M=3216, R=991) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.6681 redundant_rate: 0.3828 match_cost: 0.4448 semantic_score: 0.5959 jaccard_index: 0.2753 Bleu_1: 0.3391 Bleu_1_w: 0.0933 Bleu_2: 0.2263 Bleu_2_w: 0.0623 Bleu_3: 0.1656 Bleu_3_w: 0.0456 Bleu_4: 0.1275 Bleu_4_w: 0.0351 CIDEr: 0.6738 CIDEr_w: 0.1855 METEOR: 0.1632 METEOR_w: 0.0449 mean_error_rate: 1.0161 ER@0.05: 0.874 (S=0, C=1598, M=3216, R=991) ER@0.10: 0.874 (S=0, C=1598, M=3216, R=991) ER@0.15: 0.875 (S=3, C=1595, M=3216, R=991) ER@0.20: 0.877 (S=13, C=1585, M=3216, R=991) ER@0.25: 0.880 (S=28, C=1570, M=3216, R=991) ER@0.30: 0.888 (S=67, C=1531, M=3216, R=991) ER@0.35: 0.898 (S=117, C=1481, M=3216, R=991) ER@0.40: 0.917 (S=208, C=1390, M=3216, R=991) ER@0.45: 0.942 (S=327, C=1271, M=3216, R=991) ER@0.50: 0.974 (S=482, C=1116, M=3216, R=991) ER@0.55: 1.005 (S=632, C=966, M=3216, R=991) ER@0.60: 1.044 (S=820, C=778, M=3216, R=991) ER@0.65: 1.080 (S=994, C=604, M=3216, R=991) ER@0.70: 1.112 (S=1147, C=451, M=3216, R=991) ER@0.75: 1.139 (S=1278, C=320, M=3216, R=991) ER@0.80: 1.163 (S=1390, C=208, M=3216, R=991) ER@0.85: 1.180 (S=1473, C=125, M=3216, R=991) ER@0.90: 1.194 (S=1539, C=59, M=3216, R=991) ER@0.95: 1.201 (S=1576, C=22, M=3216, R=991) ER@1.00: 1.206 (S=1598, C=0, M=3216, R=991) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.6681 redundant_rate: 0.3828 match_cost: 0.4448 semantic_score: 0.5959 jaccard_index: 0.2753 Bleu_1: 0.3391 Bleu_1_w: 0.0933 Bleu_2: 0.2263 Bleu_2_w: 0.0623 Bleu_3: 0.1656 Bleu_3_w: 0.0456 Bleu_4: 0.1275 Bleu_4_w: 0.0351 CIDEr: 0.6738 CIDEr_w: 0.1855 METEOR: 0.1632 METEOR_w: 0.0449 mean_error_rate: 1.0161 Evaluation datasets: * holoassist/dialog_val | num samples: 291 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.3 ER@0.05: 0.607 (S=9, C=6349, M=8903, R=348) ER@0.10: 0.607 (S=15, C=6343, M=8903, R=348) ER@0.15: 0.609 (S=46, C=6312, M=8903, R=348) ER@0.20: 0.613 (S=109, C=6249, M=8903, R=348) ER@0.25: 0.622 (S=245, C=6113, M=8903, R=348) ER@0.30: 0.634 (S=427, C=5931, M=8903, R=348) ER@0.35: 0.652 (S=703, C=5655, M=8903, R=348) ER@0.40: 0.674 (S=1031, C=5327, M=8903, R=348) ER@0.45: 0.703 (S=1475, C=4883, M=8903, R=348) ER@0.50: 0.735 (S=1963, C=4395, M=8903, R=348) ER@0.55: 0.773 (S=2547, C=3811, M=8903, R=348) ER@0.60: 0.811 (S=3119, C=3239, M=8903, R=348) ER@0.65: 0.851 (S=3741, C=2617, M=8903, R=348) ER@0.70: 0.893 (S=4379, C=1979, M=8903, R=348) ER@0.75: 0.932 (S=4970, C=1388, M=8903, R=348) ER@0.80: 0.965 (S=5473, C=885, M=8903, R=348) ER@0.85: 0.989 (S=5845, C=513, M=8903, R=348) ER@0.90: 1.008 (S=6125, C=233, M=8903, R=348) ER@0.95: 1.019 (S=6295, C=63, M=8903, R=348) ER@1.00: 1.023 (S=6354, C=4, M=8903, R=348) Evalulation: holoassist-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.5834 redundant_rate: 0.0519 match_cost: 0.4314 semantic_score: 0.5936 jaccard_index: 0.4073 Bleu_1: 0.3692 Bleu_1_w: 0.1504 Bleu_2: 0.2517 Bleu_2_w: 0.1025 Bleu_3: 0.1857 Bleu_3_w: 0.0757 Bleu_4: 0.1423 Bleu_4_w: 0.0580 CIDEr: 0.7988 CIDEr_w: 0.3254 METEOR: 0.1755 METEOR_w: 0.0715 mean_error_rate: 0.7860 Evaluation datasets: * epickitchens/dialog_val | num samples: 150 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.3 ER@0.05: 0.960 (S=1, C=3882, M=2549, R=3624) ER@0.10: 0.962 (S=14, C=3869, M=2549, R=3624) ER@0.15: 0.967 (S=44, C=3839, M=2549, R=3624) ER@0.20: 0.977 (S=108, C=3775, M=2549, R=3624) ER@0.25: 0.992 (S=205, C=3678, M=2549, R=3624) ER@0.30: 1.021 (S=395, C=3488, M=2549, R=3624) ER@0.35: 1.063 (S=667, C=3216, M=2549, R=3624) ER@0.40: 1.112 (S=982, C=2901, M=2549, R=3624) ER@0.45: 1.174 (S=1376, C=2507, M=2549, R=3624) ER@0.50: 1.237 (S=1785, C=2098, M=2549, R=3624) ER@0.55: 1.302 (S=2204, C=1679, M=2549, R=3624) ER@0.60: 1.358 (S=2563, C=1320, M=2549, R=3624) ER@0.65: 1.412 (S=2908, C=975, M=2549, R=3624) ER@0.70: 1.463 (S=3236, C=647, M=2549, R=3624) ER@0.75: 1.497 (S=3458, C=425, M=2549, R=3624) ER@0.80: 1.526 (S=3643, C=240, M=2549, R=3624) ER@0.85: 1.545 (S=3762, C=121, M=2549, R=3624) ER@0.90: 1.555 (S=3828, C=55, M=2549, R=3624) ER@0.95: 1.560 (S=3860, C=23, M=2549, R=3624) ER@1.00: 1.563 (S=3883, C=0, M=2549, R=3624) Evalulation: epickitchens-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.3963 redundant_rate: 0.4827 match_cost: 0.5297 semantic_score: 0.5236 jaccard_index: 0.3861 Bleu_1: 0.3089 Bleu_1_w: 0.1193 Bleu_2: 0.1870 Bleu_2_w: 0.0722 Bleu_3: 0.1258 Bleu_3_w: 0.0486 Bleu_4: 0.0911 Bleu_4_w: 0.0352 CIDEr: 0.5723 CIDEr_w: 0.2210 METEOR: 0.1391 METEOR_w: 0.0537 mean_error_rate: 1.2623 Evaluation datasets: * egoexolearn/dialog_val | num samples: 123 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.3 ER@0.05: 0.844 (S=0, C=2590, M=9401, R=720) ER@0.10: 0.844 (S=2, C=2588, M=9401, R=720) ER@0.15: 0.845 (S=13, C=2577, M=9401, R=720) ER@0.20: 0.846 (S=25, C=2565, M=9401, R=720) ER@0.25: 0.848 (S=51, C=2539, M=9401, R=720) ER@0.30: 0.852 (S=98, C=2492, M=9401, R=720) ER@0.35: 0.859 (S=183, C=2407, M=9401, R=720) ER@0.40: 0.870 (S=308, C=2282, M=9401, R=720) ER@0.45: 0.885 (S=493, C=2097, M=9401, R=720) ER@0.50: 0.903 (S=709, C=1881, M=9401, R=720) ER@0.55: 0.922 (S=931, C=1659, M=9401, R=720) ER@0.60: 0.944 (S=1197, C=1393, M=9401, R=720) ER@0.65: 0.968 (S=1481, C=1109, M=9401, R=720) ER@0.70: 0.989 (S=1741, C=849, M=9401, R=720) ER@0.75: 1.012 (S=2013, C=577, M=9401, R=720) ER@0.80: 1.028 (S=2200, C=390, M=9401, R=720) ER@0.85: 1.041 (S=2363, C=227, M=9401, R=720) ER@0.90: 1.052 (S=2495, C=95, M=9401, R=720) ER@0.95: 1.057 (S=2554, C=36, M=9401, R=720) ER@1.00: 1.060 (S=2590, C=0, M=9401, R=720) Evalulation: egoexolearn-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.7840 redundant_rate: 0.2175 match_cost: 0.4205 semantic_score: 0.6107 jaccard_index: 0.2038 Bleu_1: 0.3775 Bleu_1_w: 0.0769 Bleu_2: 0.2554 Bleu_2_w: 0.0520 Bleu_3: 0.1866 Bleu_3_w: 0.0380 Bleu_4: 0.1422 Bleu_4_w: 0.0290 CIDEr: 0.7699 CIDEr_w: 0.1569 METEOR: 0.1708 METEOR_w: 0.0348 mean_error_rate: 0.9335 Evaluation datasets: * wtag/dialog_val | num samples: 21 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.3 ER@0.05: 0.738 (S=6, C=622, M=445, R=341) ER@0.10: 0.752 (S=21, C=607, M=445, R=341) ER@0.15: 0.781 (S=52, C=576, M=445, R=341) ER@0.20: 0.811 (S=84, C=544, M=445, R=341) ER@0.25: 0.851 (S=127, C=501, M=445, R=341) ER@0.30: 0.882 (S=160, C=468, M=445, R=341) ER@0.35: 0.921 (S=202, C=426, M=445, R=341) ER@0.40: 0.951 (S=234, C=394, M=445, R=341) ER@0.45: 0.993 (S=279, C=349, M=445, R=341) ER@0.50: 1.052 (S=343, C=285, M=445, R=341) ER@0.55: 1.096 (S=390, C=238, M=445, R=341) ER@0.60: 1.129 (S=425, C=203, M=445, R=341) ER@0.65: 1.176 (S=476, C=152, M=445, R=341) ER@0.70: 1.212 (S=514, C=114, M=445, R=341) ER@0.75: 1.252 (S=557, C=71, M=445, R=341) ER@0.80: 1.273 (S=580, C=48, M=445, R=341) ER@0.85: 1.287 (S=595, C=33, M=445, R=341) ER@0.90: 1.302 (S=611, C=17, M=445, R=341) ER@0.95: 1.312 (S=622, C=6, M=445, R=341) ER@1.00: 1.318 (S=628, C=0, M=445, R=341) Evalulation: wtag-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.4147 redundant_rate: 0.3519 match_cost: 0.5530 semantic_score: 0.4763 jaccard_index: 0.4441 Bleu_1: 0.2360 Bleu_1_w: 0.1048 Bleu_2: 0.1536 Bleu_2_w: 0.0682 Bleu_3: 0.1080 Bleu_3_w: 0.0480 Bleu_4: 0.0791 Bleu_4_w: 0.0351 CIDEr: 0.4533 CIDEr_w: 0.2013 METEOR: 0.1613 METEOR_w: 0.0717 mean_error_rate: 1.0543 Evaluation datasets: * assembly101/dialog_val | num samples: 336 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.3 ER@0.05: 0.667 (S=2, C=3483, M=4833, R=711) ER@0.10: 0.668 (S=9, C=3476, M=4833, R=711) ER@0.15: 0.670 (S=26, C=3459, M=4833, R=711) ER@0.20: 0.673 (S=53, C=3432, M=4833, R=711) ER@0.25: 0.679 (S=106, C=3379, M=4833, R=711) ER@0.30: 0.688 (S=179, C=3306, M=4833, R=711) ER@0.35: 0.707 (S=337, C=3148, M=4833, R=711) ER@0.40: 0.733 (S=556, C=2929, M=4833, R=711) ER@0.45: 0.767 (S=832, C=2653, M=4833, R=711) ER@0.50: 0.806 (S=1159, C=2326, M=4833, R=711) ER@0.55: 0.851 (S=1533, C=1952, M=4833, R=711) ER@0.60: 0.895 (S=1897, C=1588, M=4833, R=711) ER@0.65: 0.937 (S=2251, C=1234, M=4833, R=711) ER@0.70: 0.966 (S=2495, C=990, M=4833, R=711) ER@0.75: 1.000 (S=2770, C=715, M=4833, R=711) ER@0.80: 1.027 (S=2999, C=486, M=4833, R=711) ER@0.85: 1.049 (S=3183, C=302, M=4833, R=711) ER@0.90: 1.069 (S=3348, C=137, M=4833, R=711) ER@0.95: 1.079 (S=3429, C=56, M=4833, R=711) ER@1.00: 1.085 (S=3482, C=3, M=4833, R=711) Evalulation: assembly101-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.5810 redundant_rate: 0.1694 match_cost: 0.4482 semantic_score: 0.5854 jaccard_index: 0.3860 Bleu_1: 0.3927 Bleu_1_w: 0.1516 Bleu_2: 0.2798 Bleu_2_w: 0.1080 Bleu_3: 0.2103 Bleu_3_w: 0.0812 Bleu_4: 0.1654 Bleu_4_w: 0.0638 CIDEr: 0.7986 CIDEr_w: 0.3083 METEOR: 0.1856 METEOR_w: 0.0716 mean_error_rate: 0.8507 All Finished! Time: 0.50 minutes File "/opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/mmassist/eval/eval.py", line 118 print(f"Runs:\n{'\n'.join(eval_args.inference_setups.split(','))}") ^ SyntaxError: f-string expression part cannot include a backslash File "/opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/mmassist/eval/eval.py", line 118 print(f"Runs:\n{'\n'.join(eval_args.inference_setups.split(','))}") ^ SyntaxError: f-string expression part cannot include a backslash Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 {'assembly101/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.35}]}, 'ego4d/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.35}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.35}]}, 'egoexolearn/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.35}]}, 'epickitchens/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.35}]}, 'holoassist/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.35}]}, 'wtag/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.35}]}} Evaluation datasets: * ego4d/dialog_val | num samples: 96 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.35 ER@0.05: 0.970 (S=0, C=1782, M=3032, R=1636) ER@0.10: 0.970 (S=0, C=1782, M=3032, R=1636) ER@0.15: 0.971 (S=5, C=1777, M=3032, R=1636) ER@0.20: 0.974 (S=19, C=1763, M=3032, R=1636) ER@0.25: 0.977 (S=37, C=1745, M=3032, R=1636) ER@0.30: 0.988 (S=87, C=1695, M=3032, R=1636) ER@0.35: 1.002 (S=156, C=1626, M=3032, R=1636) ER@0.40: 1.023 (S=255, C=1527, M=3032, R=1636) ER@0.45: 1.050 (S=386, C=1396, M=3032, R=1636) ER@0.50: 1.086 (S=559, C=1223, M=3032, R=1636) ER@0.55: 1.120 (S=723, C=1059, M=3032, R=1636) ER@0.60: 1.158 (S=905, C=877, M=3032, R=1636) ER@0.65: 1.201 (S=1113, C=669, M=3032, R=1636) ER@0.70: 1.240 (S=1300, C=482, M=3032, R=1636) ER@0.75: 1.274 (S=1467, C=315, M=3032, R=1636) ER@0.80: 1.298 (S=1580, C=202, M=3032, R=1636) ER@0.85: 1.317 (S=1672, C=110, M=3032, R=1636) ER@0.90: 1.328 (S=1726, C=56, M=3032, R=1636) ER@0.95: 1.336 (S=1764, C=18, M=3032, R=1636) ER@1.00: 1.340 (S=1782, C=0, M=3032, R=1636) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.35-maxlen_4k Metrics: missing_rate: 0.6298 redundant_rate: 0.4786 match_cost: 0.4571 semantic_score: 0.5891 jaccard_index: 0.2763 Bleu_1: 0.3298 Bleu_1_w: 0.0911 Bleu_2: 0.2159 Bleu_2_w: 0.0597 Bleu_3: 0.1562 Bleu_3_w: 0.0431 Bleu_4: 0.1194 Bleu_4_w: 0.0330 CIDEr: 0.6364 CIDEr_w: 0.1758 METEOR: 0.1607 METEOR_w: 0.0444 mean_error_rate: 1.1310 ER@0.05: 0.970 (S=0, C=1782, M=3032, R=1636) ER@0.10: 0.970 (S=0, C=1782, M=3032, R=1636) ER@0.15: 0.971 (S=5, C=1777, M=3032, R=1636) ER@0.20: 0.974 (S=19, C=1763, M=3032, R=1636) ER@0.25: 0.977 (S=37, C=1745, M=3032, R=1636) ER@0.30: 0.988 (S=87, C=1695, M=3032, R=1636) ER@0.35: 1.002 (S=156, C=1626, M=3032, R=1636) ER@0.40: 1.023 (S=255, C=1527, M=3032, R=1636) ER@0.45: 1.050 (S=386, C=1396, M=3032, R=1636) ER@0.50: 1.086 (S=559, C=1223, M=3032, R=1636) ER@0.55: 1.120 (S=723, C=1059, M=3032, R=1636) ER@0.60: 1.158 (S=905, C=877, M=3032, R=1636) ER@0.65: 1.201 (S=1113, C=669, M=3032, R=1636) ER@0.70: 1.240 (S=1300, C=482, M=3032, R=1636) ER@0.75: 1.274 (S=1467, C=315, M=3032, R=1636) ER@0.80: 1.298 (S=1580, C=202, M=3032, R=1636) ER@0.85: 1.317 (S=1672, C=110, M=3032, R=1636) ER@0.90: 1.328 (S=1726, C=56, M=3032, R=1636) ER@0.95: 1.336 (S=1764, C=18, M=3032, R=1636) ER@1.00: 1.340 (S=1782, C=0, M=3032, R=1636) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.35-maxlen_4k Metrics: missing_rate: 0.6298 redundant_rate: 0.4786 match_cost: 0.4571 semantic_score: 0.5891 jaccard_index: 0.2763 Bleu_1: 0.3298 Bleu_1_w: 0.0911 Bleu_2: 0.2159 Bleu_2_w: 0.0597 Bleu_3: 0.1562 Bleu_3_w: 0.0431 Bleu_4: 0.1194 Bleu_4_w: 0.0330 CIDEr: 0.6364 CIDEr_w: 0.1758 METEOR: 0.1607 METEOR_w: 0.0444 mean_error_rate: 1.1310 Evaluation datasets: * holoassist/dialog_val | num samples: 291 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.35 ER@0.05: 0.641 (S=10, C=7813, M=7438, R=2337) ER@0.10: 0.642 (S=29, C=7794, M=7438, R=2337) ER@0.15: 0.645 (S=70, C=7753, M=7438, R=2337) ER@0.20: 0.652 (S=173, C=7650, M=7438, R=2337) ER@0.25: 0.667 (S=401, C=7422, M=7438, R=2337) ER@0.30: 0.685 (S=679, C=7144, M=7438, R=2337) ER@0.35: 0.712 (S=1089, C=6734, M=7438, R=2337) ER@0.40: 0.740 (S=1518, C=6305, M=7438, R=2337) ER@0.45: 0.776 (S=2060, C=5763, M=7438, R=2337) ER@0.50: 0.819 (S=2723, C=5100, M=7438, R=2337) ER@0.55: 0.863 (S=3388, C=4435, M=7438, R=2337) ER@0.60: 0.912 (S=4143, C=3680, M=7438, R=2337) ER@0.65: 0.962 (S=4909, C=2914, M=7438, R=2337) ER@0.70: 1.011 (S=5658, C=2165, M=7438, R=2337) ER@0.75: 1.055 (S=6321, C=1502, M=7438, R=2337) ER@0.80: 1.090 (S=6864, C=959, M=7438, R=2337) ER@0.85: 1.118 (S=7292, C=531, M=7438, R=2337) ER@0.90: 1.138 (S=7590, C=233, M=7438, R=2337) ER@0.95: 1.149 (S=7757, C=66, M=7438, R=2337) ER@1.00: 1.153 (S=7816, C=7, M=7438, R=2337) Evalulation: holoassist-dialog_val_L0_I1/stream/notalk0.35-maxlen_4k Metrics: missing_rate: 0.4874 redundant_rate: 0.2300 match_cost: 0.4575 semantic_score: 0.5744 jaccard_index: 0.4445 Bleu_1: 0.3553 Bleu_1_w: 0.1579 Bleu_2: 0.2373 Bleu_2_w: 0.1055 Bleu_3: 0.1731 Bleu_3_w: 0.0769 Bleu_4: 0.1312 Bleu_4_w: 0.0583 CIDEr: 0.7173 CIDEr_w: 0.3189 METEOR: 0.1641 METEOR_w: 0.0729 mean_error_rate: 0.8715 Evaluation datasets: * epickitchens/dialog_val | num samples: 150 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.35 ER@0.05: 1.405 (S=3, C=4273, M=2156, R=6878) ER@0.10: 1.407 (S=18, C=4258, M=2156, R=6878) ER@0.15: 1.411 (S=40, C=4236, M=2156, R=6878) ER@0.20: 1.420 (S=101, C=4175, M=2156, R=6878) ER@0.25: 1.439 (S=224, C=4052, M=2156, R=6878) ER@0.30: 1.468 (S=407, C=3869, M=2156, R=6878) ER@0.35: 1.510 (S=680, C=3596, M=2156, R=6878) ER@0.40: 1.569 (S=1059, C=3217, M=2156, R=6878) ER@0.45: 1.632 (S=1460, C=2816, M=2156, R=6878) ER@0.50: 1.703 (S=1920, C=2356, M=2156, R=6878) ER@0.55: 1.777 (S=2394, C=1882, M=2156, R=6878) ER@0.60: 1.845 (S=2833, C=1443, M=2156, R=6878) ER@0.65: 1.907 (S=3233, C=1043, M=2156, R=6878) ER@0.70: 1.958 (S=3563, C=713, M=2156, R=6878) ER@0.75: 1.997 (S=3812, C=464, M=2156, R=6878) ER@0.80: 2.025 (S=3993, C=283, M=2156, R=6878) ER@0.85: 2.045 (S=4121, C=155, M=2156, R=6878) ER@0.90: 2.059 (S=4212, C=64, M=2156, R=6878) ER@0.95: 2.067 (S=4259, C=17, M=2156, R=6878) ER@1.00: 2.069 (S=4274, C=2, M=2156, R=6878) Evalulation: epickitchens-dialog_val_L0_I1/stream/notalk0.35-maxlen_4k Metrics: missing_rate: 0.3352 redundant_rate: 0.6166 match_cost: 0.5227 semantic_score: 0.5264 jaccard_index: 0.3213 Bleu_1: 0.2964 Bleu_1_w: 0.0952 Bleu_2: 0.1765 Bleu_2_w: 0.0567 Bleu_3: 0.1170 Bleu_3_w: 0.0376 Bleu_4: 0.0833 Bleu_4_w: 0.0268 CIDEr: 0.5245 CIDEr_w: 0.1685 METEOR: 0.1381 METEOR_w: 0.0444 mean_error_rate: 1.7357 Evaluation datasets: * egoexolearn/dialog_val | num samples: 123 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.35 ER@0.05: 0.909 (S=1, C=3265, M=8725, R=2174) ER@0.10: 0.909 (S=4, C=3262, M=8725, R=2174) ER@0.15: 0.910 (S=16, C=3250, M=8725, R=2174) ER@0.20: 0.912 (S=32, C=3234, M=8725, R=2174) ER@0.25: 0.915 (S=76, C=3190, M=8725, R=2174) ER@0.30: 0.921 (S=145, C=3121, M=8725, R=2174) ER@0.35: 0.933 (S=288, C=2978, M=8725, R=2174) ER@0.40: 0.949 (S=479, C=2787, M=8725, R=2174) ER@0.45: 0.968 (S=708, C=2558, M=8725, R=2174) ER@0.50: 0.996 (S=1047, C=2219, M=8725, R=2174) ER@0.55: 1.025 (S=1392, C=1874, M=8725, R=2174) ER@0.60: 1.051 (S=1702, C=1564, M=8725, R=2174) ER@0.65: 1.078 (S=2023, C=1243, M=8725, R=2174) ER@0.70: 1.105 (S=2351, C=915, M=8725, R=2174) ER@0.75: 1.129 (S=2635, C=631, M=8725, R=2174) ER@0.80: 1.148 (S=2863, C=403, M=8725, R=2174) ER@0.85: 1.163 (S=3042, C=224, M=8725, R=2174) ER@0.90: 1.174 (S=3181, C=85, M=8725, R=2174) ER@0.95: 1.179 (S=3237, C=29, M=8725, R=2174) ER@1.00: 1.181 (S=3266, C=0, M=8725, R=2174) Evalulation: egoexolearn-dialog_val_L0_I1/stream/notalk0.35-maxlen_4k Metrics: missing_rate: 0.7276 redundant_rate: 0.3996 match_cost: 0.4578 semantic_score: 0.5889 jaccard_index: 0.2306 Bleu_1: 0.3660 Bleu_1_w: 0.0844 Bleu_2: 0.2423 Bleu_2_w: 0.0559 Bleu_3: 0.1734 Bleu_3_w: 0.0400 Bleu_4: 0.1299 Bleu_4_w: 0.0299 CIDEr: 0.6624 CIDEr_w: 0.1527 METEOR: 0.1616 METEOR_w: 0.0373 mean_error_rate: 1.0277 Evaluation datasets: * wtag/dialog_val | num samples: 21 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.35 ER@0.05: 0.940 (S=12, C=661, M=400, R=597) ER@0.10: 0.952 (S=24, C=649, M=400, R=597) ER@0.15: 0.974 (S=48, C=625, M=400, R=597) ER@0.20: 0.998 (S=74, C=599, M=400, R=597) ER@0.25: 1.034 (S=112, C=561, M=400, R=597) ER@0.30: 1.075 (S=156, C=517, M=400, R=597) ER@0.35: 1.109 (S=193, C=480, M=400, R=597) ER@0.40: 1.151 (S=238, C=435, M=400, R=597) ER@0.45: 1.190 (S=280, C=393, M=400, R=597) ER@0.50: 1.256 (S=351, C=322, M=400, R=597) ER@0.55: 1.310 (S=409, C=264, M=400, R=597) ER@0.60: 1.354 (S=456, C=217, M=400, R=597) ER@0.65: 1.409 (S=515, C=158, M=400, R=597) ER@0.70: 1.452 (S=561, C=112, M=400, R=597) ER@0.75: 1.497 (S=609, C=64, M=400, R=597) ER@0.80: 1.515 (S=629, C=44, M=400, R=597) ER@0.85: 1.527 (S=641, C=32, M=400, R=597) ER@0.90: 1.544 (S=660, C=13, M=400, R=597) ER@0.95: 1.555 (S=671, C=2, M=400, R=597) ER@1.00: 1.556 (S=673, C=0, M=400, R=597) Evalulation: wtag-dialog_val_L0_I1/stream/notalk0.35-maxlen_4k Metrics: missing_rate: 0.3728 redundant_rate: 0.4701 match_cost: 0.5487 semantic_score: 0.4821 jaccard_index: 0.4030 Bleu_1: 0.2399 Bleu_1_w: 0.0967 Bleu_2: 0.1548 Bleu_2_w: 0.0624 Bleu_3: 0.1089 Bleu_3_w: 0.0439 Bleu_4: 0.0791 Bleu_4_w: 0.0319 CIDEr: 0.4224 CIDEr_w: 0.1702 METEOR: 0.1591 METEOR_w: 0.0641 mean_error_rate: 1.2699 Evaluation datasets: * assembly101/dialog_val | num samples: 336 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.35 ER@0.05: 0.700 (S=5, C=4658, M=3655, R=2163) ER@0.10: 0.701 (S=12, C=4651, M=3655, R=2163) ER@0.15: 0.703 (S=33, C=4630, M=3655, R=2163) ER@0.20: 0.708 (S=75, C=4588, M=3655, R=2163) ER@0.25: 0.717 (S=146, C=4517, M=3655, R=2163) ER@0.30: 0.734 (S=287, C=4376, M=3655, R=2163) ER@0.35: 0.761 (S=511, C=4152, M=3655, R=2163) ER@0.40: 0.800 (S=839, C=3824, M=3655, R=2163) ER@0.45: 0.846 (S=1217, C=3446, M=3655, R=2163) ER@0.50: 0.907 (S=1730, C=2933, M=3655, R=2163) ER@0.55: 0.975 (S=2289, C=2374, M=3655, R=2163) ER@0.60: 1.034 (S=2783, C=1880, M=3655, R=2163) ER@0.65: 1.090 (S=3252, C=1411, M=3655, R=2163) ER@0.70: 1.130 (S=3579, C=1084, M=3655, R=2163) ER@0.75: 1.169 (S=3906, C=757, M=3655, R=2163) ER@0.80: 1.198 (S=4149, C=514, M=3655, R=2163) ER@0.85: 1.221 (S=4335, C=328, M=3655, R=2163) ER@0.90: 1.242 (S=4512, C=151, M=3655, R=2163) ER@0.95: 1.253 (S=4608, C=55, M=3655, R=2163) ER@1.00: 1.259 (S=4658, C=5, M=3655, R=2163) Evalulation: assembly101-dialog_val_L0_I1/stream/notalk0.35-maxlen_4k Metrics: missing_rate: 0.4394 redundant_rate: 0.3169 match_cost: 0.4822 semantic_score: 0.5648 jaccard_index: 0.4449 Bleu_1: 0.3689 Bleu_1_w: 0.1641 Bleu_2: 0.2544 Bleu_2_w: 0.1132 Bleu_3: 0.1867 Bleu_3_w: 0.0830 Bleu_4: 0.1441 Bleu_4_w: 0.0641 CIDEr: 0.6552 CIDEr_w: 0.2915 METEOR: 0.1687 METEOR_w: 0.0751 mean_error_rate: 0.9575 All Finished! Time: 44.05 minutes Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 Runs: ego4d/dialog_val_L0_I1|stream|4k|0.35|summarize_and_drop ego4d/dialog_val_L0_I1|stream|4k|0.35|summarize_and_drop holoassist/dialog_val_L0_I1|stream|4k|0.35|summarize_and_drop epickitchens/dialog_val_L0_I1|stream|4k|0.35|summarize_and_drop egoexolearn/dialog_val_L0_I1|stream|4k|0.35|summarize_and_drop wtag/dialog_val_L0_I1|stream|4k|0.35|summarize_and_drop assembly101/dialog_val_L0_I1|stream|4k|0.35|summarize_and_drop scripts/eval/Aug_eval_stream.sh: 75: des: not found Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 {'assembly101/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.1}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}]}, 'ego4d/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.05}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.1}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}]}, 'egoexolearn/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.1}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}]}, 'epickitchens/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.1}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}]}, 'holoassist/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.1}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}]}, 'wtag/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.1}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}]}} Evaluation datasets: * ego4d/dialog_val | num samples: 96 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.05 ER@0.50: 0.867 (S=167, C=669, M=3978, R=30) ER@0.55: 0.878 (S=217, C=619, M=3978, R=30) ER@0.60: 0.896 (S=307, C=529, M=3978, R=30) ER@0.65: 0.916 (S=403, C=433, M=3978, R=30) ER@0.70: 0.932 (S=481, C=355, M=3978, R=30) ER@0.75: 0.957 (S=601, C=235, M=3978, R=30) ER@0.80: 0.972 (S=671, C=165, M=3978, R=30) ER@0.85: 0.985 (S=734, C=102, M=3978, R=30) ER@0.90: 0.996 (S=786, C=50, M=3978, R=30) ER@0.95: 1.003 (S=820, C=16, M=3978, R=30) ER@1.00: 1.006 (S=836, C=0, M=3978, R=30) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.05-maxlen_4k Metrics: missing_rate: 0.8263 redundant_rate: 0.0346 match_cost: 0.3603 semantic_score: 0.6504 mean_error_rate: 0.9463 mean_error_rate_v2: 0.9405 jaccard_index: 0.1726 jaccard_index_v2: 0.0595 AP: 0.3331 AR: 0.0599 Avg-F1: 0.1016 num_matched: 836.0000 num_missed: 3978.0000 num_redundant: 30.0000 num_correct_5: 669.0000 Bleu_1: 0.4009 Bleu_1_w: 0.0692 Bleu_2: 0.2932 Bleu_2_w: 0.0506 Bleu_3: 0.2283 Bleu_3_w: 0.0394 Bleu_4: 0.1844 Bleu_4_w: 0.0318 CIDEr: 1.1238 CIDEr_w: 0.1940 METEOR: 0.2092 METEOR_w: 0.0361 Updating eval setup: not_talk_threshold: 0.05 -> 0.1 ER@0.50: 0.874 (S=193, C=711, M=3910, R=104) ER@0.55: 0.886 (S=251, C=653, M=3910, R=104) ER@0.60: 0.905 (S=343, C=561, M=3910, R=104) ER@0.65: 0.925 (S=437, C=467, M=3910, R=104) ER@0.70: 0.944 (S=532, C=372, M=3910, R=104) ER@0.75: 0.970 (S=656, C=248, M=3910, R=104) ER@0.80: 0.985 (S=730, C=174, M=3910, R=104) ER@0.85: 1.000 (S=802, C=102, M=3910, R=104) ER@0.90: 1.011 (S=853, C=51, M=3910, R=104) ER@0.95: 1.018 (S=888, C=16, M=3910, R=104) ER@1.00: 1.022 (S=904, C=0, M=3910, R=104) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.1-maxlen_4k Metrics: missing_rate: 0.8122 redundant_rate: 0.1032 match_cost: 0.3671 semantic_score: 0.6444 mean_error_rate: 0.9582 mean_error_rate_v2: 0.9380 jaccard_index: 0.1838 jaccard_index_v2: 0.0620 AP: 0.3026 AR: 0.0634 Avg-F1: 0.1048 num_matched: 904.0000 num_missed: 3910.0000 num_redundant: 104.0000 num_correct_5: 711.0000 Bleu_1: 0.3992 Bleu_1_w: 0.0734 Bleu_2: 0.2919 Bleu_2_w: 0.0537 Bleu_3: 0.2278 Bleu_3_w: 0.0419 Bleu_4: 0.1841 Bleu_4_w: 0.0338 CIDEr: 1.1193 CIDEr_w: 0.2058 METEOR: 0.2092 METEOR_w: 0.0385 Updating eval setup: not_talk_threshold: 0.1 -> 0.2 ER@0.50: 0.877 (S=324, C=938, M=3552, R=344) ER@0.55: 0.899 (S=434, C=828, M=3552, R=344) ER@0.60: 0.927 (S=565, C=697, M=3552, R=344) ER@0.65: 0.956 (S=708, C=554, M=3552, R=344) ER@0.70: 0.984 (S=839, C=423, M=3552, R=344) ER@0.75: 1.010 (S=968, C=294, M=3552, R=344) ER@0.80: 1.032 (S=1070, C=192, M=3552, R=344) ER@0.85: 1.048 (S=1150, C=112, M=3552, R=344) ER@0.90: 1.061 (S=1212, C=50, M=3552, R=344) ER@0.95: 1.068 (S=1243, C=19, M=3552, R=344) ER@1.00: 1.071 (S=1262, C=0, M=3552, R=344) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: missing_rate: 0.7378 redundant_rate: 0.2142 match_cost: 0.4114 semantic_score: 0.6185 mean_error_rate: 0.9939 mean_error_rate_v2: 0.9276 jaccard_index: 0.2447 jaccard_index_v2: 0.0724 AP: 0.2325 AR: 0.0776 Avg-F1: 0.1163 num_matched: 1262.0000 num_missed: 3552.0000 num_redundant: 344.0000 num_correct_5: 938.0000 Bleu_1: 0.3718 Bleu_1_w: 0.0910 Bleu_2: 0.2621 Bleu_2_w: 0.0641 Bleu_3: 0.1980 Bleu_3_w: 0.0485 Bleu_4: 0.1562 Bleu_4_w: 0.0382 CIDEr: 0.9479 CIDEr_w: 0.2319 METEOR: 0.1951 METEOR_w: 0.0477 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 ER@0.50: 0.974 (S=482, C=1116, M=3216, R=991) ER@0.55: 1.005 (S=632, C=966, M=3216, R=991) ER@0.60: 1.044 (S=820, C=778, M=3216, R=991) ER@0.65: 1.080 (S=994, C=604, M=3216, R=991) ER@0.70: 1.112 (S=1147, C=451, M=3216, R=991) ER@0.75: 1.139 (S=1278, C=320, M=3216, R=991) ER@0.80: 1.163 (S=1390, C=208, M=3216, R=991) ER@0.85: 1.180 (S=1473, C=125, M=3216, R=991) ER@0.90: 1.194 (S=1539, C=59, M=3216, R=991) ER@0.95: 1.201 (S=1576, C=22, M=3216, R=991) ER@1.00: 1.206 (S=1598, C=0, M=3216, R=991) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.6681 redundant_rate: 0.3828 match_cost: 0.4448 semantic_score: 0.5959 mean_error_rate: 1.1181 mean_error_rate_v2: 0.9272 jaccard_index: 0.2753 jaccard_index_v2: 0.0728 AP: 0.1632 AR: 0.0878 Avg-F1: 0.1142 num_matched: 1598.0000 num_missed: 3216.0000 num_redundant: 991.0000 num_correct_5: 1116.0000 Bleu_1: 0.3748 Bleu_1_w: 0.1032 Bleu_2: 0.2619 Bleu_2_w: 0.0721 Bleu_3: 0.1972 Bleu_3_w: 0.0543 Bleu_4: 0.1551 Bleu_4_w: 0.0427 CIDEr: 0.8899 CIDEr_w: 0.2450 METEOR: 0.1878 METEOR_w: 0.0517 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 ER@0.50: 1.446 (S=651, C=1370, M=2793, R=3516) ER@0.55: 1.488 (S=856, C=1165, M=2793, R=3516) ER@0.60: 1.533 (S=1073, C=948, M=2793, R=3516) ER@0.65: 1.583 (S=1310, C=711, M=2793, R=3516) ER@0.70: 1.625 (S=1512, C=509, M=2793, R=3516) ER@0.75: 1.661 (S=1687, C=334, M=2793, R=3516) ER@0.80: 1.687 (S=1813, C=208, M=2793, R=3516) ER@0.85: 1.706 (S=1904, C=117, M=2793, R=3516) ER@0.90: 1.718 (S=1962, C=59, M=2793, R=3516) ER@0.95: 1.727 (S=2006, C=15, M=2793, R=3516) ER@1.00: 1.730 (S=2021, C=0, M=2793, R=3516) Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: missing_rate: 0.5802 redundant_rate: 0.6350 match_cost: 0.4652 semantic_score: 0.5812 mean_error_rate: 1.6277 mean_error_rate_v2: 0.9407 jaccard_index: 0.2426 jaccard_index_v2: 0.0593 AP: 0.0893 AR: 0.1027 Avg-F1: 0.0955 num_matched: 2021.0000 num_missed: 2793.0000 num_redundant: 3516.0000 num_correct_5: 1370.0000 Bleu_1: 0.3489 Bleu_1_w: 0.0846 Bleu_2: 0.2356 Bleu_2_w: 0.0572 Bleu_3: 0.1724 Bleu_3_w: 0.0418 Bleu_4: 0.1331 Bleu_4_w: 0.0323 CIDEr: 0.7083 CIDEr_w: 0.1718 METEOR: 0.1732 METEOR_w: 0.0420 Evaluation datasets: * holoassist/dialog_val | num samples: 291 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.1 ER@0.50: 0.893 (S=615, C=1652, M=12994, R=20) ER@0.55: 0.907 (S=829, C=1438, M=12994, R=20) ER@0.60: 0.923 (S=1067, C=1200, M=12994, R=20) ER@0.65: 0.937 (S=1288, C=979, M=12994, R=20) ER@0.70: 0.952 (S=1508, C=759, M=12994, R=20) ER@0.75: 0.966 (S=1723, C=544, M=12994, R=20) ER@0.80: 0.978 (S=1909, C=358, M=12994, R=20) ER@0.85: 0.987 (S=2055, C=212, M=12994, R=20) ER@0.90: 0.995 (S=2170, C=97, M=12994, R=20) ER@0.95: 0.999 (S=2237, C=30, M=12994, R=20) ER@1.00: 1.001 (S=2265, C=2, M=12994, R=20) Evalulation: holoassist-dialog_val_L0_I1/stream/notalk0.1-maxlen_4k Metrics: missing_rate: 0.8515 redundant_rate: 0.0087 match_cost: 0.4095 semantic_score: 0.6075 mean_error_rate: 0.9580 mean_error_rate_v2: 0.9567 jaccard_index: 0.1484 jaccard_index_v2: 0.0433 AP: 0.2890 AR: 0.0433 Avg-F1: 0.0753 num_matched: 2267.0000 num_missed: 12994.0000 num_redundant: 20.0000 num_correct_5: 1652.0000 Bleu_1: 0.4318 Bleu_1_w: 0.0641 Bleu_2: 0.3170 Bleu_2_w: 0.0470 Bleu_3: 0.2456 Bleu_3_w: 0.0364 Bleu_4: 0.1959 Bleu_4_w: 0.0291 CIDEr: 1.1810 CIDEr_w: 0.1752 METEOR: 0.2112 METEOR_w: 0.0313 Updating eval setup: not_talk_threshold: 0.1 -> 0.2 ER@0.50: 0.761 (S=1643, C=3730, M=9888, R=88) ER@0.55: 0.794 (S=2147, C=3226, M=9888, R=88) ER@0.60: 0.825 (S=2613, C=2760, M=9888, R=88) ER@0.65: 0.859 (S=3137, C=2236, M=9888, R=88) ER@0.70: 0.896 (S=3698, C=1675, M=9888, R=88) ER@0.75: 0.929 (S=4201, C=1172, M=9888, R=88) ER@0.80: 0.956 (S=4611, C=762, M=9888, R=88) ER@0.85: 0.978 (S=4955, C=418, M=9888, R=88) ER@0.90: 0.993 (S=5182, C=191, M=9888, R=88) ER@0.95: 1.002 (S=5318, C=55, M=9888, R=88) ER@1.00: 1.005 (S=5366, C=7, M=9888, R=88) Evalulation: holoassist-dialog_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: missing_rate: 0.6479 redundant_rate: 0.0161 match_cost: 0.4281 semantic_score: 0.5936 mean_error_rate: 0.9091 mean_error_rate_v2: 0.9039 jaccard_index: 0.3501 jaccard_index_v2: 0.0961 AP: 0.2702 AR: 0.0967 Avg-F1: 0.1424 num_matched: 5373.0000 num_missed: 9888.0000 num_redundant: 88.0000 num_correct_5: 3730.0000 Bleu_1: 0.4253 Bleu_1_w: 0.1489 Bleu_2: 0.3072 Bleu_2_w: 0.1075 Bleu_3: 0.2343 Bleu_3_w: 0.0820 Bleu_4: 0.1839 Bleu_4_w: 0.0644 CIDEr: 1.0880 CIDEr_w: 0.3809 METEOR: 0.2063 METEOR_w: 0.0722 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 ER@0.50: 0.735 (S=1963, C=4395, M=8903, R=348) ER@0.55: 0.773 (S=2547, C=3811, M=8903, R=348) ER@0.60: 0.811 (S=3119, C=3239, M=8903, R=348) ER@0.65: 0.851 (S=3741, C=2617, M=8903, R=348) ER@0.70: 0.893 (S=4379, C=1979, M=8903, R=348) ER@0.75: 0.932 (S=4970, C=1388, M=8903, R=348) ER@0.80: 0.965 (S=5473, C=885, M=8903, R=348) ER@0.85: 0.989 (S=5845, C=513, M=8903, R=348) ER@0.90: 1.008 (S=6125, C=233, M=8903, R=348) ER@0.95: 1.019 (S=6295, C=63, M=8903, R=348) ER@1.00: 1.023 (S=6354, C=4, M=8903, R=348) Evalulation: holoassist-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.5834 redundant_rate: 0.0519 match_cost: 0.4314 semantic_score: 0.5936 mean_error_rate: 0.9089 mean_error_rate_v2: 0.8886 jaccard_index: 0.4073 jaccard_index_v2: 0.1114 AP: 0.2593 AR: 0.1139 Avg-F1: 0.1583 num_matched: 6358.0000 num_missed: 8903.0000 num_redundant: 348.0000 num_correct_5: 4395.0000 Bleu_1: 0.4187 Bleu_1_w: 0.1705 Bleu_2: 0.2991 Bleu_2_w: 0.1218 Bleu_3: 0.2264 Bleu_3_w: 0.0922 Bleu_4: 0.1765 Bleu_4_w: 0.0719 CIDEr: 1.0479 CIDEr_w: 0.4269 METEOR: 0.2032 METEOR_w: 0.0828 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 ER@0.50: 1.353 (S=3884, C=6464, M=4913, R=11853) ER@0.55: 1.412 (S=4778, C=5570, M=4913, R=11853) ER@0.60: 1.474 (S=5734, C=4614, M=4913, R=11853) ER@0.65: 1.539 (S=6727, C=3621, M=4913, R=11853) ER@0.70: 1.599 (S=7636, C=2712, M=4913, R=11853) ER@0.75: 1.655 (S=8491, C=1857, M=4913, R=11853) ER@0.80: 1.703 (S=9223, C=1125, M=4913, R=11853) ER@0.85: 1.738 (S=9760, C=588, M=4913, R=11853) ER@0.90: 1.761 (S=10105, C=243, M=4913, R=11853) ER@0.95: 1.772 (S=10269, C=79, M=4913, R=11853) ER@1.00: 1.776 (S=10343, C=5, M=4913, R=11853) Evalulation: holoassist-dialog_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: missing_rate: 0.3219 redundant_rate: 0.5339 match_cost: 0.4723 semantic_score: 0.5619 mean_error_rate: 1.6166 mean_error_rate_v2: 0.9099 jaccard_index: 0.3816 jaccard_index_v2: 0.0901 AP: 0.1101 AR: 0.1601 Avg-F1: 0.1304 num_matched: 10348.0000 num_missed: 4913.0000 num_redundant: 11853.0000 num_correct_5: 6464.0000 Bleu_1: 0.3951 Bleu_1_w: 0.1508 Bleu_2: 0.2737 Bleu_2_w: 0.1044 Bleu_3: 0.2024 Bleu_3_w: 0.0772 Bleu_4: 0.1539 Bleu_4_w: 0.0587 CIDEr: 0.8902 CIDEr_w: 0.3397 METEOR: 0.1863 METEOR_w: 0.0711 Evaluation datasets: * epickitchens/dialog_val | num samples: 150 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.1 ER@0.50: 0.848 (S=451, C=1091, M=4890, R=115) ER@0.55: 0.874 (S=615, C=927, M=4890, R=115) ER@0.60: 0.900 (S=782, C=760, M=4890, R=115) ER@0.65: 0.924 (S=938, C=604, M=4890, R=115) ER@0.70: 0.952 (S=1120, C=422, M=4890, R=115) ER@0.75: 0.972 (S=1245, C=297, M=4890, R=115) ER@0.80: 0.988 (S=1349, C=193, M=4890, R=115) ER@0.85: 1.002 (S=1437, C=105, M=4890, R=115) ER@0.90: 1.010 (S=1490, C=52, M=4890, R=115) ER@0.95: 1.016 (S=1527, C=15, M=4890, R=115) ER@1.00: 1.018 (S=1542, C=0, M=4890, R=115) Evalulation: epickitchens-dialog_val_L0_I1/stream/notalk0.1-maxlen_4k Metrics: missing_rate: 0.7603 redundant_rate: 0.0694 match_cost: 0.4235 semantic_score: 0.5959 mean_error_rate: 0.9548 mean_error_rate_v2: 0.9380 jaccard_index: 0.2355 jaccard_index_v2: 0.0620 AP: 0.2450 AR: 0.0631 Avg-F1: 0.1004 num_matched: 1542.0000 num_missed: 4890.0000 num_redundant: 115.0000 num_correct_5: 1091.0000 Bleu_1: 0.4046 Bleu_1_w: 0.0953 Bleu_2: 0.2934 Bleu_2_w: 0.0691 Bleu_3: 0.2259 Bleu_3_w: 0.0532 Bleu_4: 0.1808 Bleu_4_w: 0.0426 CIDEr: 1.2252 CIDEr_w: 0.2886 METEOR: 0.2066 METEOR_w: 0.0487 Updating eval setup: not_talk_threshold: 0.1 -> 0.2 ER@0.50: 0.849 (S=795, C=1417, M=4220, R=448) ER@0.55: 0.884 (S=1017, C=1195, M=4220, R=448) ER@0.60: 0.919 (S=1242, C=970, M=4220, R=448) ER@0.65: 0.952 (S=1457, C=755, M=4220, R=448) ER@0.70: 0.985 (S=1670, C=542, M=4220, R=448) ER@0.75: 1.012 (S=1839, C=373, M=4220, R=448) ER@0.80: 1.033 (S=1974, C=238, M=4220, R=448) ER@0.85: 1.050 (S=2087, C=125, M=4220, R=448) ER@0.90: 1.061 (S=2156, C=56, M=4220, R=448) ER@0.95: 1.067 (S=2195, C=17, M=4220, R=448) ER@1.00: 1.070 (S=2212, C=0, M=4220, R=448) Evalulation: epickitchens-dialog_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: missing_rate: 0.6561 redundant_rate: 0.1684 match_cost: 0.4707 semantic_score: 0.5700 mean_error_rate: 0.9893 mean_error_rate_v2: 0.9248 jaccard_index: 0.3215 jaccard_index_v2: 0.0752 AP: 0.1944 AR: 0.0804 Avg-F1: 0.1137 num_matched: 2212.0000 num_missed: 4220.0000 num_redundant: 448.0000 num_correct_5: 1417.0000 Bleu_1: 0.3931 Bleu_1_w: 0.1264 Bleu_2: 0.2744 Bleu_2_w: 0.0882 Bleu_3: 0.2034 Bleu_3_w: 0.0654 Bleu_4: 0.1582 Bleu_4_w: 0.0509 CIDEr: 1.1124 CIDEr_w: 0.3576 METEOR: 0.1942 METEOR_w: 0.0625 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 ER@0.50: 1.237 (S=1785, C=2098, M=2549, R=3624) ER@0.55: 1.302 (S=2204, C=1679, M=2549, R=3624) ER@0.60: 1.358 (S=2563, C=1320, M=2549, R=3624) ER@0.65: 1.412 (S=2908, C=975, M=2549, R=3624) ER@0.70: 1.463 (S=3236, C=647, M=2549, R=3624) ER@0.75: 1.497 (S=3458, C=425, M=2549, R=3624) ER@0.80: 1.526 (S=3643, C=240, M=2549, R=3624) ER@0.85: 1.545 (S=3762, C=121, M=2549, R=3624) ER@0.90: 1.555 (S=3828, C=55, M=2549, R=3624) ER@0.95: 1.560 (S=3860, C=23, M=2549, R=3624) ER@1.00: 1.563 (S=3883, C=0, M=2549, R=3624) Evalulation: epickitchens-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.3963 redundant_rate: 0.4827 match_cost: 0.5297 semantic_score: 0.5236 mean_error_rate: 1.4563 mean_error_rate_v2: 0.9314 jaccard_index: 0.3861 jaccard_index_v2: 0.0686 AP: 0.0918 AR: 0.1072 Avg-F1: 0.0989 num_matched: 3883.0000 num_missed: 2549.0000 num_redundant: 3624.0000 num_correct_5: 2098.0000 Bleu_1: 0.3749 Bleu_1_w: 0.1448 Bleu_2: 0.2515 Bleu_2_w: 0.0971 Bleu_3: 0.1796 Bleu_3_w: 0.0693 Bleu_4: 0.1355 Bleu_4_w: 0.0523 CIDEr: 0.9289 CIDEr_w: 0.3587 METEOR: 0.1781 METEOR_w: 0.0688 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 ER@0.50: 2.488 (S=2307, C=2666, M=1459, R=12234) ER@0.55: 2.574 (S=2864, C=2109, M=1459, R=12234) ER@0.60: 2.652 (S=3364, C=1609, M=1459, R=12234) ER@0.65: 2.718 (S=3790, C=1183, M=1459, R=12234) ER@0.70: 2.782 (S=4203, C=770, M=1459, R=12234) ER@0.75: 2.828 (S=4495, C=478, M=1459, R=12234) ER@0.80: 2.859 (S=4695, C=278, M=1459, R=12234) ER@0.85: 2.881 (S=4835, C=138, M=1459, R=12234) ER@0.90: 2.892 (S=4909, C=64, M=1459, R=12234) ER@0.95: 2.900 (S=4957, C=16, M=1459, R=12234) ER@1.00: 2.902 (S=4970, C=3, M=1459, R=12234) Evalulation: epickitchens-dialog_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: missing_rate: 0.2268 redundant_rate: 0.7110 match_cost: 0.5235 semantic_score: 0.5205 mean_error_rate: 2.7704 mean_error_rate_v2: 0.9546 jaccard_index: 0.2664 jaccard_index_v2: 0.0454 AP: 0.0492 AR: 0.1316 Avg-F1: 0.0716 num_matched: 4973.0000 num_missed: 1459.0000 num_redundant: 12234.0000 num_correct_5: 2666.0000 Bleu_1: 0.3546 Bleu_1_w: 0.0945 Bleu_2: 0.2300 Bleu_2_w: 0.0613 Bleu_3: 0.1577 Bleu_3_w: 0.0420 Bleu_4: 0.1148 Bleu_4_w: 0.0306 CIDEr: 0.7842 CIDEr_w: 0.2089 METEOR: 0.1704 METEOR_w: 0.0454 Evaluation datasets: * egoexolearn/dialog_val | num samples: 123 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.1 ER@0.50: 0.874 (S=423, C=1564, M=10004, R=58) ER@0.55: 0.889 (S=599, C=1388, M=10004, R=58) ER@0.60: 0.908 (S=820, C=1167, M=10004, R=58) ER@0.65: 0.926 (S=1043, C=944, M=10004, R=58) ER@0.70: 0.944 (S=1261, C=726, M=10004, R=58) ER@0.75: 0.961 (S=1467, C=520, M=10004, R=58) ER@0.80: 0.976 (S=1645, C=342, M=10004, R=58) ER@0.85: 0.988 (S=1784, C=203, M=10004, R=58) ER@0.90: 0.997 (S=1892, C=95, M=10004, R=58) ER@0.95: 1.003 (S=1961, C=26, M=10004, R=58) ER@1.00: 1.005 (S=1987, C=0, M=10004, R=58) Evalulation: egoexolearn-dialog_val_L0_I1/stream/notalk0.1-maxlen_4k Metrics: missing_rate: 0.8343 redundant_rate: 0.0284 match_cost: 0.3814 semantic_score: 0.6332 mean_error_rate: 0.9520 mean_error_rate_v2: 0.9474 jaccard_index: 0.1649 jaccard_index_v2: 0.0526 AP: 0.3101 AR: 0.0529 Avg-F1: 0.0904 num_matched: 1987.0000 num_missed: 10004.0000 num_redundant: 58.0000 num_correct_5: 1564.0000 Bleu_1: 0.4243 Bleu_1_w: 0.0700 Bleu_2: 0.3063 Bleu_2_w: 0.0505 Bleu_3: 0.2337 Bleu_3_w: 0.0385 Bleu_4: 0.1837 Bleu_4_w: 0.0303 CIDEr: 1.1154 CIDEr_w: 0.1839 METEOR: 0.2019 METEOR_w: 0.0333 Updating eval setup: not_talk_threshold: 0.1 -> 0.2 ER@0.50: 0.874 (S=442, C=1660, M=9889, R=149) ER@0.55: 0.889 (S=620, C=1482, M=9889, R=149) ER@0.60: 0.911 (S=886, C=1216, M=9889, R=149) ER@0.65: 0.929 (S=1102, C=1000, M=9889, R=149) ER@0.70: 0.948 (S=1324, C=778, M=9889, R=149) ER@0.75: 0.966 (S=1545, C=557, M=9889, R=149) ER@0.80: 0.982 (S=1734, C=368, M=9889, R=149) ER@0.85: 0.995 (S=1889, C=213, M=9889, R=149) ER@0.90: 1.004 (S=1997, C=105, M=9889, R=149) ER@0.95: 1.010 (S=2071, C=31, M=9889, R=149) ER@1.00: 1.012 (S=2101, C=1, M=9889, R=149) Evalulation: egoexolearn-dialog_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: missing_rate: 0.8247 redundant_rate: 0.0662 match_cost: 0.3861 semantic_score: 0.6338 mean_error_rate: 0.9562 mean_error_rate_v2: 0.9445 jaccard_index: 0.1731 jaccard_index_v2: 0.0555 AP: 0.2993 AR: 0.0562 Avg-F1: 0.0946 num_matched: 2102.0000 num_missed: 9889.0000 num_redundant: 149.0000 num_correct_5: 1660.0000 Bleu_1: 0.4253 Bleu_1_w: 0.0736 Bleu_2: 0.3033 Bleu_2_w: 0.0525 Bleu_3: 0.2291 Bleu_3_w: 0.0397 Bleu_4: 0.1786 Bleu_4_w: 0.0309 CIDEr: 1.0711 CIDEr_w: 0.1855 METEOR: 0.1994 METEOR_w: 0.0345 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 ER@0.50: 0.903 (S=709, C=1881, M=9401, R=720) ER@0.55: 0.922 (S=931, C=1659, M=9401, R=720) ER@0.60: 0.944 (S=1197, C=1393, M=9401, R=720) ER@0.65: 0.968 (S=1481, C=1109, M=9401, R=720) ER@0.70: 0.989 (S=1741, C=849, M=9401, R=720) ER@0.75: 1.012 (S=2013, C=577, M=9401, R=720) ER@0.80: 1.028 (S=2200, C=390, M=9401, R=720) ER@0.85: 1.041 (S=2363, C=227, M=9401, R=720) ER@0.90: 1.052 (S=2495, C=95, M=9401, R=720) ER@0.95: 1.057 (S=2554, C=36, M=9401, R=720) ER@1.00: 1.060 (S=2590, C=0, M=9401, R=720) Evalulation: egoexolearn-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.7840 redundant_rate: 0.2175 match_cost: 0.4205 semantic_score: 0.6107 mean_error_rate: 0.9978 mean_error_rate_v2: 0.9412 jaccard_index: 0.2038 jaccard_index_v2: 0.0588 AP: 0.2257 AR: 0.0623 Avg-F1: 0.0976 num_matched: 2590.0000 num_missed: 9401.0000 num_redundant: 720.0000 num_correct_5: 1881.0000 Bleu_1: 0.4183 Bleu_1_w: 0.0852 Bleu_2: 0.2954 Bleu_2_w: 0.0602 Bleu_3: 0.2212 Bleu_3_w: 0.0451 Bleu_4: 0.1718 Bleu_4_w: 0.0350 CIDEr: 0.9797 CIDEr_w: 0.1996 METEOR: 0.1946 METEOR_w: 0.0397 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 ER@0.50: 1.334 (S=1620, C=2807, M=7564, R=6816) ER@0.55: 1.374 (S=2096, C=2331, M=7564, R=6816) ER@0.60: 1.411 (S=2535, C=1892, M=7564, R=6816) ER@0.65: 1.448 (S=2977, C=1450, M=7564, R=6816) ER@0.70: 1.480 (S=3371, C=1056, M=7564, R=6816) ER@0.75: 1.507 (S=3695, C=732, M=7564, R=6816) ER@0.80: 1.531 (S=3982, C=445, M=7564, R=6816) ER@0.85: 1.548 (S=4186, C=241, M=7564, R=6816) ER@0.90: 1.560 (S=4327, C=100, M=7564, R=6816) ER@0.95: 1.565 (S=4391, C=36, M=7564, R=6816) ER@1.00: 1.568 (S=4426, C=1, M=7564, R=6816) Evalulation: egoexolearn-dialog_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: missing_rate: 0.6308 redundant_rate: 0.6062 match_cost: 0.4840 semantic_score: 0.5691 mean_error_rate: 1.4843 mean_error_rate_v2: 0.9464 jaccard_index: 0.2354 jaccard_index_v2: 0.0536 AP: 0.0897 AR: 0.0841 Avg-F1: 0.0868 num_matched: 4427.0000 num_missed: 7564.0000 num_redundant: 6816.0000 num_correct_5: 2807.0000 Bleu_1: 0.3987 Bleu_1_w: 0.0939 Bleu_2: 0.2722 Bleu_2_w: 0.0641 Bleu_3: 0.1974 Bleu_3_w: 0.0465 Bleu_4: 0.1489 Bleu_4_w: 0.0350 CIDEr: 0.8196 CIDEr_w: 0.1929 METEOR: 0.1795 METEOR_w: 0.0423 Evaluation datasets: * wtag/dialog_val | num samples: 21 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.1 ER@0.50: 0.968 (S=61, C=74, M=938, R=40) ER@0.55: 0.975 (S=68, C=67, M=938, R=40) ER@0.60: 0.984 (S=78, C=57, M=938, R=40) ER@0.65: 0.992 (S=86, C=49, M=938, R=40) ER@0.70: 1.003 (S=98, C=37, M=938, R=40) ER@0.75: 1.014 (S=110, C=25, M=938, R=40) ER@0.80: 1.022 (S=119, C=16, M=938, R=40) ER@0.85: 1.027 (S=124, C=11, M=938, R=40) ER@0.90: 1.033 (S=130, C=5, M=938, R=40) ER@0.95: 1.036 (S=134, C=1, M=938, R=40) ER@1.00: 1.037 (S=135, C=0, M=938, R=40) Evalulation: wtag-dialog_val_L0_I1/stream/notalk0.1-maxlen_4k Metrics: missing_rate: 0.8742 redundant_rate: 0.2286 match_cost: 0.5259 semantic_score: 0.5400 mean_error_rate: 1.0083 mean_error_rate_v2: 0.9721 jaccard_index: 0.1213 jaccard_index_v2: 0.0279 AP: 0.1777 AR: 0.0290 Avg-F1: 0.0498 num_matched: 135.0000 num_missed: 938.0000 num_redundant: 40.0000 num_correct_5: 74.0000 Bleu_1: 0.3643 Bleu_1_w: 0.0442 Bleu_2: 0.2780 Bleu_2_w: 0.0337 Bleu_3: 0.2230 Bleu_3_w: 0.0270 Bleu_4: 0.1830 Bleu_4_w: 0.0222 CIDEr: 0.9718 CIDEr_w: 0.1179 METEOR: 0.2276 METEOR_w: 0.0276 Updating eval setup: not_talk_threshold: 0.1 -> 0.2 ER@0.50: 0.898 (S=286, C=250, M=537, R=141) ER@0.55: 0.942 (S=333, C=203, M=537, R=141) ER@0.60: 0.968 (S=361, C=175, M=537, R=141) ER@0.65: 1.010 (S=406, C=130, M=537, R=141) ER@0.70: 1.042 (S=440, C=96, M=537, R=141) ER@0.75: 1.073 (S=473, C=63, M=537, R=141) ER@0.80: 1.092 (S=494, C=42, M=537, R=141) ER@0.85: 1.105 (S=508, C=28, M=537, R=141) ER@0.90: 1.117 (S=521, C=15, M=537, R=141) ER@0.95: 1.125 (S=529, C=7, M=537, R=141) ER@1.00: 1.131 (S=536, C=0, M=537, R=141) Evalulation: wtag-dialog_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: missing_rate: 0.5005 redundant_rate: 0.2083 match_cost: 0.5574 semantic_score: 0.4763 mean_error_rate: 1.0459 mean_error_rate_v2: 0.9244 jaccard_index: 0.4415 jaccard_index_v2: 0.0756 AP: 0.1355 AR: 0.0855 Avg-F1: 0.1048 num_matched: 536.0000 num_missed: 537.0000 num_redundant: 141.0000 num_correct_5: 250.0000 Bleu_1: 0.3320 Bleu_1_w: 0.1466 Bleu_2: 0.2397 Bleu_2_w: 0.1058 Bleu_3: 0.1773 Bleu_3_w: 0.0783 Bleu_4: 0.1347 Bleu_4_w: 0.0595 CIDEr: 0.9619 CIDEr_w: 0.4247 METEOR: 0.2105 METEOR_w: 0.0930 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 ER@0.50: 1.052 (S=343, C=285, M=445, R=341) ER@0.55: 1.096 (S=390, C=238, M=445, R=341) ER@0.60: 1.129 (S=425, C=203, M=445, R=341) ER@0.65: 1.176 (S=476, C=152, M=445, R=341) ER@0.70: 1.212 (S=514, C=114, M=445, R=341) ER@0.75: 1.252 (S=557, C=71, M=445, R=341) ER@0.80: 1.273 (S=580, C=48, M=445, R=341) ER@0.85: 1.287 (S=595, C=33, M=445, R=341) ER@0.90: 1.302 (S=611, C=17, M=445, R=341) ER@0.95: 1.312 (S=622, C=6, M=445, R=341) ER@1.00: 1.318 (S=628, C=0, M=445, R=341) Evalulation: wtag-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.4147 redundant_rate: 0.3519 match_cost: 0.5530 semantic_score: 0.4763 mean_error_rate: 1.2189 mean_error_rate_v2: 0.9250 jaccard_index: 0.4441 jaccard_index_v2: 0.0750 AP: 0.1095 AR: 0.0989 Avg-F1: 0.1039 num_matched: 628.0000 num_missed: 445.0000 num_redundant: 341.0000 num_correct_5: 285.0000 Bleu_1: 0.3256 Bleu_1_w: 0.1446 Bleu_2: 0.2305 Bleu_2_w: 0.1024 Bleu_3: 0.1717 Bleu_3_w: 0.0762 Bleu_4: 0.1319 Bleu_4_w: 0.0586 CIDEr: 0.8370 CIDEr_w: 0.3717 METEOR: 0.2128 METEOR_w: 0.0945 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 ER@0.50: 1.342 (S=373, C=317, M=383, R=684) ER@0.55: 1.386 (S=420, C=270, M=383, R=684) ER@0.60: 1.429 (S=466, C=224, M=383, R=684) ER@0.65: 1.486 (S=527, C=163, M=383, R=684) ER@0.70: 1.535 (S=580, C=110, M=383, R=684) ER@0.75: 1.578 (S=626, C=64, M=383, R=684) ER@0.80: 1.593 (S=642, C=48, M=383, R=684) ER@0.85: 1.607 (S=657, C=33, M=383, R=684) ER@0.90: 1.626 (S=678, C=12, M=383, R=684) ER@0.95: 1.636 (S=688, C=2, M=383, R=684) ER@1.00: 1.637 (S=690, C=0, M=383, R=684) Evalulation: wtag-dialog_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: missing_rate: 0.3569 redundant_rate: 0.4978 match_cost: 0.5560 semantic_score: 0.4779 mean_error_rate: 1.5322 mean_error_rate_v2: 0.9357 jaccard_index: 0.3927 jaccard_index_v2: 0.0643 AP: 0.0822 AR: 0.1053 Avg-F1: 0.0924 num_matched: 690.0000 num_missed: 383.0000 num_redundant: 684.0000 num_correct_5: 317.0000 Bleu_1: 0.3075 Bleu_1_w: 0.1208 Bleu_2: 0.2124 Bleu_2_w: 0.0834 Bleu_3: 0.1528 Bleu_3_w: 0.0600 Bleu_4: 0.1136 Bleu_4_w: 0.0446 CIDEr: 0.6633 CIDEr_w: 0.2605 METEOR: 0.2026 METEOR_w: 0.0796 Evaluation datasets: * assembly101/dialog_val | num samples: 336 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.1 ER@0.50: 0.821 (S=543, C=1519, M=6256, R=32) ER@0.55: 0.841 (S=710, C=1352, M=6256, R=32) ER@0.60: 0.865 (S=907, C=1155, M=6256, R=32) ER@0.65: 0.887 (S=1094, C=968, M=6256, R=32) ER@0.70: 0.908 (S=1263, C=799, M=6256, R=32) ER@0.75: 0.933 (S=1472, C=590, M=6256, R=32) ER@0.80: 0.952 (S=1633, C=429, M=6256, R=32) ER@0.85: 0.967 (S=1759, C=303, M=6256, R=32) ER@0.90: 0.986 (S=1912, C=150, M=6256, R=32) ER@0.95: 0.996 (S=2000, C=62, M=6256, R=32) ER@1.00: 1.003 (S=2058, C=4, M=6256, R=32) Evalulation: assembly101-dialog_val_L0_I1/stream/notalk0.1-maxlen_4k Metrics: missing_rate: 0.7521 redundant_rate: 0.0153 match_cost: 0.3791 semantic_score: 0.6275 mean_error_rate: 0.9237 mean_error_rate_v2: 0.9202 jaccard_index: 0.2469 jaccard_index_v2: 0.0798 AP: 0.3183 AR: 0.0801 Avg-F1: 0.1280 num_matched: 2062.0000 num_missed: 6256.0000 num_redundant: 32.0000 num_correct_5: 1519.0000 Bleu_1: 0.4728 Bleu_1_w: 0.1168 Bleu_2: 0.3662 Bleu_2_w: 0.0904 Bleu_3: 0.2930 Bleu_3_w: 0.0723 Bleu_4: 0.2422 Bleu_4_w: 0.0598 CIDEr: 1.3705 CIDEr_w: 0.3384 METEOR: 0.2369 METEOR_w: 0.0585 Updating eval setup: not_talk_threshold: 0.1 -> 0.2 ER@0.50: 0.809 (S=698, C=1751, M=5869, R=161) ER@0.55: 0.837 (S=932, C=1517, M=5869, R=161) ER@0.60: 0.864 (S=1158, C=1291, M=5869, R=161) ER@0.65: 0.895 (S=1411, C=1038, M=5869, R=161) ER@0.70: 0.917 (S=1597, C=852, M=5869, R=161) ER@0.75: 0.943 (S=1814, C=635, M=5869, R=161) ER@0.80: 0.966 (S=2007, C=442, M=5869, R=161) ER@0.85: 0.982 (S=2139, C=310, M=5869, R=161) ER@0.90: 1.002 (S=2302, C=147, M=5869, R=161) ER@0.95: 1.012 (S=2387, C=62, M=5869, R=161) ER@1.00: 1.019 (S=2444, C=5, M=5869, R=161) Evalulation: assembly101-dialog_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: missing_rate: 0.7056 redundant_rate: 0.0617 match_cost: 0.4023 semantic_score: 0.6119 mean_error_rate: 0.9314 mean_error_rate_v2: 0.9137 jaccard_index: 0.2888 jaccard_index_v2: 0.0863 AP: 0.2804 AR: 0.0880 Avg-F1: 0.1339 num_matched: 2449.0000 num_missed: 5869.0000 num_redundant: 161.0000 num_correct_5: 1751.0000 Bleu_1: 0.4639 Bleu_1_w: 0.1340 Bleu_2: 0.3552 Bleu_2_w: 0.1026 Bleu_3: 0.2819 Bleu_3_w: 0.0814 Bleu_4: 0.2313 Bleu_4_w: 0.0668 CIDEr: 1.2826 CIDEr_w: 0.3704 METEOR: 0.2301 METEOR_w: 0.0665 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 ER@0.50: 0.806 (S=1159, C=2326, M=4833, R=711) ER@0.55: 0.851 (S=1533, C=1952, M=4833, R=711) ER@0.60: 0.895 (S=1897, C=1588, M=4833, R=711) ER@0.65: 0.937 (S=2251, C=1234, M=4833, R=711) ER@0.70: 0.966 (S=2495, C=990, M=4833, R=711) ER@0.75: 1.000 (S=2770, C=715, M=4833, R=711) ER@0.80: 1.027 (S=2999, C=486, M=4833, R=711) ER@0.85: 1.049 (S=3183, C=302, M=4833, R=711) ER@0.90: 1.069 (S=3348, C=137, M=4833, R=711) ER@0.95: 1.079 (S=3429, C=56, M=4833, R=711) ER@1.00: 1.085 (S=3482, C=3, M=4833, R=711) Evalulation: assembly101-dialog_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: missing_rate: 0.5810 redundant_rate: 0.1694 match_cost: 0.4482 semantic_score: 0.5854 mean_error_rate: 0.9785 mean_error_rate_v2: 0.9014 jaccard_index: 0.3860 jaccard_index_v2: 0.0986 AP: 0.2121 AR: 0.1070 Avg-F1: 0.1422 num_matched: 3485.0000 num_missed: 4833.0000 num_redundant: 711.0000 num_correct_5: 2326.0000 Bleu_1: 0.4411 Bleu_1_w: 0.1703 Bleu_2: 0.3273 Bleu_2_w: 0.1263 Bleu_3: 0.2531 Bleu_3_w: 0.0977 Bleu_4: 0.2032 Bleu_4_w: 0.0784 CIDEr: 1.0574 CIDEr_w: 0.4081 METEOR: 0.2128 METEOR_w: 0.0821 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 ER@0.50: 1.241 (S=2486, C=3515, M=2317, R=5522) ER@0.55: 1.328 (S=3209, C=2792, M=2317, R=5522) ER@0.60: 1.404 (S=3837, C=2164, M=2317, R=5522) ER@0.65: 1.469 (S=4381, C=1620, M=2317, R=5522) ER@0.70: 1.516 (S=4771, C=1230, M=2317, R=5522) ER@0.75: 1.560 (S=5136, C=865, M=2317, R=5522) ER@0.80: 1.597 (S=5449, C=552, M=2317, R=5522) ER@0.85: 1.623 (S=5663, C=338, M=2317, R=5522) ER@0.90: 1.646 (S=5851, C=150, M=2317, R=5522) ER@0.95: 1.657 (S=5947, C=54, M=2317, R=5522) ER@1.00: 1.664 (S=5998, C=3, M=2317, R=5522) Evalulation: assembly101-dialog_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: missing_rate: 0.2786 redundant_rate: 0.4792 match_cost: 0.5017 semantic_score: 0.5476 mean_error_rate: 1.5187 mean_error_rate_v2: 0.9127 jaccard_index: 0.4336 jaccard_index_v2: 0.0873 AP: 0.1048 AR: 0.1452 Avg-F1: 0.1217 num_matched: 6001.0000 num_missed: 2317.0000 num_redundant: 5522.0000 num_correct_5: 3515.0000 Bleu_1: 0.4078 Bleu_1_w: 0.1768 Bleu_2: 0.2904 Bleu_2_w: 0.1259 Bleu_3: 0.2163 Bleu_3_w: 0.0938 Bleu_4: 0.1677 Bleu_4_w: 0.0727 CIDEr: 0.8164 CIDEr_w: 0.3540 METEOR: 0.1891 METEOR_w: 0.0820 All Finished! Time: 100.57 minutes Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 Runs: ego4d/dialog_val_L0_I1|stream|4k|0.05|summarize_and_drop ego4d/dialog_val_L0_I1|stream|4k|0.1|summarize_and_drop holoassist/dialog_val_L0_I1|stream|4k|0.1|summarize_and_drop epickitchens/dialog_val_L0_I1|stream|4k|0.1|summarize_and_drop egoexolearn/dialog_val_L0_I1|stream|4k|0.1|summarize_and_drop wtag/dialog_val_L0_I1|stream|4k|0.1|summarize_and_drop assembly101/dialog_val_L0_I1|stream|4k|0.1|summarize_and_drop ego4d/dialog_val_L0_I1|stream|4k|0.2|summarize_and_drop holoassist/dialog_val_L0_I1|stream|4k|0.2|summarize_and_drop epickitchens/dialog_val_L0_I1|stream|4k|0.2|summarize_and_drop egoexolearn/dialog_val_L0_I1|stream|4k|0.2|summarize_and_drop wtag/dialog_val_L0_I1|stream|4k|0.2|summarize_and_drop assembly101/dialog_val_L0_I1|stream|4k|0.2|summarize_and_drop ego4d/dialog_val_L0_I1|stream|4k|0.3|summarize_and_drop holoassist/dialog_val_L0_I1|stream|4k|0.3|summarize_and_drop epickitchens/dialog_val_L0_I1|stream|4k|0.3|summarize_and_drop egoexolearn/dialog_val_L0_I1|stream|4k|0.3|summarize_and_drop wtag/dialog_val_L0_I1|stream|4k|0.3|summarize_and_drop assembly101/dialog_val_L0_I1|stream|4k|0.3|summarize_and_drop ego4d/dialog_val_L0_I1|stream|4k|0.4|summarize_and_drop holoassist/dialog_val_L0_I1|stream|4k|0.4|summarize_and_drop epickitchens/dialog_val_L0_I1|stream|4k|0.4|summarize_and_drop egoexolearn/dialog_val_L0_I1|stream|4k|0.4|summarize_and_drop wtag/dialog_val_L0_I1|stream|4k|0.4|summarize_and_drop assembly101/dialog_val_L0_I1|stream|4k|0.4|summarize_and_drop usage: eval.py [-h] --model_path MODEL_PATH --inference_setups INFERENCE_SETUPS [--data_root_dir DATA_ROOT_DIR] [--force_rerun [FORCE_RERUN]] [--fps FPS] [--sts_model_type STS_MODEL_TYPE] [--match_window_time MATCH_WINDOW_TIME] [--match_dist_func_factor MATCH_DIST_FUNC_FACTOR] [--match_dist_func_power MATCH_DIST_FUNC_POWER] --job_name JOB_NAME [--num_nodes NUM_NODES] [--tasks_per_node TASKS_PER_NODE] [--gpus_per_node GPUS_PER_NODE] [--cpus_per_node CPUS_PER_NODE] [--mem_gb MEM_GB] [--timeout_min TIMEOUT_MIN] [--partition PARTITION] [--account ACCOUNT] [--log_dir LOG_DIR] [--slurm_exclude SLURM_EXCLUDE] eval.py: error: argument --match_window_time: invalid float value: 'auto' Traceback (most recent call last): File "/opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/mmassist/eval/eval.py", line 144, in main(eval_args, slurm_args) File "/opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/mmassist/eval/eval.py", line 133, in main job.results() # wait for the job to finish File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 287, in results return [tp.cast(R, sub_job.result()) for sub_job in self._sub_jobs] File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 287, in return [tp.cast(R, sub_job.result()) for sub_job in self._sub_jobs] File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 266, in result r = self.results() File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 289, in results outcome, result = self._get_outcome_and_result() File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 384, in _get_outcome_and_result raise utils.UncompletedJobError("\n".join(message)) submitit.core.utils.UncompletedJobError: Job 14225 (task: 0) with path /opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/slurm_logs/14225/14225_0_result.pkl has not produced any output (state: CANCELLED by 649731) Error stream produced: ---------------------------------------- Loading checkpoint shards: 0%| | 0/4 [00:00 main(eval_args, slurm_args) File "/opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/mmassist/eval/eval.py", line 133, in main job.results() # wait for the job to finish File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 287, in results return [tp.cast(R, sub_job.result()) for sub_job in self._sub_jobs] File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 287, in return [tp.cast(R, sub_job.result()) for sub_job in self._sub_jobs] File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 266, in result r = self.results() File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/core.py", line 294, in results raise job_exception # pylint: disable=raising-bad-type submitit.core.utils.FailedJobError: Job (task=0) failed during processing with trace: ---------------------- Traceback (most recent call last): File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/submission.py", line 55, in process_job result = delayed.result() File "/data/home/imzyc/miniconda3/envs/mm/lib/python3.10/site-packages/submitit/core/utils.py", line 133, in result self._result = self.function(*self.args, **self.kwargs) File "/opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/mmassist/eval/eval.py", line 77, in run_eval evaluator_cls = evaluator_name_to_cls[evaluator_name] KeyError: '' ---------------------- You can check full logs with 'job.stderr(0)' and 'job.stdout(0)'or at paths: - /opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/slurm_logs/14289/14289_0_log.err - /opt/hpcaas/.mounts/fs-036153e63d56f4dc2/home/imzyc/project/proactive-assist/slurm_logs/14289/14289_0_log.out Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 {'assembly101/dialog-klg_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.5}]}, 'ego4d/dialog-klg_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.5}]}, 'ego4d/dialog_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.05}]}, 'egoexolearn/dialog-klg_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.5}]}, 'epickitchens/dialog-klg_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.5}]}, 'holoassist/dialog-klg_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.5}]}, 'wtag/dialog-klg_val_L0_I1': {'stream': [{'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.2}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.3}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.4}, {'context_handling_method': 'summarize_and_drop', 'eval_max_seq_len': 4096, 'eval_max_seq_len_str': '4k', 'inference_runner_type': 'stream', 'not_talk_threshold': 0.5}]}} Evaluation datasets: * ego4d/dialog_val | num samples: 96 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.05 Evalulation: ego4d-dialog_val_L0_I1/stream/notalk0.05-maxlen_4k Metrics: jaccard_index: 0.1410 missing_rate: 0.8247 redundant_rate: 0.0254 semantic_score: 0.7123 time_diff: 0.2867 precision: 0.7875 recall: 0.1417 F1: 0.2401 num_matched: 682.0000 num_mismatched: 162.0000 num_missed: 3970.0000 num_redundant: 22.0000 Bleu_1: 0.3991 Bleu_1_w: 0.0563 Bleu_2: 0.2909 Bleu_2_w: 0.0410 Bleu_3: 0.2258 Bleu_3_w: 0.0319 Bleu_4: 0.1818 Bleu_4_w: 0.0256 CIDEr: 1.1137 CIDEr_w: 0.1571 METEOR: 0.2074 METEOR_w: 0.0293 Evaluation datasets: * ego4d/dialog-klg_val | num samples: 96 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.2 Evalulation: ego4d-dialog-klg_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: jaccard_index: 0.2468 missing_rate: 0.6348 redundant_rate: 0.2078 semantic_score: 0.6974 time_diff: 1.1156 precision: 0.5868 recall: 0.2705 F1: 0.3703 num_matched: 1302.0000 num_mismatched: 456.0000 num_missed: 3056.0000 num_redundant: 461.0000 Bleu_1: 0.3854 Bleu_1_w: 0.0951 Bleu_2: 0.2700 Bleu_2_w: 0.0666 Bleu_3: 0.2003 Bleu_3_w: 0.0494 Bleu_4: 0.1556 Bleu_4_w: 0.0384 CIDEr: 0.8833 CIDEr_w: 0.2180 METEOR: 0.1913 METEOR_w: 0.0472 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 Evalulation: ego4d-dialog-klg_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: jaccard_index: 0.2743 missing_rate: 0.4904 redundant_rate: 0.3830 semantic_score: 0.6882 time_diff: 1.3104 precision: 0.4371 recall: 0.3610 F1: 0.3954 num_matched: 1738.0000 num_mismatched: 715.0000 num_missed: 2361.0000 num_redundant: 1523.0000 Bleu_1: 0.3652 Bleu_1_w: 0.1002 Bleu_2: 0.2507 Bleu_2_w: 0.0688 Bleu_3: 0.1837 Bleu_3_w: 0.0504 Bleu_4: 0.1416 Bleu_4_w: 0.0388 CIDEr: 0.7549 CIDEr_w: 0.2070 METEOR: 0.1825 METEOR_w: 0.0500 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 Evalulation: ego4d-dialog-klg_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: jaccard_index: 0.2248 missing_rate: 0.3635 redundant_rate: 0.6168 semantic_score: 0.6899 time_diff: 1.4115 precision: 0.2740 recall: 0.4551 F1: 0.3421 num_matched: 2191.0000 num_mismatched: 873.0000 num_missed: 1750.0000 num_redundant: 4932.0000 Bleu_1: 0.3759 Bleu_1_w: 0.0845 Bleu_2: 0.2574 Bleu_2_w: 0.0579 Bleu_3: 0.1876 Bleu_3_w: 0.0422 Bleu_4: 0.1437 Bleu_4_w: 0.0323 CIDEr: 0.7896 CIDEr_w: 0.1775 METEOR: 0.1780 METEOR_w: 0.0400 Updating eval setup: not_talk_threshold: 0.4 -> 0.5 Evalulation: ego4d-dialog-klg_val_L0_I1/stream/notalk0.5-maxlen_4k Metrics: jaccard_index: 0.1401 missing_rate: 0.2532 redundant_rate: 0.7929 semantic_score: 0.6834 time_diff: 1.3554 precision: 0.1499 recall: 0.5407 F1: 0.2348 num_matched: 2603.0000 num_mismatched: 992.0000 num_missed: 1219.0000 num_redundant: 13767.0000 Bleu_1: 0.3662 Bleu_1_w: 0.0513 Bleu_2: 0.2504 Bleu_2_w: 0.0351 Bleu_3: 0.1822 Bleu_3_w: 0.0255 Bleu_4: 0.1395 Bleu_4_w: 0.0195 CIDEr: 0.7510 CIDEr_w: 0.1052 METEOR: 0.1762 METEOR_w: 0.0247 Evaluation datasets: * holoassist/dialog-klg_val | num samples: 291 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.2 Evalulation: holoassist-dialog-klg_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: jaccard_index: 0.2309 missing_rate: 0.6792 redundant_rate: 0.0274 semantic_score: 0.7122 time_diff: 0.1928 precision: 0.7064 recall: 0.2330 F1: 0.3504 num_matched: 3556.0000 num_mismatched: 1340.0000 num_missed: 10365.0000 num_redundant: 138.0000 Bleu_1: 0.4523 Bleu_1_w: 0.1044 Bleu_2: 0.3360 Bleu_2_w: 0.0776 Bleu_3: 0.2622 Bleu_3_w: 0.0605 Bleu_4: 0.2096 Bleu_4_w: 0.0484 CIDEr: 1.3210 CIDEr_w: 0.3050 METEOR: 0.2230 METEOR_w: 0.0515 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 Evalulation: holoassist-dialog-klg_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: jaccard_index: 0.2914 missing_rate: 0.5893 redundant_rate: 0.0740 semantic_score: 0.7093 time_diff: 0.2722 precision: 0.6785 recall: 0.3010 F1: 0.4170 num_matched: 4593.0000 num_mismatched: 1675.0000 num_missed: 8993.0000 num_redundant: 501.0000 Bleu_1: 0.4412 Bleu_1_w: 0.1286 Bleu_2: 0.3238 Bleu_2_w: 0.0944 Bleu_3: 0.2501 Bleu_3_w: 0.0729 Bleu_4: 0.1980 Bleu_4_w: 0.0577 CIDEr: 1.2303 CIDEr_w: 0.3585 METEOR: 0.2185 METEOR_w: 0.0637 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 Evalulation: holoassist-dialog-klg_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: jaccard_index: 0.2395 missing_rate: 0.3510 redundant_rate: 0.5117 semantic_score: 0.6945 time_diff: 0.3960 precision: 0.3028 recall: 0.4025 F1: 0.3456 num_matched: 6142.0000 num_mismatched: 3763.0000 num_missed: 5356.0000 num_redundant: 10381.0000 Bleu_1: 0.4184 Bleu_1_w: 0.1002 Bleu_2: 0.2997 Bleu_2_w: 0.0718 Bleu_3: 0.2278 Bleu_3_w: 0.0546 Bleu_4: 0.1783 Bleu_4_w: 0.0427 CIDEr: 1.0872 CIDEr_w: 0.2604 METEOR: 0.1996 METEOR_w: 0.0478 Updating eval setup: not_talk_threshold: 0.4 -> 0.5 Evalulation: holoassist-dialog-klg_val_L0_I1/stream/notalk0.5-maxlen_4k Metrics: jaccard_index: 0.1543 missing_rate: 0.1765 redundant_rate: 0.7351 semantic_score: 0.6862 time_diff: 0.4228 precision: 0.1630 recall: 0.5068 F1: 0.2467 num_matched: 7735.0000 num_mismatched: 4833.0000 num_missed: 2693.0000 num_redundant: 34874.0000 Bleu_1: 0.3959 Bleu_1_w: 0.0611 Bleu_2: 0.2795 Bleu_2_w: 0.0431 Bleu_3: 0.2109 Bleu_3_w: 0.0325 Bleu_4: 0.1632 Bleu_4_w: 0.0252 CIDEr: 0.9431 CIDEr_w: 0.1455 METEOR: 0.1880 METEOR_w: 0.0290 Evaluation datasets: * epickitchens/dialog-klg_val | num samples: 150 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.2 Evalulation: epickitchens-dialog-klg_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: jaccard_index: 0.2117 missing_rate: 0.6014 redundant_rate: 0.2852 semantic_score: 0.6767 time_diff: 0.4677 precision: 0.4399 recall: 0.2453 F1: 0.3150 num_matched: 1578.0000 num_mismatched: 986.0000 num_missed: 3868.0000 num_redundant: 1023.0000 Bleu_1: 0.3939 Bleu_1_w: 0.0834 Bleu_2: 0.2761 Bleu_2_w: 0.0584 Bleu_3: 0.2048 Bleu_3_w: 0.0433 Bleu_4: 0.1586 Bleu_4_w: 0.0336 CIDEr: 1.0914 CIDEr_w: 0.2310 METEOR: 0.1975 METEOR_w: 0.0418 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 Evalulation: epickitchens-dialog-klg_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: jaccard_index: 0.2132 missing_rate: 0.3789 redundant_rate: 0.5009 semantic_score: 0.6657 time_diff: 0.6098 precision: 0.2781 recall: 0.3461 F1: 0.3084 num_matched: 2226.0000 num_mismatched: 1769.0000 num_missed: 2437.0000 num_redundant: 4010.0000 Bleu_1: 0.3712 Bleu_1_w: 0.0791 Bleu_2: 0.2508 Bleu_2_w: 0.0535 Bleu_3: 0.1796 Bleu_3_w: 0.0383 Bleu_4: 0.1355 Bleu_4_w: 0.0289 CIDEr: 0.9349 CIDEr_w: 0.1993 METEOR: 0.1834 METEOR_w: 0.0391 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 Evalulation: epickitchens-dialog-klg_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: jaccard_index: 0.1429 missing_rate: 0.1957 redundant_rate: 0.7132 semantic_score: 0.6608 time_diff: 0.5977 precision: 0.1529 recall: 0.4288 F1: 0.2254 num_matched: 2758.0000 num_mismatched: 2415.0000 num_missed: 1259.0000 num_redundant: 12864.0000 Bleu_1: 0.3656 Bleu_1_w: 0.0523 Bleu_2: 0.2397 Bleu_2_w: 0.0343 Bleu_3: 0.1664 Bleu_3_w: 0.0238 Bleu_4: 0.1222 Bleu_4_w: 0.0175 CIDEr: 0.8853 CIDEr_w: 0.1265 METEOR: 0.1771 METEOR_w: 0.0253 Updating eval setup: not_talk_threshold: 0.4 -> 0.5 Evalulation: epickitchens-dialog-klg_val_L0_I1/stream/notalk0.5-maxlen_4k Metrics: jaccard_index: 0.1049 missing_rate: 0.0973 redundant_rate: 0.8029 semantic_score: 0.6564 time_diff: 0.5558 precision: 0.1071 recall: 0.4905 F1: 0.1758 num_matched: 3155.0000 num_mismatched: 2651.0000 num_missed: 626.0000 num_redundant: 23656.0000 Bleu_1: 0.3567 Bleu_1_w: 0.0374 Bleu_2: 0.2318 Bleu_2_w: 0.0243 Bleu_3: 0.1588 Bleu_3_w: 0.0167 Bleu_4: 0.1153 Bleu_4_w: 0.0121 CIDEr: 0.8132 CIDEr_w: 0.0853 METEOR: 0.1724 METEOR_w: 0.0181 Evaluation datasets: * egoexolearn/dialog-klg_val | num samples: 123 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.2 Evalulation: egoexolearn-dialog-klg_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: jaccard_index: 0.1387 missing_rate: 0.8104 redundant_rate: 0.1285 semantic_score: 0.7007 time_diff: 0.2868 precision: 0.6557 recall: 0.1426 F1: 0.2343 num_matched: 1710.0000 num_mismatched: 563.0000 num_missed: 9718.0000 num_redundant: 335.0000 Bleu_1: 0.4345 Bleu_1_w: 0.0603 Bleu_2: 0.3128 Bleu_2_w: 0.0434 Bleu_3: 0.2377 Bleu_3_w: 0.0330 Bleu_4: 0.1861 Bleu_4_w: 0.0258 CIDEr: 1.0884 CIDEr_w: 0.1510 METEOR: 0.2044 METEOR_w: 0.0284 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 Evalulation: egoexolearn-dialog-klg_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: jaccard_index: 0.1610 missing_rate: 0.7390 redundant_rate: 0.2805 semantic_score: 0.6830 time_diff: 0.5484 precision: 0.4890 recall: 0.1774 F1: 0.2603 num_matched: 2127.0000 num_mismatched: 1003.0000 num_missed: 8861.0000 num_redundant: 1220.0000 Bleu_1: 0.4062 Bleu_1_w: 0.0654 Bleu_2: 0.2817 Bleu_2_w: 0.0454 Bleu_3: 0.2067 Bleu_3_w: 0.0333 Bleu_4: 0.1572 Bleu_4_w: 0.0253 CIDEr: 0.8980 CIDEr_w: 0.1446 METEOR: 0.1898 METEOR_w: 0.0306 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 Evalulation: egoexolearn-dialog-klg_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: jaccard_index: 0.1324 missing_rate: 0.5542 redundant_rate: 0.6966 semantic_score: 0.6658 time_diff: 0.6759 precision: 0.1823 recall: 0.2679 F1: 0.2170 num_matched: 3212.0000 num_mismatched: 2133.0000 num_missed: 6646.0000 num_redundant: 12271.0000 Bleu_1: 0.3942 Bleu_1_w: 0.0522 Bleu_2: 0.2667 Bleu_2_w: 0.0353 Bleu_3: 0.1900 Bleu_3_w: 0.0252 Bleu_4: 0.1404 Bleu_4_w: 0.0186 CIDEr: 0.7886 CIDEr_w: 0.1044 METEOR: 0.1764 METEOR_w: 0.0234 Updating eval setup: not_talk_threshold: 0.4 -> 0.5 Evalulation: egoexolearn-dialog-klg_val_L0_I1/stream/notalk0.5-maxlen_4k Metrics: jaccard_index: 0.0834 missing_rate: 0.2899 redundant_rate: 0.8431 semantic_score: 0.6500 time_diff: 0.6787 precision: 0.0887 recall: 0.4014 F1: 0.1453 num_matched: 4813.0000 num_mismatched: 3702.0000 num_missed: 3476.0000 num_redundant: 45741.0000 Bleu_1: 0.3714 Bleu_1_w: 0.0310 Bleu_2: 0.2428 Bleu_2_w: 0.0202 Bleu_3: 0.1678 Bleu_3_w: 0.0140 Bleu_4: 0.1212 Bleu_4_w: 0.0101 CIDEr: 0.6385 CIDEr_w: 0.0532 METEOR: 0.1630 METEOR_w: 0.0136 Evaluation datasets: * wtag/dialog-klg_val | num samples: 21 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.2 Evalulation: wtag-dialog-klg_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: jaccard_index: 0.2066 missing_rate: 0.4949 redundant_rate: 0.1597 semantic_score: 0.6875 time_diff: 0.4794 precision: 0.3767 recall: 0.2265 F1: 0.2829 num_matched: 243.0000 num_mismatched: 299.0000 num_missed: 531.0000 num_redundant: 103.0000 Bleu_1: 0.3329 Bleu_1_w: 0.0688 Bleu_2: 0.2397 Bleu_2_w: 0.0495 Bleu_3: 0.1807 Bleu_3_w: 0.0373 Bleu_4: 0.1397 Bleu_4_w: 0.0289 CIDEr: 0.9238 CIDEr_w: 0.1909 METEOR: 0.2236 METEOR_w: 0.0462 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 Evalulation: wtag-dialog-klg_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: jaccard_index: 0.2219 missing_rate: 0.4473 redundant_rate: 0.2238 semantic_score: 0.6947 time_diff: 0.5145 precision: 0.3613 recall: 0.2572 F1: 0.3005 num_matched: 276.0000 num_mismatched: 317.0000 num_missed: 480.0000 num_redundant: 171.0000 Bleu_1: 0.3682 Bleu_1_w: 0.0817 Bleu_2: 0.2718 Bleu_2_w: 0.0603 Bleu_3: 0.2092 Bleu_3_w: 0.0464 Bleu_4: 0.1651 Bleu_4_w: 0.0366 CIDEr: 1.1512 CIDEr_w: 0.2554 METEOR: 0.2267 METEOR_w: 0.0503 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 Evalulation: wtag-dialog-klg_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: jaccard_index: 0.2564 missing_rate: 0.3327 redundant_rate: 0.2701 semantic_score: 0.6876 time_diff: 0.8907 precision: 0.3496 recall: 0.3197 F1: 0.3340 num_matched: 343.0000 num_mismatched: 373.0000 num_missed: 357.0000 num_redundant: 265.0000 Bleu_1: 0.3570 Bleu_1_w: 0.0915 Bleu_2: 0.2619 Bleu_2_w: 0.0671 Bleu_3: 0.2009 Bleu_3_w: 0.0515 Bleu_4: 0.1585 Bleu_4_w: 0.0406 CIDEr: 0.9873 CIDEr_w: 0.2531 METEOR: 0.2162 METEOR_w: 0.0554 Updating eval setup: not_talk_threshold: 0.4 -> 0.5 Evalulation: wtag-dialog-klg_val_L0_I1/stream/notalk0.5-maxlen_4k Metrics: jaccard_index: 0.2339 missing_rate: 0.3094 redundant_rate: 0.4072 semantic_score: 0.6828 time_diff: 0.9459 precision: 0.2960 recall: 0.3448 F1: 0.3186 num_matched: 370.0000 num_mismatched: 371.0000 num_missed: 332.0000 num_redundant: 509.0000 Bleu_1: 0.3504 Bleu_1_w: 0.0820 Bleu_2: 0.2562 Bleu_2_w: 0.0599 Bleu_3: 0.1955 Bleu_3_w: 0.0457 Bleu_4: 0.1535 Bleu_4_w: 0.0359 CIDEr: 0.9531 CIDEr_w: 0.2229 METEOR: 0.2106 METEOR_w: 0.0493 Evaluation datasets: * assembly101/dialog-klg_val | num samples: 336 Updating eval setup: inference_runner_type: None -> stream Updating eval setup: not_talk_threshold: 0.5 -> 0.2 Evalulation: assembly101-dialog-klg_val_L0_I1/stream/notalk0.2-maxlen_4k Metrics: jaccard_index: 0.2184 missing_rate: 0.7032 redundant_rate: 0.0746 semantic_score: 0.7307 time_diff: 0.1704 precision: 0.6972 recall: 0.2236 F1: 0.3386 num_matched: 1860.0000 num_mismatched: 609.0000 num_missed: 5849.0000 num_redundant: 199.0000 Bleu_1: 0.4928 Bleu_1_w: 0.1076 Bleu_2: 0.3895 Bleu_2_w: 0.0851 Bleu_3: 0.3180 Bleu_3_w: 0.0695 Bleu_4: 0.2670 Bleu_4_w: 0.0583 CIDEr: 1.5431 CIDEr_w: 0.3370 METEOR: 0.2440 METEOR_w: 0.0533 Updating eval setup: not_talk_threshold: 0.2 -> 0.3 Evalulation: assembly101-dialog-klg_val_L0_I1/stream/notalk0.3-maxlen_4k Metrics: jaccard_index: 0.2826 missing_rate: 0.5628 redundant_rate: 0.1831 semantic_score: 0.7158 time_diff: 0.4370 precision: 0.5797 recall: 0.3103 F1: 0.4042 num_matched: 2581.0000 num_mismatched: 1056.0000 num_missed: 4681.0000 num_redundant: 815.0000 Bleu_1: 0.4638 Bleu_1_w: 0.1311 Bleu_2: 0.3574 Bleu_2_w: 0.1010 Bleu_3: 0.2848 Bleu_3_w: 0.0805 Bleu_4: 0.2343 Bleu_4_w: 0.0662 CIDEr: 1.3007 CIDEr_w: 0.3676 METEOR: 0.2275 METEOR_w: 0.0643 Updating eval setup: not_talk_threshold: 0.3 -> 0.4 Evalulation: assembly101-dialog-klg_val_L0_I1/stream/notalk0.4-maxlen_4k Metrics: jaccard_index: 0.2835 missing_rate: 0.2888 redundant_rate: 0.4715 semantic_score: 0.7024 time_diff: 0.7532 precision: 0.3443 recall: 0.4633 F1: 0.3951 num_matched: 3854.0000 num_mismatched: 2062.0000 num_missed: 2402.0000 num_redundant: 5277.0000 Bleu_1: 0.4325 Bleu_1_w: 0.1226 Bleu_2: 0.3229 Bleu_2_w: 0.0915 Bleu_3: 0.2512 Bleu_3_w: 0.0712 Bleu_4: 0.2029 Bleu_4_w: 0.0575 CIDEr: 1.1226 CIDEr_w: 0.3182 METEOR: 0.2074 METEOR_w: 0.0588 Updating eval setup: not_talk_threshold: 0.4 -> 0.5 Evalulation: assembly101-dialog-klg_val_L0_I1/stream/notalk0.5-maxlen_4k Metrics: jaccard_index: 0.1584 missing_rate: 0.1077 redundant_rate: 0.7417 semantic_score: 0.6834 time_diff: 0.7472 precision: 0.1634 recall: 0.5644 F1: 0.2534 num_matched: 4695.0000 num_mismatched: 2727.0000 num_missed: 896.0000 num_redundant: 21314.0000 Bleu_1: 0.4066 Bleu_1_w: 0.0644 Bleu_2: 0.2959 Bleu_2_w: 0.0469 Bleu_3: 0.2251 Bleu_3_w: 0.0357 Bleu_4: 0.1783 Bleu_4_w: 0.0282 CIDEr: 0.9545 CIDEr_w: 0.1512 METEOR: 0.1920 METEOR_w: 0.0304 All Finished! Time: 70.23 minutes Model: /fsx_0/user/imzyc/proact_exps/20240821-L4096-I1-ep4-NOSEP-nr0.1-klgmix-1s-lora-bs256 Runs: ego4d/dialog_val_L0_I1|stream|4k|0.05|summarize_and_drop ego4d/dialog-klg_val_L0_I1|stream|4k|0.2|summarize_and_drop holoassist/dialog-klg_val_L0_I1|stream|4k|0.2|summarize_and_drop epickitchens/dialog-klg_val_L0_I1|stream|4k|0.2|summarize_and_drop egoexolearn/dialog-klg_val_L0_I1|stream|4k|0.2|summarize_and_drop wtag/dialog-klg_val_L0_I1|stream|4k|0.2|summarize_and_drop assembly101/dialog-klg_val_L0_I1|stream|4k|0.2|summarize_and_drop ego4d/dialog-klg_val_L0_I1|stream|4k|0.3|summarize_and_drop holoassist/dialog-klg_val_L0_I1|stream|4k|0.3|summarize_and_drop epickitchens/dialog-klg_val_L0_I1|stream|4k|0.3|summarize_and_drop egoexolearn/dialog-klg_val_L0_I1|stream|4k|0.3|summarize_and_drop wtag/dialog-klg_val_L0_I1|stream|4k|0.3|summarize_and_drop assembly101/dialog-klg_val_L0_I1|stream|4k|0.3|summarize_and_drop ego4d/dialog-klg_val_L0_I1|stream|4k|0.4|summarize_and_drop holoassist/dialog-klg_val_L0_I1|stream|4k|0.4|summarize_and_drop epickitchens/dialog-klg_val_L0_I1|stream|4k|0.4|summarize_and_drop egoexolearn/dialog-klg_val_L0_I1|stream|4k|0.4|summarize_and_drop wtag/dialog-klg_val_L0_I1|stream|4k|0.4|summarize_and_drop assembly101/dialog-klg_val_L0_I1|stream|4k|0.4|summarize_and_drop ego4d/dialog-klg_val_L0_I1|stream|4k|0.5|summarize_and_drop holoassist/dialog-klg_val_L0_I1|stream|4k|0.5|summarize_and_drop epickitchens/dialog-klg_val_L0_I1|stream|4k|0.5|summarize_and_drop egoexolearn/dialog-klg_val_L0_I1|stream|4k|0.5|summarize_and_drop wtag/dialog-klg_val_L0_I1|stream|4k|0.5|summarize_and_drop assembly101/dialog-klg_val_L0_I1|stream|4k|0.5|summarize_and_drop