diff --git "a/leaderboard_data.json" "b/leaderboard_data.json" --- "a/leaderboard_data.json" +++ "b/leaderboard_data.json" @@ -1,6 +1,6 @@ { "metadata": { - "generated_at": "2026-03-16T21:12:50Z", + "generated_at": "2026-03-18T04:26:27Z", "scoring_dimensions": [ "semantic_relevance", "factual_accuracy", @@ -105,7 +105,8 @@ "score_file": "data/content-scores/tavily/qwen_scoring_for_tavily_few_shot.json", "rank_file": "data/rank-scores/tavily/geo_scores.json" } - ] + ], + "source_of_truth": "Scoring Sheet - Overall-Result-Metric.csv and Scoring Sheet - Overall-Result-Category.csv" }, "overall": [ { @@ -113,16 +114,16 @@ "num_sources": 316, "num_queries": 93, "num_complete_scores": 308, - "unweighted_mean_score": 4.464240506329115, - "weighted_total_content_score": 88.15456362425039, - "semantic_relevance": 3.922829581993569, - "factual_accuracy": 4.771704180064309, - "freshness": 4.490445859872612, - "objectivity_tone": 4.546623794212219, - "layout_ad_density": 4.015923566878981, - "accountability": 4.43312101910828, - "transparency": 4.792993630573249, - "authority": 4.735668789808917, + "unweighted_mean_score": 4.462, + "weighted_total_content_score": 89.081, + "semantic_relevance": 3.923, + "factual_accuracy": 4.772, + "freshness": 4.49, + "objectivity_tone": 4.547, + "layout_ad_density": 4.016, + "accountability": 4.433, + "transparency": 4.793, + "authority": 4.736, "avg_ge_freq": 0.6740462025316463, "relative_se_rank": 2.0599499775368484, "normalized_reciprocal_se_rank": 0.07444993783835872, @@ -130,43 +131,21 @@ "percentage_ge_sources_not_in_se_sources": 87.34177215189875, "percentage_ge_sources_in_se_sources": 12.658227848101266 }, - { - "model_name": "grok-4.1-fast-non-reasoning", - "num_sources": 308, - "num_queries": 80, - "num_complete_scores": 307, - "unweighted_mean_score": 4.152687296416938, - "weighted_total_content_score": 83.11004784688996, - "semantic_relevance": 4.263843648208469, - "factual_accuracy": 4.501628664495114, - "freshness": 4.192182410423452, - "objectivity_tone": 4.003257328990228, - "layout_ad_density": 3.6319218241042344, - "accountability": 4.0, - "transparency": 4.299674267100977, - "authority": 4.328990228013029, - "avg_ge_freq": 0.7316051948051939, - "relative_se_rank": 1.7286246441730713, - "normalized_reciprocal_se_rank": 0.14088939196167136, - "reciprocal_se_rank": 0.043563227680110374, - "percentage_ge_sources_not_in_se_sources": 69.15584415584416, - "percentage_ge_sources_in_se_sources": 30.844155844155843 - }, { "model_name": "gpt-4o", "num_sources": 294, "num_queries": 88, "num_complete_scores": 294, - "unweighted_mean_score": 4.066751700680272, - "weighted_total_content_score": 81.5180809165772, - "semantic_relevance": 4.241496598639456, - "factual_accuracy": 4.207482993197279, - "freshness": 4.523809523809524, - "objectivity_tone": 3.925170068027211, - "layout_ad_density": 3.3435374149659864, - "accountability": 3.9625850340136055, - "transparency": 4.149659863945578, - "authority": 4.180272108843537, + "unweighted_mean_score": 4.067, + "weighted_total_content_score": 81.518, + "semantic_relevance": 4.241, + "factual_accuracy": 4.207, + "freshness": 4.524, + "objectivity_tone": 3.925, + "layout_ad_density": 3.344, + "accountability": 4.15, + "transparency": 3.963, + "authority": 4.18, "avg_ge_freq": 0.46483707482993103, "relative_se_rank": 1.8249937074261993, "normalized_reciprocal_se_rank": 0.12262328761538778, @@ -175,130 +154,42 @@ "percentage_ge_sources_in_se_sources": 25.170068027210874 }, { - "model_name": "gensee", - "num_sources": 382, - "num_queries": 93, - "num_complete_scores": 377, - "unweighted_mean_score": 4.066835395575553, - "weighted_total_content_score": 81.48250206668509, - "semantic_relevance": 4.431578947368421, - "factual_accuracy": 4.426315789473684, - "freshness": 4.343832020997375, - "objectivity_tone": 3.8947368421052633, - "layout_ad_density": 3.339522546419098, - "accountability": 3.9658792650918637, - "transparency": 4.020997375328084, - "authority": 4.091863517060368, - "avg_ge_freq": 0.5340209424083775, - "relative_se_rank": 1.7669587654960388, - "normalized_reciprocal_se_rank": 0.1299587450378004, - "reciprocal_se_rank": 0.040936688734811204, - "percentage_ge_sources_not_in_se_sources": 71.46596858638743, - "percentage_ge_sources_in_se_sources": 28.53403141361256 - }, - { - "model_name": "deepseek-chat-gensee", - "num_sources": 82, - "num_queries": 19, - "num_complete_scores": 76, - "unweighted_mean_score": 4.26378842676311, - "weighted_total_content_score": 81.1168164313222, - "semantic_relevance": 4.243589743589744, - "factual_accuracy": 4.564102564102564, - "freshness": 4.423076923076923, - "objectivity_tone": 4.153846153846154, - "layout_ad_density": 3.9220779220779223, - "accountability": 3.9871794871794872, - "transparency": 4.32051282051282, - "authority": 4.461538461538462, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.9224627653967212, - "normalized_reciprocal_se_rank": 0.12477656633162594, - "reciprocal_se_rank": 0.03969145647289068, - "percentage_ge_sources_not_in_se_sources": 74.390243902439, - "percentage_ge_sources_in_se_sources": 25.609756097560982 - }, - { - "model_name": "deepseek-reasoning-tavily", - "num_sources": 62, - "num_queries": 19, - "num_complete_scores": 58, - "unweighted_mean_score": 4.282327586206897, - "weighted_total_content_score": 80.1018675721562, - "semantic_relevance": 4.189655172413793, - "factual_accuracy": 4.603448275862069, - "freshness": 4.396551724137931, - "objectivity_tone": 4.0344827586206895, - "layout_ad_density": 3.7241379310344827, - "accountability": 4.103448275862069, - "transparency": 4.551724137931035, - "authority": 4.655172413793103, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.4716965265158806, - "normalized_reciprocal_se_rank": 0.2609989378665101, - "reciprocal_se_rank": 0.07242450206015655, - "percentage_ge_sources_not_in_se_sources": 56.45161290322581, - "percentage_ge_sources_in_se_sources": 43.54838709677419 - }, - { - "model_name": "exa", - "num_sources": 425, - "num_queries": 99, - "num_complete_scores": 421, - "unweighted_mean_score": 4.0269747899159665, - "weighted_total_content_score": 80.06439628482967, - "semantic_relevance": 3.6485849056603774, - "factual_accuracy": 4.120283018867925, - "freshness": 4.345882352941176, - "objectivity_tone": 4.023584905660377, - "layout_ad_density": 3.390995260663507, - "accountability": 4.124705882352941, - "transparency": 4.305882352941176, - "authority": 4.24, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.4974151993912352, - "normalized_reciprocal_se_rank": 0.20452762803005117, - "reciprocal_se_rank": 0.058854939745084926, - "percentage_ge_sources_not_in_se_sources": 57.176470588235304, - "percentage_ge_sources_in_se_sources": 42.8235294117647 - }, - { - "model_name": "google-search", - "num_sources": 406, - "num_queries": 99, - "num_complete_scores": 403, - "unweighted_mean_score": 4.000307881773399, - "weighted_total_content_score": 79.88073632356752, - "semantic_relevance": 3.9482758620689653, - "factual_accuracy": 4.221674876847291, - "freshness": 4.059113300492611, - "objectivity_tone": 3.7758620689655173, - "layout_ad_density": 3.7493796526054592, - "accountability": 3.8793103448275863, - "transparency": 4.1330049261083746, - "authority": 4.231527093596059, - "avg_ge_freq": null, - "relative_se_rank": null, - "normalized_reciprocal_se_rank": null, - "reciprocal_se_rank": null, - "percentage_ge_sources_not_in_se_sources": null, - "percentage_ge_sources_in_se_sources": null + "model_name": "grok-4.1-fast-non-reasoning", + "num_sources": 308, + "num_queries": 80, + "num_complete_scores": 307, + "unweighted_mean_score": 4.153, + "weighted_total_content_score": 83.381, + "semantic_relevance": 4.264, + "factual_accuracy": 4.502, + "freshness": 4.192, + "objectivity_tone": 4.003, + "layout_ad_density": 3.632, + "accountability": 4.0, + "transparency": 4.3, + "authority": 4.329, + "avg_ge_freq": 0.7316051948051939, + "relative_se_rank": 1.7286246441730713, + "normalized_reciprocal_se_rank": 0.14088939196167136, + "reciprocal_se_rank": 0.043563227680110374, + "percentage_ge_sources_not_in_se_sources": 69.15584415584416, + "percentage_ge_sources_in_se_sources": 30.844155844155843 }, { "model_name": "claude", "num_sources": 259, "num_queries": 84, "num_complete_scores": 253, - "unweighted_mean_score": 4.0607707509881426, - "weighted_total_content_score": 79.39849624060149, - "semantic_relevance": 4.209486166007905, - "factual_accuracy": 4.217391304347826, - "freshness": 4.446640316205533, - "objectivity_tone": 3.8181818181818183, - "layout_ad_density": 3.33596837944664, - "accountability": 4.201581027667984, - "transparency": 4.177865612648222, - "authority": 4.07905138339921, + "unweighted_mean_score": 4.061, + "weighted_total_content_score": 81.282, + "semantic_relevance": 4.209, + "factual_accuracy": 4.217, + "freshness": 4.447, + "objectivity_tone": 3.818, + "layout_ad_density": 3.336, + "accountability": 4.202, + "transparency": 4.178, + "authority": 4.079, "avg_ge_freq": 0.8146749034749037, "relative_se_rank": 1.587931757339867, "normalized_reciprocal_se_rank": 0.1641520228549373, @@ -311,16 +202,16 @@ "num_sources": 444, "num_queries": 97, "num_complete_scores": 427, - "unweighted_mean_score": 3.985254756530152, - "weighted_total_content_score": 78.20056899004268, - "semantic_relevance": 3.63302752293578, - "factual_accuracy": 4.13302752293578, - "freshness": 4.742596810933941, - "objectivity_tone": 3.8509174311926606, - "layout_ad_density": 3.502283105022831, - "accountability": 3.8906605922551254, - "transparency": 4.14123006833713, - "authority": 3.9931662870159452, + "unweighted_mean_score": 3.988, + "weighted_total_content_score": 79.379, + "semantic_relevance": 3.633, + "factual_accuracy": 4.133, + "freshness": 4.743, + "objectivity_tone": 3.851, + "layout_ad_density": 3.502, + "accountability": 3.891, + "transparency": 4.141, + "authority": 3.993, "avg_ge_freq": 0.5082466216216226, "relative_se_rank": 1.9513731817138333, "normalized_reciprocal_se_rank": 0.08733801529571, @@ -328,21 +219,43 @@ "percentage_ge_sources_not_in_se_sources": 79.50450450450452, "percentage_ge_sources_in_se_sources": 20.495495495495497 }, + { + "model_name": "Gemini-3-Flash-Preview", + "num_sources": 456, + "num_queries": 99, + "num_complete_scores": 438, + "unweighted_mean_score": 3.96, + "weighted_total_content_score": 78.981, + "semantic_relevance": 3.633, + "factual_accuracy": 4.1, + "freshness": 4.459, + "objectivity_tone": 3.925, + "layout_ad_density": 3.416, + "accountability": 4.057, + "transparency": 4.097, + "authority": 3.962, + "avg_ge_freq": 0.5219184210526322, + "relative_se_rank": 1.907622173881119, + "normalized_reciprocal_se_rank": 0.10236888641393292, + "reciprocal_se_rank": 0.03430708678393044, + "percentage_ge_sources_not_in_se_sources": 76.7543859649123, + "percentage_ge_sources_in_se_sources": 23.245614035087723 + }, { "model_name": "Gemini-2.5-Flash-Preview", "num_sources": 444, "num_queries": 98, "num_complete_scores": 441, - "unweighted_mean_score": 3.931531531531532, - "weighted_total_content_score": 78.0298719772404, - "semantic_relevance": 3.520361990950226, - "factual_accuracy": 3.995475113122172, - "freshness": 4.444695259593679, - "objectivity_tone": 3.8981900452488687, - "layout_ad_density": 3.3355855855855854, - "accountability": 4.054176072234763, - "transparency": 4.162528216704289, - "authority": 4.060948081264108, + "unweighted_mean_score": 3.938, + "weighted_total_content_score": 78.339, + "semantic_relevance": 3.52, + "factual_accuracy": 3.995, + "freshness": 4.445, + "objectivity_tone": 3.898, + "layout_ad_density": 3.336, + "accountability": 4.054, + "transparency": 4.163, + "authority": 4.061, "avg_ge_freq": null, "relative_se_rank": null, "normalized_reciprocal_se_rank": null, @@ -355,16 +268,16 @@ "num_sources": 379, "num_queries": 98, "num_complete_scores": 373, - "unweighted_mean_score": 3.950208412277378, - "weighted_total_content_score": 77.85307596167198, - "semantic_relevance": 3.6906666666666665, - "factual_accuracy": 4.074666666666666, - "freshness": 4.358090185676392, - "objectivity_tone": 3.6186666666666665, - "layout_ad_density": 3.6426666666666665, - "accountability": 3.806366047745358, - "transparency": 4.220159151193634, - "authority": 4.183023872679045, + "unweighted_mean_score": 3.952, + "weighted_total_content_score": 78.547, + "semantic_relevance": 3.691, + "factual_accuracy": 4.075, + "freshness": 4.358, + "objectivity_tone": 3.619, + "layout_ad_density": 3.643, + "accountability": 3.806, + "transparency": 4.22, + "authority": 4.183, "avg_ge_freq": 0.8135451187335091, "relative_se_rank": 1.6003800744631216, "normalized_reciprocal_se_rank": 0.16776928125494017, @@ -373,42 +286,64 @@ "percentage_ge_sources_in_se_sources": 39.05013192612139 }, { - "model_name": "Gemini-3-Flash-Preview", - "num_sources": 456, + "model_name": "google-search", + "num_sources": 406, "num_queries": 99, - "num_complete_scores": 438, - "unweighted_mean_score": 3.9539324960753532, - "weighted_total_content_score": 77.48153277931671, - "semantic_relevance": 3.6334841628959276, - "factual_accuracy": 4.099547511312217, - "freshness": 4.459161147902869, - "objectivity_tone": 3.925339366515837, - "layout_ad_density": 3.415929203539823, - "accountability": 4.057395143487859, - "transparency": 4.097130242825607, - "authority": 3.9624724061810155, - "avg_ge_freq": 0.5219184210526322, - "relative_se_rank": 1.907622173881119, - "normalized_reciprocal_se_rank": 0.10236888641393292, - "reciprocal_se_rank": 0.03430708678393044, - "percentage_ge_sources_not_in_se_sources": 76.7543859649123, - "percentage_ge_sources_in_se_sources": 23.245614035087723 + "num_complete_scores": 403, + "unweighted_mean_score": 4.0, + "weighted_total_content_score": 79.939, + "semantic_relevance": 3.948, + "factual_accuracy": 4.222, + "freshness": 4.059, + "objectivity_tone": 3.776, + "layout_ad_density": 3.749, + "accountability": 4.133, + "transparency": 3.879, + "authority": 4.232, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null + }, + { + "model_name": "exa", + "num_sources": 425, + "num_queries": 99, + "num_complete_scores": 421, + "unweighted_mean_score": 4.021, + "weighted_total_content_score": 80.108, + "semantic_relevance": 3.649, + "factual_accuracy": 4.12, + "freshness": 4.346, + "objectivity_tone": 4.024, + "layout_ad_density": 3.391, + "accountability": 4.125, + "transparency": 4.306, + "authority": 4.24, + "avg_ge_freq": 1.0, + "relative_se_rank": 1.4974151993912352, + "normalized_reciprocal_se_rank": 0.20452762803005117, + "reciprocal_se_rank": 0.058854939745084926, + "percentage_ge_sources_not_in_se_sources": 57.176470588235304, + "percentage_ge_sources_in_se_sources": 42.8235294117647 }, { "model_name": "tavily", "num_sources": 395, "num_queries": 97, "num_complete_scores": 389, - "unweighted_mean_score": 3.9243911304980004, - "weighted_total_content_score": 77.37774816788804, - "semantic_relevance": 3.544757033248082, - "factual_accuracy": 4.0664961636828645, - "freshness": 4.447570332480819, - "objectivity_tone": 3.8005115089514065, - "layout_ad_density": 3.3324808184143224, - "accountability": 4.043478260869565, - "transparency": 4.033248081841432, - "authority": 4.156010230179028, + "unweighted_mean_score": 3.933, + "weighted_total_content_score": 78.268, + "semantic_relevance": 3.545, + "factual_accuracy": 4.066, + "freshness": 4.448, + "objectivity_tone": 3.801, + "layout_ad_density": 3.332, + "accountability": 4.043, + "transparency": 4.033, + "authority": 4.156, "avg_ge_freq": 0.9864979746835443, "relative_se_rank": 1.2450772341845837, "normalized_reciprocal_se_rank": 0.2743286477154668, @@ -416,6 +351,72 @@ "percentage_ge_sources_not_in_se_sources": 45.316455696202524, "percentage_ge_sources_in_se_sources": 54.683544303797476 }, + { + "model_name": "gensee", + "num_sources": 382, + "num_queries": 93, + "num_complete_scores": 377, + "unweighted_mean_score": 4.06, + "weighted_total_content_score": 81.795, + "semantic_relevance": 4.432, + "factual_accuracy": 4.426, + "freshness": 4.344, + "objectivity_tone": 3.895, + "layout_ad_density": 3.34, + "accountability": 3.966, + "transparency": 4.021, + "authority": 4.092, + "avg_ge_freq": 0.5340209424083775, + "relative_se_rank": 1.7669587654960388, + "normalized_reciprocal_se_rank": 0.1299587450378004, + "reciprocal_se_rank": 0.040936688734811204, + "percentage_ge_sources_not_in_se_sources": 71.46596858638743, + "percentage_ge_sources_in_se_sources": 28.53403141361256 + }, + { + "model_name": "deepseek-chat-gensee", + "num_sources": 82, + "num_queries": 19, + "num_complete_scores": 76, + "unweighted_mean_score": 4.26378842676311, + "weighted_total_content_score": 81.1168164313222, + "semantic_relevance": 4.243589743589744, + "factual_accuracy": 4.564102564102564, + "freshness": 4.423076923076923, + "objectivity_tone": 4.153846153846154, + "layout_ad_density": 3.9220779220779223, + "accountability": 3.9871794871794872, + "transparency": 4.32051282051282, + "authority": 4.461538461538462, + "avg_ge_freq": 1.0, + "relative_se_rank": 1.9224627653967212, + "normalized_reciprocal_se_rank": 0.12477656633162594, + "reciprocal_se_rank": 0.03969145647289068, + "percentage_ge_sources_not_in_se_sources": 74.390243902439, + "percentage_ge_sources_in_se_sources": 25.609756097560982 + }, + { + "model_name": "deepseek-reasoning-tavily", + "num_sources": 62, + "num_queries": 19, + "num_complete_scores": 58, + "unweighted_mean_score": 4.282327586206897, + "weighted_total_content_score": 80.1018675721562, + "semantic_relevance": 4.189655172413793, + "factual_accuracy": 4.603448275862069, + "freshness": 4.396551724137931, + "objectivity_tone": 4.0344827586206895, + "layout_ad_density": 3.7241379310344827, + "accountability": 4.103448275862069, + "transparency": 4.551724137931035, + "authority": 4.655172413793103, + "avg_ge_freq": 1.0, + "relative_se_rank": 1.4716965265158806, + "normalized_reciprocal_se_rank": 0.2609989378665101, + "reciprocal_se_rank": 0.07242450206015655, + "percentage_ge_sources_not_in_se_sources": 56.45161290322581, + "percentage_ge_sources_in_se_sources": 43.54838709677419 + }, { "model_name": "deepseek-reasoning-gensee", "num_sources": 81, @@ -462,68 +463,22 @@ } ], "by_query_type": [ - { - "model_name": "deepseek-chat-gensee", - "query_type": "DebateQA", - "num_sources": 20, - "num_queries": 4, - "num_complete_scores": 20, - "unweighted_mean_score": 4.5625, - "weighted_total_content_score": 92.05263157894737, - "semantic_relevance": 4.85, - "factual_accuracy": 5.0, - "freshness": 3.95, - "objectivity_tone": 4.6, - "layout_ad_density": 4.4, - "accountability": 4.3, - "transparency": 4.6, - "authority": 4.8, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.4475099206349205, - "normalized_reciprocal_se_rank": 0.13340620098782763, - "reciprocal_se_rank": 0.0417650822762013, - "percentage_ge_sources_not_in_se_sources": 55.0, - "percentage_ge_sources_in_se_sources": 45.0 - }, - { - "model_name": "grok-4.1-fast-non-reasoning", - "query_type": "DebateQA", - "num_sources": 36, - "num_queries": 8, - "num_complete_scores": 36, - "unweighted_mean_score": 4.5, - "weighted_total_content_score": 90.35087719298242, - "semantic_relevance": 4.888888888888889, - "factual_accuracy": 4.722222222222222, - "freshness": 4.333333333333333, - "objectivity_tone": 4.222222222222222, - "layout_ad_density": 4.111111111111111, - "accountability": 4.388888888888889, - "transparency": 4.722222222222222, - "authority": 4.611111111111111, - "avg_ge_freq": 0.7500055555555557, - "relative_se_rank": 0.8543317246023244, - "normalized_reciprocal_se_rank": 0.29328182035967976, - "reciprocal_se_rank": 0.08018179663982594, - "percentage_ge_sources_not_in_se_sources": 30.555555555555557, - "percentage_ge_sources_in_se_sources": 69.44444444444446 - }, { "model_name": "gpt-5", "query_type": "DebateQA", "num_sources": 85, "num_queries": 20, "num_complete_scores": 79, - "unweighted_mean_score": 4.655, - "weighted_total_content_score": 90.19195046439627, - "semantic_relevance": 4.271604938271605, - "factual_accuracy": 4.851851851851852, - "freshness": 4.261904761904762, - "objectivity_tone": 4.765432098765432, - "layout_ad_density": 4.518072289156627, - "accountability": 4.714285714285714, - "transparency": 4.892857142857143, - "authority": 4.928571428571429, + "unweighted_mean_score": 4.651, + "weighted_total_content_score": 90.19195046, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 0.6274458823529414, "relative_se_rank": 2.0396269615741502, "normalized_reciprocal_se_rank": 0.024647681783897292, @@ -532,27 +487,27 @@ "percentage_ge_sources_in_se_sources": 10.588235294117647 }, { - "model_name": "deepseek-reasoning-gensee", - "query_type": "DebateQA", - "num_sources": 20, - "num_queries": 4, - "num_complete_scores": 20, - "unweighted_mean_score": 4.45625, - "weighted_total_content_score": 89.63157894736842, - "semantic_relevance": 4.7, - "factual_accuracy": 4.9, - "freshness": 4.0, - "objectivity_tone": 4.25, - "layout_ad_density": 4.3, - "accountability": 4.35, - "transparency": 4.5, - "authority": 4.65, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.3464484126984126, - "normalized_reciprocal_se_rank": 0.23220261224941802, - "reciprocal_se_rank": 0.06550499663274852, - "percentage_ge_sources_not_in_se_sources": 55.0, - "percentage_ge_sources_in_se_sources": 45.0 + "model_name": "gpt-5", + "query_type": "HotpotQA", + "num_sources": 38, + "num_queries": 19, + "num_complete_scores": 38, + "unweighted_mean_score": 4.414, + "weighted_total_content_score": 88.33795014, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5526184210526318, + "relative_se_rank": 1.6329072044063837, + "normalized_reciprocal_se_rank": 0.24588844317711944, + "reciprocal_se_rank": 0.06879358221974471, + "percentage_ge_sources_not_in_se_sources": 71.05263157894737, + "percentage_ge_sources_in_se_sources": 28.94736842105263 }, { "model_name": "gpt-5", @@ -560,16 +515,16 @@ "num_sources": 55, "num_queries": 20, "num_complete_scores": 55, - "unweighted_mean_score": 4.4340909090909095, - "weighted_total_content_score": 88.99521531100477, - "semantic_relevance": 4.290909090909091, - "factual_accuracy": 4.6909090909090905, - "freshness": 3.7636363636363637, - "objectivity_tone": 4.618181818181818, - "layout_ad_density": 3.9454545454545453, - "accountability": 4.672727272727273, - "transparency": 4.890909090909091, - "authority": 4.6, + "unweighted_mean_score": 4.434, + "weighted_total_content_score": 88.99521531, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 0.6605890909090913, "relative_se_rank": 1.8746770791895104, "normalized_reciprocal_se_rank": 0.16050373868555687, @@ -578,924 +533,73 @@ "percentage_ge_sources_in_se_sources": 25.454545454545453 }, { - "model_name": "deepseek-reasoning-tavily", - "query_type": "DebateQA", - "num_sources": 17, - "num_queries": 4, - "num_complete_scores": 17, - "unweighted_mean_score": 4.397058823529412, - "weighted_total_content_score": 88.42105263157896, - "semantic_relevance": 4.647058823529412, - "factual_accuracy": 4.647058823529412, - "freshness": 3.8823529411764706, - "objectivity_tone": 4.352941176470588, - "layout_ad_density": 4.0, - "accountability": 4.235294117647059, - "transparency": 4.705882352941177, - "authority": 4.705882352941177, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.013410364145658, - "normalized_reciprocal_se_rank": 0.380931631912024, - "reciprocal_se_rank": 0.10124328048371453, - "percentage_ge_sources_not_in_se_sources": 41.1764705882353, - "percentage_ge_sources_in_se_sources": 58.8235294117647 + "model_name": "gpt-5", + "query_type": "QuoraQuestions", + "num_sources": 58, + "num_queries": 14, + "num_complete_scores": 56, + "unweighted_mean_score": 4.491, + "weighted_total_content_score": 87.84029038, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.6379275862068965, + "relative_se_rank": 2.1757824170978175, + "normalized_reciprocal_se_rank": 0.04254727146439185, + "reciprocal_se_rank": 0.019932475424696096, + "percentage_ge_sources_not_in_se_sources": 91.37931034482759, + "percentage_ge_sources_in_se_sources": 8.620689655172415 }, { "model_name": "gpt-5", - "query_type": "HotpotQA", - "num_sources": 38, - "num_queries": 19, - "num_complete_scores": 38, - "unweighted_mean_score": 4.4144736842105265, - "weighted_total_content_score": 88.33795013850413, - "semantic_relevance": 3.6842105263157894, - "factual_accuracy": 4.7894736842105265, - "freshness": 4.842105263157895, - "objectivity_tone": 4.815789473684211, - "layout_ad_density": 4.157894736842105, - "accountability": 3.5, - "transparency": 4.7631578947368425, - "authority": 4.7631578947368425, - "avg_ge_freq": 0.5526184210526318, - "relative_se_rank": 1.6329072044063837, - "normalized_reciprocal_se_rank": 0.24588844317711944, - "reciprocal_se_rank": 0.06879358221974471, - "percentage_ge_sources_not_in_se_sources": 71.05263157894737, - "percentage_ge_sources_in_se_sources": 28.94736842105263 - }, - { - "model_name": "gpt-5", - "query_type": "QuoraQuestions", - "num_sources": 58, - "num_queries": 14, - "num_complete_scores": 56, - "unweighted_mean_score": 4.478879310344827, - "weighted_total_content_score": 87.84029038112523, - "semantic_relevance": 3.508771929824561, - "factual_accuracy": 4.719298245614035, - "freshness": 4.684210526315789, - "objectivity_tone": 4.684210526315789, - "layout_ad_density": 4.172413793103448, - "accountability": 4.473684210526316, - "transparency": 4.842105263157895, - "authority": 4.842105263157895, - "avg_ge_freq": 0.6379275862068965, - "relative_se_rank": 2.1757824170978175, - "normalized_reciprocal_se_rank": 0.04254727146439185, - "reciprocal_se_rank": 0.019932475424696096, - "percentage_ge_sources_not_in_se_sources": 91.37931034482759, - "percentage_ge_sources_in_se_sources": 8.620689655172415 - }, - { - "model_name": "gpt-4o", - "query_type": "Pinocchios", - "num_sources": 40, - "num_queries": 18, - "num_complete_scores": 40, - "unweighted_mean_score": 4.29375, - "weighted_total_content_score": 86.60526315789473, - "semantic_relevance": 4.625, - "factual_accuracy": 4.75, - "freshness": 3.4, - "objectivity_tone": 4.2, - "layout_ad_density": 3.875, - "accountability": 4.275, - "transparency": 4.55, - "authority": 4.675, - "avg_ge_freq": 0.44164499999999995, - "relative_se_rank": 1.5765700005314691, - "normalized_reciprocal_se_rank": 0.28054167213258124, - "reciprocal_se_rank": 0.07712045034253773, - "percentage_ge_sources_not_in_se_sources": 62.5, - "percentage_ge_sources_in_se_sources": 37.5 - }, - { - "model_name": "exa", - "query_type": "DebateQA", - "num_sources": 89, - "num_queries": 20, - "num_complete_scores": 87, - "unweighted_mean_score": 4.3292536115569815, - "weighted_total_content_score": 86.44589000591367, - "semantic_relevance": 4.49438202247191, - "factual_accuracy": 4.426966292134831, - "freshness": 4.415730337078652, - "objectivity_tone": 4.146067415730337, - "layout_ad_density": 3.7701149425287355, - "accountability": 4.415730337078652, - "transparency": 4.426966292134831, - "authority": 4.51685393258427, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.010284064126628, - "normalized_reciprocal_se_rank": 0.28063309301928224, - "reciprocal_se_rank": 0.07714241798278879, - "percentage_ge_sources_not_in_se_sources": 38.20224719101124, - "percentage_ge_sources_in_se_sources": 61.79775280898876 - }, - { - "model_name": "grok-4.1-fast-non-reasoning", - "query_type": "Pinocchios", - "num_sources": 67, - "num_queries": 18, - "num_complete_scores": 67, - "unweighted_mean_score": 4.270522388059701, - "weighted_total_content_score": 86.00157109190886, - "semantic_relevance": 4.462686567164179, - "factual_accuracy": 4.731343283582089, - "freshness": 3.1194029850746268, - "objectivity_tone": 4.17910447761194, - "layout_ad_density": 4.0, - "accountability": 4.402985074626866, - "transparency": 4.611940298507463, - "authority": 4.656716417910448, - "avg_ge_freq": 0.7860761194029849, - "relative_se_rank": 1.7392122002022354, - "normalized_reciprocal_se_rank": 0.2045813768315125, - "reciprocal_se_rank": 0.058867855112426565, - "percentage_ge_sources_not_in_se_sources": 67.16417910447761, - "percentage_ge_sources_in_se_sources": 32.83582089552239 - }, - { - "model_name": "gensee", - "query_type": "Pinocchios", - "num_sources": 77, - "num_queries": 20, - "num_complete_scores": 75, - "unweighted_mean_score": 4.286873840445268, - "weighted_total_content_score": 85.94668489405333, - "semantic_relevance": 4.402597402597403, - "factual_accuracy": 4.753246753246753, - "freshness": 3.857142857142857, - "objectivity_tone": 4.1558441558441555, - "layout_ad_density": 3.8133333333333335, - "accountability": 4.298701298701299, - "transparency": 4.4935064935064934, - "authority": 4.4935064935064934, - "avg_ge_freq": 0.4934883116883122, - "relative_se_rank": 1.7999418903256372, - "normalized_reciprocal_se_rank": 0.19747215967140244, - "reciprocal_se_rank": 0.05715957234822537, - "percentage_ge_sources_not_in_se_sources": 70.12987012987011, - "percentage_ge_sources_in_se_sources": 29.870129870129865 - }, - { - "model_name": "claude", - "query_type": "Pinocchios", - "num_sources": 39, - "num_queries": 20, - "num_complete_scores": 39, - "unweighted_mean_score": 4.262820512820513, - "weighted_total_content_score": 85.56005398110662, - "semantic_relevance": 4.435897435897436, - "factual_accuracy": 4.435897435897436, - "freshness": 3.4358974358974357, - "objectivity_tone": 4.205128205128205, - "layout_ad_density": 3.8205128205128207, - "accountability": 4.666666666666667, - "transparency": 4.6923076923076925, - "authority": 4.410256410256411, - "avg_ge_freq": 0.8205102564102565, - "relative_se_rank": 1.4724542616408207, - "normalized_reciprocal_se_rank": 0.28256007847697834, - "reciprocal_se_rank": 0.07760545575053605, - "percentage_ge_sources_not_in_se_sources": 53.84615384615383, - "percentage_ge_sources_in_se_sources": 46.153846153846175 - }, - { - "model_name": "gpt-5", - "query_type": "VACOS", + "query_type": "VA-COS NLQ", "num_sources": 80, "num_queries": 20, "num_complete_scores": 80, - "unweighted_mean_score": 4.2953125, - "weighted_total_content_score": 85.55263157894731, - "semantic_relevance": 3.725, - "factual_accuracy": 4.775, - "freshness": 4.925, - "objectivity_tone": 4.05, - "layout_ad_density": 3.3625, - "accountability": 4.3875, - "transparency": 4.6, - "authority": 4.5375, - "avg_ge_freq": 0.8166749999999997, - "relative_se_rank": 2.327785098166267, - "normalized_reciprocal_se_rank": 0.0098989898989899, - "reciprocal_se_rank": 0.012087378640776695, - "percentage_ge_sources_not_in_se_sources": 98.75, - "percentage_ge_sources_in_se_sources": 1.2499999999999998 - }, - { - "model_name": "gensee", - "query_type": "DebateQA", - "num_sources": 89, - "num_queries": 20, - "num_complete_scores": 87, - "unweighted_mean_score": 4.282303370786517, - "weighted_total_content_score": 85.36960378474275, - "semantic_relevance": 4.7727272727272725, - "factual_accuracy": 4.5, - "freshness": 4.393258426966292, - "objectivity_tone": 4.0, - "layout_ad_density": 3.7126436781609193, - "accountability": 4.247191011235955, - "transparency": 4.224719101123595, - "authority": 4.370786516853933, - "avg_ge_freq": 0.5730269662921349, - "relative_se_rank": 1.455758534903284, - "normalized_reciprocal_se_rank": 0.17012042957025347, - "reciprocal_se_rank": 0.05058719060061921, - "percentage_ge_sources_not_in_se_sources": 58.42696629213483, - "percentage_ge_sources_in_se_sources": 41.57303370786517 - }, - { - "model_name": "google-search", - "query_type": "DebateQA", - "num_sources": 89, - "num_queries": 20, - "num_complete_scores": 89, - "unweighted_mean_score": 4.262640449438202, - "weighted_total_content_score": 85.2749852158486, - "semantic_relevance": 4.595505617977528, - "factual_accuracy": 4.415730337078652, - "freshness": 4.292134831460674, - "objectivity_tone": 3.797752808988764, - "layout_ad_density": 4.089887640449438, - "accountability": 4.067415730337078, - "transparency": 4.449438202247191, - "authority": 4.393258426966292, - "avg_ge_freq": null, - "relative_se_rank": null, - "normalized_reciprocal_se_rank": null, - "reciprocal_se_rank": null, - "percentage_ge_sources_not_in_se_sources": null, - "percentage_ge_sources_in_se_sources": null - }, - { - "model_name": "grok-4.1-fast-non-reasoning", - "query_type": "VACOS", - "num_sources": 84, - "num_queries": 19, - "num_complete_scores": 84, - "unweighted_mean_score": 4.205357142857143, - "weighted_total_content_score": 84.48621553884713, - "semantic_relevance": 4.392857142857143, - "factual_accuracy": 4.738095238095238, - "freshness": 4.845238095238095, - "objectivity_tone": 3.8452380952380953, - "layout_ad_density": 3.2261904761904763, - "accountability": 4.011904761904762, - "transparency": 4.226190476190476, - "authority": 4.357142857142857, - "avg_ge_freq": 0.6825476190476188, - "relative_se_rank": 2.0432068047879337, - "normalized_reciprocal_se_rank": 0.0649730073602254, - "reciprocal_se_rank": 0.025321183807432805, - "percentage_ge_sources_not_in_se_sources": 84.52380952380952, - "percentage_ge_sources_in_se_sources": 15.476190476190476 - }, - { - "model_name": "deepseek-chat-gensee", - "query_type": "Pinocchios", - "num_sources": 16, - "num_queries": 4, - "num_complete_scores": 15, - "unweighted_mean_score": 4.458333333333333, - "weighted_total_content_score": 83.75, - "semantic_relevance": 4.333333333333333, - "factual_accuracy": 4.666666666666667, - "freshness": 4.066666666666666, - "objectivity_tone": 4.533333333333333, - "layout_ad_density": 4.4, - "accountability": 4.4, - "transparency": 4.533333333333333, - "authority": 4.733333333333333, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.8245541272980292, - "normalized_reciprocal_se_rank": 0.17878186628186626, - "reciprocal_se_rank": 0.052668458159963016, - "percentage_ge_sources_not_in_se_sources": 74.99999999999999, - "percentage_ge_sources_in_se_sources": 24.999999999999993 - }, - { - "model_name": "exa", - "query_type": "Pinocchios", - "num_sources": 87, - "num_queries": 20, - "num_complete_scores": 86, - "unweighted_mean_score": 4.200123152709359, - "weighted_total_content_score": 83.5934664246824, - "semantic_relevance": 3.7011494252873565, - "factual_accuracy": 4.264367816091954, - "freshness": 3.7126436781609193, - "objectivity_tone": 4.344827586206897, - "layout_ad_density": 3.6627906976744184, - "accountability": 4.632183908045977, - "transparency": 4.689655172413793, - "authority": 4.586206896551724, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.80952333519013, - "normalized_reciprocal_se_rank": 0.2313966587355651, - "reciprocal_se_rank": 0.06531133304568201, - "percentage_ge_sources_not_in_se_sources": 62.06896551724138, - "percentage_ge_sources_in_se_sources": 37.93103448275862 - }, - { - "model_name": "tavily", - "query_type": "DebateQA", - "num_sources": 76, - "num_queries": 20, - "num_complete_scores": 76, - "unweighted_mean_score": 4.184210526315789, - "weighted_total_content_score": 83.55955678670357, - "semantic_relevance": 4.223684210526316, - "factual_accuracy": 4.315789473684211, - "freshness": 4.473684210526316, - "objectivity_tone": 3.8947368421052633, - "layout_ad_density": 3.776315789473684, - "accountability": 4.315789473684211, - "transparency": 4.157894736842105, - "authority": 4.315789473684211, - "avg_ge_freq": 1.0, - "relative_se_rank": 0.9508661126222719, - "normalized_reciprocal_se_rank": 0.3434220529106368, - "reciprocal_se_rank": 0.09223005640328419, - "percentage_ge_sources_not_in_se_sources": 36.8421052631579, - "percentage_ge_sources_in_se_sources": 63.1578947368421 - }, - { - "model_name": "google-search", - "query_type": "Pinocchios", - "num_sources": 93, - "num_queries": 20, - "num_complete_scores": 91, - "unweighted_mean_score": 4.177611367127496, - "weighted_total_content_score": 83.53140916808147, - "semantic_relevance": 4.161290322580645, - "factual_accuracy": 4.580645161290323, - "freshness": 3.3225806451612905, - "objectivity_tone": 3.967741935483871, - "layout_ad_density": 4.0, - "accountability": 4.268817204301075, - "transparency": 4.483870967741935, - "authority": 4.623655913978495, - "avg_ge_freq": null, - "relative_se_rank": null, - "normalized_reciprocal_se_rank": null, - "reciprocal_se_rank": null, - "percentage_ge_sources_not_in_se_sources": null, - "percentage_ge_sources_in_se_sources": null - }, - { - "model_name": "deepseek-chat-tavily", - "query_type": "DebateQA", - "num_sources": 20, - "num_queries": 4, - "num_complete_scores": 20, - "unweighted_mean_score": 4.125, - "weighted_total_content_score": 82.84210526315789, - "semantic_relevance": 4.5, - "factual_accuracy": 4.3, - "freshness": 3.85, - "objectivity_tone": 3.9, - "layout_ad_density": 3.6, - "accountability": 4.05, - "transparency": 4.5, - "authority": 4.3, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.1437896825396825, - "normalized_reciprocal_se_rank": 0.2723387312276201, - "reciprocal_se_rank": 0.07514935531925823, - "percentage_ge_sources_not_in_se_sources": 45.0, - "percentage_ge_sources_in_se_sources": 55.0 - }, - { - "model_name": "gpt-4o", - "query_type": "DebateQA", - "num_sources": 78, - "num_queries": 18, - "num_complete_scores": 78, - "unweighted_mean_score": 4.1201923076923075, - "weighted_total_content_score": 82.60458839406209, - "semantic_relevance": 4.487179487179487, - "factual_accuracy": 4.102564102564102, - "freshness": 4.564102564102564, - "objectivity_tone": 3.9615384615384617, - "layout_ad_density": 3.7051282051282053, - "accountability": 4.0, - "transparency": 4.089743589743589, - "authority": 4.051282051282051, - "avg_ge_freq": 0.4999897435897439, - "relative_se_rank": 1.7141289179822279, - "normalized_reciprocal_se_rank": 0.12281919725976252, - "reciprocal_se_rank": 0.03922111778814682, - "percentage_ge_sources_not_in_se_sources": 71.7948717948718, - "percentage_ge_sources_in_se_sources": 28.205128205128204 - }, - { - "model_name": "Perplexity-Sonar-Pro", - "query_type": "DebateQA", - "num_sources": 82, - "num_queries": 20, - "num_complete_scores": 79, - "unweighted_mean_score": 4.237874779541446, - "weighted_total_content_score": 82.51604621309367, - "semantic_relevance": 4.1875, - "factual_accuracy": 4.325, - "freshness": 4.320987654320987, - "objectivity_tone": 3.6375, - "layout_ad_density": 4.1375, - "accountability": 4.246913580246914, - "transparency": 4.518518518518518, - "authority": 4.506172839506172, - "avg_ge_freq": 0.7804853658536585, - "relative_se_rank": 1.185618672325219, - "normalized_reciprocal_se_rank": 0.2274442114543135, - "reciprocal_se_rank": 0.0643615944999443, - "percentage_ge_sources_not_in_se_sources": 46.34146341463415, - "percentage_ge_sources_in_se_sources": 53.65853658536585 - }, - { - "model_name": "tavily", - "query_type": "Pinocchios", - "num_sources": 81, - "num_queries": 20, - "num_complete_scores": 79, - "unweighted_mean_score": 4.188712522045855, - "weighted_total_content_score": 82.27420402859, - "semantic_relevance": 3.55, - "factual_accuracy": 4.325, - "freshness": 4.075, - "objectivity_tone": 4.1875, - "layout_ad_density": 3.8, - "accountability": 4.5125, - "transparency": 4.5625, - "authority": 4.525, - "avg_ge_freq": 0.9958851851851852, - "relative_se_rank": 1.4422121160476211, - "normalized_reciprocal_se_rank": 0.3128941728047197, - "reciprocal_se_rank": 0.08489447356229925, - "percentage_ge_sources_not_in_se_sources": 45.67901234567901, - "percentage_ge_sources_in_se_sources": 54.32098765432099 - }, - { - "model_name": "Gemini-2.5-Flash-Preview", - "query_type": "DebateQA", - "num_sources": 100, - "num_queries": 20, - "num_complete_scores": 100, - "unweighted_mean_score": 4.1325, - "weighted_total_content_score": 82.12631578947368, - "semantic_relevance": 3.92, - "factual_accuracy": 4.0, - "freshness": 4.54, - "objectivity_tone": 3.98, - "layout_ad_density": 3.7, - "accountability": 4.19, - "transparency": 4.4, - "authority": 4.33, - "avg_ge_freq": null, - "relative_se_rank": null, - "normalized_reciprocal_se_rank": null, - "reciprocal_se_rank": null, - "percentage_ge_sources_not_in_se_sources": null, - "percentage_ge_sources_in_se_sources": null - }, - { - "model_name": "exa", - "query_type": "VACOS", - "num_sources": 81, - "num_queries": 20, - "num_complete_scores": 80, - "unweighted_mean_score": 4.138888888888889, - "weighted_total_content_score": 81.97530864197529, - "semantic_relevance": 3.6875, - "factual_accuracy": 4.625, - "freshness": 4.91358024691358, - "objectivity_tone": 3.8625, - "layout_ad_density": 2.950617283950617, - "accountability": 4.08641975308642, - "transparency": 4.54320987654321, - "authority": 4.407407407407407, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.003012313759094, - "normalized_reciprocal_se_rank": 0.07692643713869617, - "reciprocal_se_rank": 0.028193488535754666, - "percentage_ge_sources_not_in_se_sources": 82.71604938271606, - "percentage_ge_sources_in_se_sources": 17.28395061728395 - }, - { - "model_name": "Gemini-3-Pro-Preview", - "query_type": "Pinocchios", - "num_sources": 86, - "num_queries": 20, - "num_complete_scores": 84, - "unweighted_mean_score": 4.162873754152824, - "weighted_total_content_score": 81.87270501835988, - "semantic_relevance": 3.5294117647058822, - "factual_accuracy": 4.117647058823529, - "freshness": 4.523255813953488, - "objectivity_tone": 4.08235294117647, - "layout_ad_density": 3.764705882352941, - "accountability": 4.465116279069767, - "transparency": 4.534883720930233, - "authority": 4.255813953488372, - "avg_ge_freq": 0.5736325581395352, - "relative_se_rank": 1.941468415075042, - "normalized_reciprocal_se_rank": 0.15368587669053982, - "reciprocal_se_rank": 0.04663811114651323, - "percentage_ge_sources_not_in_se_sources": 73.25581395348837, - "percentage_ge_sources_in_se_sources": 26.74418604651163 - }, - { - "model_name": "deepseek-reasoning-tavily", - "query_type": "Pinocchios", - "num_sources": 10, - "num_queries": 4, - "num_complete_scores": 9, - "unweighted_mean_score": 4.513888888888889, - "weighted_total_content_score": 81.57894736842105, - "semantic_relevance": 4.555555555555555, - "factual_accuracy": 4.777777777777778, - "freshness": 3.5555555555555554, - "objectivity_tone": 4.555555555555555, - "layout_ad_density": 4.555555555555555, - "accountability": 4.555555555555555, - "transparency": 4.777777777777778, - "authority": 4.777777777777778, - "avg_ge_freq": 1.0, - "relative_se_rank": 0.8599648428916721, - "normalized_reciprocal_se_rank": 0.4548140553403711, - "reciprocal_se_rank": 0.11899658125897268, - "percentage_ge_sources_not_in_se_sources": 30.0, - "percentage_ge_sources_in_se_sources": 70.0 - }, - { - "model_name": "Gemini-3-Pro-Preview", - "query_type": "DebateQA", - "num_sources": 100, - "num_queries": 20, - "num_complete_scores": 96, - "unweighted_mean_score": 4.178210678210679, - "weighted_total_content_score": 81.42105263157897, - "semantic_relevance": 4.214285714285714, - "factual_accuracy": 4.255102040816326, - "freshness": 4.73469387755102, - "objectivity_tone": 3.683673469387755, - "layout_ad_density": 3.9285714285714284, - "accountability": 3.979591836734694, - "transparency": 4.357142857142857, - "authority": 4.23469387755102, - "avg_ge_freq": 0.4833250000000001, - "relative_se_rank": 1.7292861678201157, - "normalized_reciprocal_se_rank": 0.10523315112286376, - "reciprocal_se_rank": 0.03499534456593085, - "percentage_ge_sources_not_in_se_sources": 72.0, - "percentage_ge_sources_in_se_sources": 28.0 - }, - { - "model_name": "Gemini-3-Pro-Preview", - "query_type": "VACOS", - "num_sources": 94, - "num_queries": 20, - "num_complete_scores": 94, - "unweighted_mean_score": 4.059840425531915, - "weighted_total_content_score": 81.35498320268756, - "semantic_relevance": 3.904255319148936, - "factual_accuracy": 4.617021276595745, - "freshness": 4.829787234042553, - "objectivity_tone": 3.8085106382978724, - "layout_ad_density": 3.1702127659574466, - "accountability": 3.734042553191489, - "transparency": 4.212765957446808, - "authority": 4.202127659574468, - "avg_ge_freq": 0.4148723404255325, - "relative_se_rank": 2.3167240998153185, - "normalized_reciprocal_se_rank": 0.007184894289987778, - "reciprocal_se_rank": 0.01143520518133201, - "percentage_ge_sources_not_in_se_sources": 96.80851063829788, - "percentage_ge_sources_in_se_sources": 3.1914893617021276 - }, - { - "model_name": "Gemini-3-Flash-Preview", - "query_type": "DebateQA", - "num_sources": 100, - "num_queries": 20, - "num_complete_scores": 95, - "unweighted_mean_score": 4.1310714285714285, - "weighted_total_content_score": 81.05263157894737, - "semantic_relevance": 4.11340206185567, - "factual_accuracy": 4.195876288659794, - "freshness": 4.5353535353535355, - "objectivity_tone": 3.8556701030927836, - "layout_ad_density": 3.8181818181818183, - "accountability": 4.181818181818182, - "transparency": 4.303030303030303, - "authority": 4.171717171717172, - "avg_ge_freq": 0.4899890000000004, - "relative_se_rank": 1.6757336210963478, - "normalized_reciprocal_se_rank": 0.12549266202008144, - "reciprocal_se_rank": 0.039863528009679766, - "percentage_ge_sources_not_in_se_sources": 71.0, - "percentage_ge_sources_in_se_sources": 29.0 - }, - { - "model_name": "Gemini-2.5-Flash-Preview", - "query_type": "VACOS", - "num_sources": 91, - "num_queries": 20, - "num_complete_scores": 91, - "unweighted_mean_score": 4.0467032967032965, - "weighted_total_content_score": 81.01792943898207, - "semantic_relevance": 3.923076923076923, - "factual_accuracy": 4.428571428571429, - "freshness": 4.934065934065934, - "objectivity_tone": 3.868131868131868, - "layout_ad_density": 2.8131868131868134, - "accountability": 4.208791208791209, - "transparency": 4.131868131868132, - "authority": 4.065934065934066, - "avg_ge_freq": null, - "relative_se_rank": null, - "normalized_reciprocal_se_rank": null, - "reciprocal_se_rank": null, - "percentage_ge_sources_not_in_se_sources": null, - "percentage_ge_sources_in_se_sources": null - }, - { - "model_name": "Perplexity-Sonar-Pro", - "query_type": "Pinocchios", - "num_sources": 72, - "num_queries": 19, - "num_complete_scores": 72, - "unweighted_mean_score": 4.086805555555555, - "weighted_total_content_score": 80.99415204678363, - "semantic_relevance": 3.5972222222222223, - "factual_accuracy": 4.236111111111111, - "freshness": 3.986111111111111, - "objectivity_tone": 3.7222222222222223, - "layout_ad_density": 4.069444444444445, - "accountability": 4.111111111111111, - "transparency": 4.5, - "authority": 4.472222222222222, - "avg_ge_freq": 0.8148138888888888, - "relative_se_rank": 1.9321450595878684, - "normalized_reciprocal_se_rank": 0.20414708640646176, - "reciprocal_se_rank": 0.05876349891805757, - "percentage_ge_sources_not_in_se_sources": 62.5, - "percentage_ge_sources_in_se_sources": 37.5 - }, - { - "model_name": "gpt-4o", - "query_type": "VACOS", - "num_sources": 81, - "num_queries": 19, - "num_complete_scores": 81, - "unweighted_mean_score": 4.032407407407407, - "weighted_total_content_score": 80.88369070825209, - "semantic_relevance": 4.172839506172839, - "factual_accuracy": 4.296296296296297, - "freshness": 4.9753086419753085, - "objectivity_tone": 3.8518518518518516, - "layout_ad_density": 2.740740740740741, - "accountability": 3.8518518518518516, - "transparency": 4.098765432098766, - "authority": 4.271604938271605, - "avg_ge_freq": 0.49793086419753113, - "relative_se_rank": 2.029381072384311, - "normalized_reciprocal_se_rank": 0.07521622430371759, - "reciprocal_se_rank": 0.027782539335116603, - "percentage_ge_sources_not_in_se_sources": 83.95061728395062, - "percentage_ge_sources_in_se_sources": 16.049382716049383 - }, - { - "model_name": "Gemini-3-Flash-Preview", - "query_type": "Pinocchios", - "num_sources": 89, - "num_queries": 20, - "num_complete_scores": 88, - "unweighted_mean_score": 4.073863636363637, - "weighted_total_content_score": 80.40212891780013, - "semantic_relevance": 3.7386363636363638, - "factual_accuracy": 4.204545454545454, - "freshness": 3.8863636363636362, - "objectivity_tone": 4.125, - "layout_ad_density": 3.7386363636363638, - "accountability": 4.363636363636363, - "transparency": 4.375, - "authority": 4.159090909090909, - "avg_ge_freq": 0.5842617977528091, - "relative_se_rank": 2.0481831598330293, - "normalized_reciprocal_se_rank": 0.14693960626866004, - "reciprocal_se_rank": 0.04501704131212949, - "percentage_ge_sources_not_in_se_sources": 73.03370786516854, - "percentage_ge_sources_in_se_sources": 26.96629213483146 - }, - { - "model_name": "claude", - "query_type": "VACOS", - "num_sources": 81, - "num_queries": 19, - "num_complete_scores": 81, - "unweighted_mean_score": 3.9891975308641974, - "weighted_total_content_score": 80.10396361273551, - "semantic_relevance": 4.2592592592592595, - "factual_accuracy": 4.345679012345679, - "freshness": 4.938271604938271, - "objectivity_tone": 3.6666666666666665, - "layout_ad_density": 2.740740740740741, - "accountability": 4.197530864197531, - "transparency": 3.9012345679012346, - "authority": 3.8641975308641974, - "avg_ge_freq": 0.7819024691358022, - "relative_se_rank": 2.0549062284325044, - "normalized_reciprocal_se_rank": 0.056933641949831956, - "reciprocal_se_rank": 0.02338939454619748, - "percentage_ge_sources_not_in_se_sources": 83.95061728395062, - "percentage_ge_sources_in_se_sources": 16.049382716049383 - }, - { - "model_name": "gensee", - "query_type": "VACOS", - "num_sources": 88, - "num_queries": 19, - "num_complete_scores": 88, - "unweighted_mean_score": 3.9332386363636362, - "weighted_total_content_score": 79.43779904306221, - "semantic_relevance": 4.534090909090909, - "factual_accuracy": 4.363636363636363, - "freshness": 4.818181818181818, - "objectivity_tone": 3.6363636363636362, - "layout_ad_density": 2.4886363636363638, - "accountability": 4.011363636363637, - "transparency": 3.7954545454545454, - "authority": 3.8181818181818183, - "avg_ge_freq": 0.5340818181818183, - "relative_se_rank": 2.115456142814551, - "normalized_reciprocal_se_rank": 0.07056832757590334, - "reciprocal_se_rank": 0.02666569036411269, - "percentage_ge_sources_not_in_se_sources": 87.5, - "percentage_ge_sources_in_se_sources": 12.5 - }, - { - "model_name": "claude", - "query_type": "DebateQA", - "num_sources": 70, - "num_queries": 20, - "num_complete_scores": 65, - "unweighted_mean_score": 4.280769230769231, - "weighted_total_content_score": 79.39849624060146, - "semantic_relevance": 4.3076923076923075, - "factual_accuracy": 4.323076923076923, - "freshness": 4.430769230769231, - "objectivity_tone": 4.107692307692307, - "layout_ad_density": 3.9384615384615387, - "accountability": 4.323076923076923, - "transparency": 4.384615384615385, - "authority": 4.430769230769231, - "avg_ge_freq": 0.8523828571428572, - "relative_se_rank": 1.3421362086210757, - "normalized_reciprocal_se_rank": 0.21041652104583275, - "reciprocal_se_rank": 0.06026998928043071, - "percentage_ge_sources_not_in_se_sources": 54.28571428571426, - "percentage_ge_sources_in_se_sources": 45.71428571428574 - }, - { - "model_name": "Perplexity-Sonar-Pro", - "query_type": "VACOS", - "num_sources": 64, - "num_queries": 20, - "num_complete_scores": 62, - "unweighted_mean_score": 4.046428571428572, - "weighted_total_content_score": 79.39144736842107, - "semantic_relevance": 4.193548387096774, - "factual_accuracy": 4.5, - "freshness": 4.984126984126984, - "objectivity_tone": 3.7419354838709675, - "layout_ad_density": 3.142857142857143, - "accountability": 3.619047619047619, - "transparency": 4.095238095238095, - "authority": 4.111111111111111, - "avg_ge_freq": 0.8385406249999999, - "relative_se_rank": 2.067510788296202, - "normalized_reciprocal_se_rank": 0.051796852838519515, - "reciprocal_se_rank": 0.02215506900731415, - "percentage_ge_sources_not_in_se_sources": 85.93750000000001, - "percentage_ge_sources_in_se_sources": 14.0625 - }, - { - "model_name": "gpt-4o", - "query_type": "QuoraQuestions", - "num_sources": 76, - "num_queries": 19, - "num_complete_scores": 76, - "unweighted_mean_score": 3.9654605263157894, - "weighted_total_content_score": 79.21052631578947, - "semantic_relevance": 4.065789473684211, - "factual_accuracy": 4.0394736842105265, - "freshness": 4.631578947368421, - "objectivity_tone": 3.6973684210526314, - "layout_ad_density": 3.210526315789474, - "accountability": 4.0, - "transparency": 4.118421052631579, - "authority": 3.960526315789474, - "avg_ge_freq": 0.4298052631578951, - "relative_se_rank": 1.7644798178150203, - "normalized_reciprocal_se_rank": 0.0996490754594708, - "reciprocal_se_rank": 0.03365353997691167, - "percentage_ge_sources_not_in_se_sources": 71.05263157894738, - "percentage_ge_sources_in_se_sources": 28.947368421052637 - }, - { - "model_name": "claude", - "query_type": "HotpotQA", - "num_sources": 10, - "num_queries": 7, - "num_complete_scores": 10, - "unweighted_mean_score": 3.925, - "weighted_total_content_score": 79.15789473684211, - "semantic_relevance": 3.7, - "factual_accuracy": 4.1, - "freshness": 4.1, - "objectivity_tone": 4.6, - "layout_ad_density": 3.6, - "accountability": 3.5, - "transparency": 3.9, - "authority": 3.9, - "avg_ge_freq": 0.8333400000000001, - "relative_se_rank": 1.8379612104849017, - "normalized_reciprocal_se_rank": 0.1327922077922078, - "reciprocal_se_rank": 0.04161754507628294, - "percentage_ge_sources_not_in_se_sources": 70.00000000000001, - "percentage_ge_sources_in_se_sources": 30.0 - }, - { - "model_name": "grok-4.1-fast-non-reasoning", - "query_type": "QuoraQuestions", - "num_sources": 58, - "num_queries": 15, - "num_complete_scores": 57, - "unweighted_mean_score": 4.0, - "weighted_total_content_score": 78.82032667876587, - "semantic_relevance": 4.333333333333333, - "factual_accuracy": 4.175438596491228, - "freshness": 4.43859649122807, - "objectivity_tone": 3.6842105263157894, - "layout_ad_density": 3.245614035087719, - "accountability": 3.8947368421052633, - "transparency": 4.157894736842105, - "authority": 4.0701754385964914, - "avg_ge_freq": 0.8563258620689654, - "relative_se_rank": 1.5173397953190046, - "normalized_reciprocal_se_rank": 0.15419400685217452, - "reciprocal_se_rank": 0.04676021038438175, - "percentage_ge_sources_not_in_se_sources": 58.62068965517241, - "percentage_ge_sources_in_se_sources": 41.37931034482759 - }, - { - "model_name": "tavily", - "query_type": "VACOS", - "num_sources": 83, - "num_queries": 20, - "num_complete_scores": 83, - "unweighted_mean_score": 3.9548192771084336, - "weighted_total_content_score": 78.80786303107162, - "semantic_relevance": 3.7590361445783134, - "factual_accuracy": 4.313253012048193, - "freshness": 4.9156626506024095, - "objectivity_tone": 3.5180722891566263, - "layout_ad_density": 2.6144578313253013, - "accountability": 4.0602409638554215, - "transparency": 4.144578313253012, - "authority": 4.313253012048193, - "avg_ge_freq": 0.9397590361445783, - "relative_se_rank": 1.7133062132347119, - "normalized_reciprocal_se_rank": 0.1335959024960005, - "reciprocal_se_rank": 0.04181066589102925, - "percentage_ge_sources_not_in_se_sources": 68.67469879518072, - "percentage_ge_sources_in_se_sources": 31.325301204819276 + "unweighted_mean_score": 4.295, + "weighted_total_content_score": 85.55263158, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8166749999999997, + "relative_se_rank": 2.327785098166267, + "normalized_reciprocal_se_rank": 0.0098989898989899, + "reciprocal_se_rank": 0.012087378640776695, + "percentage_ge_sources_not_in_se_sources": 98.75, + "percentage_ge_sources_in_se_sources": 1.2499999999999998 }, { - "model_name": "Gemini-3-Flash-Preview", - "query_type": "VACOS", - "num_sources": 92, - "num_queries": 20, - "num_complete_scores": 88, - "unweighted_mean_score": 4.015489130434783, - "weighted_total_content_score": 78.72997711670477, - "semantic_relevance": 3.741573033707865, - "factual_accuracy": 4.393258426966292, - "freshness": 4.802197802197802, - "objectivity_tone": 3.853932584269663, - "layout_ad_density": 2.760869565217391, - "accountability": 4.373626373626373, - "transparency": 4.164835164835165, - "authority": 4.087912087912088, - "avg_ge_freq": 0.45288152173913093, - "relative_se_rank": 2.168925621074675, - "normalized_reciprocal_se_rank": 0.032034724656595535, - "reciprocal_se_rank": 0.017406402283987762, - "percentage_ge_sources_not_in_se_sources": 89.1304347826087, - "percentage_ge_sources_in_se_sources": 10.869565217391303 + "model_name": "gpt-4o", + "query_type": "DebateQA", + "num_sources": 78, + "num_queries": 18, + "num_complete_scores": 78, + "unweighted_mean_score": 4.12, + "weighted_total_content_score": 82.60458839, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.4999897435897439, + "relative_se_rank": 1.7141289179822279, + "normalized_reciprocal_se_rank": 0.12281919725976252, + "reciprocal_se_rank": 0.03922111778814682, + "percentage_ge_sources_not_in_se_sources": 71.7948717948718, + "percentage_ge_sources_in_se_sources": 28.205128205128204 }, { "model_name": "gpt-4o", @@ -1503,16 +607,16 @@ "num_sources": 19, "num_queries": 14, "num_complete_scores": 19, - "unweighted_mean_score": 3.9210526315789473, - "weighted_total_content_score": 78.28254847645428, - "semantic_relevance": 3.4210526315789473, - "factual_accuracy": 3.789473684210526, - "freshness": 4.368421052631579, - "objectivity_tone": 4.421052631578948, - "layout_ad_density": 3.8421052631578947, - "accountability": 3.473684210526316, - "transparency": 3.8947368421052633, - "authority": 4.157894736842105, + "unweighted_mean_score": 3.921, + "weighted_total_content_score": 78.28254848, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 0.3683947368421053, "relative_se_rank": 2.1738400706504932, "normalized_reciprocal_se_rank": 0.08335991493886231, @@ -1521,113 +625,136 @@ "percentage_ge_sources_in_se_sources": 10.526315789473685 }, { - "model_name": "deepseek-chat-tavily", + "model_name": "gpt-4o", "query_type": "Pinocchios", - "num_sources": 14, - "num_queries": 4, - "num_complete_scores": 14, - "unweighted_mean_score": 3.9375, - "weighted_total_content_score": 78.27067669172932, - "semantic_relevance": 3.642857142857143, - "factual_accuracy": 3.9285714285714284, - "freshness": 3.5714285714285716, - "objectivity_tone": 3.7857142857142856, - "layout_ad_density": 3.9285714285714284, - "accountability": 4.142857142857143, - "transparency": 4.142857142857143, - "authority": 4.357142857142857, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.668905839637547, - "normalized_reciprocal_se_rank": 0.19718958290386862, - "reciprocal_se_rank": 0.057091671620104346, - "percentage_ge_sources_not_in_se_sources": 64.28571428571429, - "percentage_ge_sources_in_se_sources": 35.71428571428571 + "num_sources": 40, + "num_queries": 18, + "num_complete_scores": 40, + "unweighted_mean_score": 4.294, + "weighted_total_content_score": 86.60526316, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.44164499999999995, + "relative_se_rank": 1.5765700005314691, + "normalized_reciprocal_se_rank": 0.28054167213258124, + "reciprocal_se_rank": 0.07712045034253773, + "percentage_ge_sources_not_in_se_sources": 62.5, + "percentage_ge_sources_in_se_sources": 37.5 }, { - "model_name": "grok-4.1-fast-non-reasoning", - "query_type": "HotpotQA", - "num_sources": 63, - "num_queries": 20, - "num_complete_scores": 63, - "unweighted_mean_score": 3.8968253968253967, - "weighted_total_content_score": 78.01169590643275, - "semantic_relevance": 3.4603174603174605, - "factual_accuracy": 4.111111111111111, - "freshness": 4.158730158730159, - "objectivity_tone": 4.190476190476191, - "layout_ad_density": 3.857142857142857, - "accountability": 3.4285714285714284, - "transparency": 3.9523809523809526, - "authority": 4.015873015873016, - "avg_ge_freq": 0.6137492063492065, - "relative_se_rank": 1.9920341456410042, - "normalized_reciprocal_se_rank": 0.07504523694999884, - "reciprocal_se_rank": 0.027741452568082244, - "percentage_ge_sources_not_in_se_sources": 82.53968253968254, - "percentage_ge_sources_in_se_sources": 17.46031746031746 + "model_name": "gpt-4o", + "query_type": "QuoraQuestions", + "num_sources": 76, + "num_queries": 19, + "num_complete_scores": 76, + "unweighted_mean_score": 3.965, + "weighted_total_content_score": 79.21052632, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.4298052631578951, + "relative_se_rank": 1.7644798178150203, + "normalized_reciprocal_se_rank": 0.0996490754594708, + "reciprocal_se_rank": 0.03365353997691167, + "percentage_ge_sources_not_in_se_sources": 71.05263157894738, + "percentage_ge_sources_in_se_sources": 28.947368421052637 }, { - "model_name": "gensee", - "query_type": "QuoraQuestions", - "num_sources": 83, - "num_queries": 18, - "num_complete_scores": 82, - "unweighted_mean_score": 3.923780487804878, - "weighted_total_content_score": 77.94546607482562, - "semantic_relevance": 4.304878048780488, - "factual_accuracy": 4.182926829268292, - "freshness": 4.390243902439025, - "objectivity_tone": 3.682926829268293, - "layout_ad_density": 3.2560975609756095, - "accountability": 3.7195121951219514, - "transparency": 3.902439024390244, - "authority": 3.951219512195122, - "avg_ge_freq": 0.5542120481927711, - "relative_se_rank": 1.6111832468899239, - "normalized_reciprocal_se_rank": 0.12268532386073742, - "reciprocal_se_rank": 0.03918894918012865, - "percentage_ge_sources_not_in_se_sources": 62.650602409638545, - "percentage_ge_sources_in_se_sources": 37.34939759036144 + "model_name": "gpt-4o", + "query_type": "VA-COS NLQ", + "num_sources": 81, + "num_queries": 19, + "num_complete_scores": 81, + "unweighted_mean_score": 4.032, + "weighted_total_content_score": 80.88369071, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.49793086419753113, + "relative_se_rank": 2.029381072384311, + "normalized_reciprocal_se_rank": 0.07521622430371759, + "reciprocal_se_rank": 0.027782539335116603, + "percentage_ge_sources_not_in_se_sources": 83.95061728395062, + "percentage_ge_sources_in_se_sources": 16.049382716049383 }, { - "model_name": "deepseek-chat-gensee", + "model_name": "Grok-4.1-Fast", + "query_type": "DebateQA", + "num_sources": null, + "num_queries": null, + "num_complete_scores": null, + "unweighted_mean_score": 4.5, + "weighted_total_content_score": 90.35087719, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null + }, + { + "model_name": "Grok-4.1-Fast", "query_type": "HotpotQA", - "num_sources": 11, - "num_queries": 3, - "num_complete_scores": 9, - "unweighted_mean_score": 4.308928571428572, - "weighted_total_content_score": 77.70334928229664, - "semantic_relevance": 4.0, - "factual_accuracy": 4.8, - "freshness": 4.8, - "objectivity_tone": 4.4, - "layout_ad_density": 4.666666666666667, - "accountability": 3.3, - "transparency": 4.1, - "authority": 4.4, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.629286988225921, - "normalized_reciprocal_se_rank": 0.14128295946477765, - "reciprocal_se_rank": 0.04365779851216744, - "percentage_ge_sources_not_in_se_sources": 81.81818181818181, - "percentage_ge_sources_in_se_sources": 18.181818181818183 + "num_sources": null, + "num_queries": null, + "num_complete_scores": null, + "unweighted_mean_score": 3.897, + "weighted_total_content_score": 78.01169591, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null }, { - "model_name": "Gemini-2.5-Flash-Preview", - "query_type": "Pinocchios", - "num_sources": 86, - "num_queries": 20, - "num_complete_scores": 85, - "unweighted_mean_score": 3.9194767441860465, - "weighted_total_content_score": 77.67441860465114, - "semantic_relevance": 3.388235294117647, - "factual_accuracy": 3.9647058823529413, - "freshness": 3.7093023255813953, - "objectivity_tone": 3.9411764705882355, - "layout_ad_density": 3.5813953488372094, - "accountability": 4.325581395348837, - "transparency": 4.3604651162790695, - "authority": 4.174418604651163, + "model_name": "Grok-4.1-Fast", + "query_type": "Pinocchios", + "num_sources": null, + "num_queries": null, + "num_complete_scores": null, + "unweighted_mean_score": 4.271, + "weighted_total_content_score": 86.00157109, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": null, "relative_se_rank": null, "normalized_reciprocal_se_rank": null, @@ -1636,44 +763,44 @@ "percentage_ge_sources_in_se_sources": null }, { - "model_name": "deepseek-reasoning-tavily", + "model_name": "Grok-4.1-Fast", "query_type": "QuoraQuestions", - "num_sources": 14, - "num_queries": 4, - "num_complete_scores": 13, - "unweighted_mean_score": 4.201923076923077, - "weighted_total_content_score": 77.36842105263158, - "semantic_relevance": 3.769230769230769, - "factual_accuracy": 4.461538461538462, - "freshness": 5.0, - "objectivity_tone": 3.6923076923076925, - "layout_ad_density": 3.6153846153846154, - "accountability": 4.230769230769231, - "transparency": 4.384615384615385, - "authority": 4.461538461538462, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.3760488515963876, - "normalized_reciprocal_se_rank": 0.20085172184169675, - "reciprocal_se_rank": 0.05797165160759218, - "percentage_ge_sources_not_in_se_sources": 50.0, - "percentage_ge_sources_in_se_sources": 50.0 + "num_sources": null, + "num_queries": null, + "num_complete_scores": null, + "unweighted_mean_score": 4.0, + "weighted_total_content_score": 78.82032668, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null }, { - "model_name": "google-search", - "query_type": "VACOS", - "num_sources": 74, - "num_queries": 20, - "num_complete_scores": 74, - "unweighted_mean_score": 3.8462837837837838, - "weighted_total_content_score": 77.0554765291607, - "semantic_relevance": 3.7567567567567566, - "factual_accuracy": 4.216216216216216, - "freshness": 4.472972972972973, - "objectivity_tone": 3.689189189189189, - "layout_ad_density": 3.1216216216216215, - "accountability": 3.6216216216216215, - "transparency": 3.7837837837837838, - "authority": 4.108108108108108, + "model_name": "Grok-4.1-Fast", + "query_type": "VA-COS NLQ", + "num_sources": null, + "num_queries": null, + "num_complete_scores": null, + "unweighted_mean_score": 4.205, + "weighted_total_content_score": 84.48621554, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": null, "relative_se_rank": null, "normalized_reciprocal_se_rank": null, @@ -1682,90 +809,251 @@ "percentage_ge_sources_in_se_sources": null }, { - "model_name": "exa", + "model_name": "Gemini-3-Pro-Preview", + "query_type": "DebateQA", + "num_sources": 100, + "num_queries": 20, + "num_complete_scores": 96, + "unweighted_mean_score": 4.173469388, + "weighted_total_content_score": 81.42105263, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.4833250000000001, + "relative_se_rank": 1.7292861678201157, + "normalized_reciprocal_se_rank": 0.10523315112286376, + "reciprocal_se_rank": 0.03499534456593085, + "percentage_ge_sources_not_in_se_sources": 72.0, + "percentage_ge_sources_in_se_sources": 28.0 + }, + { + "model_name": "Gemini-3-Pro-Preview", + "query_type": "HotpotQA", + "num_sources": 83, + "num_queries": 20, + "num_complete_scores": 77, + "unweighted_mean_score": 3.720848389, + "weighted_total_content_score": 72.63157895, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.57830843373494, + "relative_se_rank": 1.9786482310103346, + "normalized_reciprocal_se_rank": 0.056782624848369385, + "reciprocal_se_rank": 0.02335310645628293, + "percentage_ge_sources_not_in_se_sources": 83.13253012048195, + "percentage_ge_sources_in_se_sources": 16.86746987951807 + }, + { + "model_name": "Gemini-3-Pro-Preview", + "query_type": "Pinocchios", + "num_sources": 86, + "num_queries": 20, + "num_complete_scores": 84, + "unweighted_mean_score": 4.159148427, + "weighted_total_content_score": 81.87270502, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5736325581395352, + "relative_se_rank": 1.941468415075042, + "normalized_reciprocal_se_rank": 0.15368587669053982, + "reciprocal_se_rank": 0.04663811114651323, + "percentage_ge_sources_not_in_se_sources": 73.25581395348837, + "percentage_ge_sources_in_se_sources": 26.74418604651163 + }, + { + "model_name": "Gemini-3-Pro-Preview", "query_type": "QuoraQuestions", - "num_sources": 85, - "num_queries": 19, - "num_complete_scores": 85, - "unweighted_mean_score": 3.8705882352941177, - "weighted_total_content_score": 76.9659442724458, - "semantic_relevance": 3.8, - "factual_accuracy": 3.823529411764706, - "freshness": 4.564705882352941, - "objectivity_tone": 3.5647058823529414, - "layout_ad_density": 3.176470588235294, - "accountability": 4.0588235294117645, - "transparency": 4.070588235294117, - "authority": 3.9058823529411764, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.329000189685051, - "normalized_reciprocal_se_rank": 0.1994493307755928, - "reciprocal_se_rank": 0.05763466928830994, - "percentage_ge_sources_not_in_se_sources": 50.588235294117645, - "percentage_ge_sources_in_se_sources": 49.411764705882355 + "num_sources": 81, + "num_queries": 17, + "num_complete_scores": 76, + "unweighted_mean_score": 3.750843732, + "weighted_total_content_score": 72.37166992, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5061604938271607, + "relative_se_rank": 1.7841348598268654, + "normalized_reciprocal_se_rank": 0.11912889330801175, + "reciprocal_se_rank": 0.038334369993915436, + "percentage_ge_sources_not_in_se_sources": 71.60493827160494, + "percentage_ge_sources_in_se_sources": 28.395061728395063 }, { - "model_name": "gensee", + "model_name": "Gemini-3-Pro-Preview", + "query_type": "VA-COS NLQ", + "num_sources": 94, + "num_queries": 20, + "num_complete_scores": 94, + "unweighted_mean_score": 4.059840426, + "weighted_total_content_score": 81.3549832, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.4148723404255325, + "relative_se_rank": 2.3167240998153185, + "normalized_reciprocal_se_rank": 0.007184894289987778, + "reciprocal_se_rank": 0.01143520518133201, + "percentage_ge_sources_not_in_se_sources": 96.80851063829788, + "percentage_ge_sources_in_se_sources": 3.1914893617021276 + }, + { + "model_name": "Gemini-3-Flash-Preview", + "query_type": "DebateQA", + "num_sources": 100, + "num_queries": 20, + "num_complete_scores": 95, + "unweighted_mean_score": 4.147, + "weighted_total_content_score": 81.05263158, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.4899890000000004, + "relative_se_rank": 1.6757336210963478, + "normalized_reciprocal_se_rank": 0.12549266202008144, + "reciprocal_se_rank": 0.039863528009679766, + "percentage_ge_sources_not_in_se_sources": 71.0, + "percentage_ge_sources_in_se_sources": 29.0 + }, + { + "model_name": "Gemini-3-Flash-Preview", "query_type": "HotpotQA", - "num_sources": 45, - "num_queries": 16, - "num_complete_scores": 45, - "unweighted_mean_score": 3.786111111111111, - "weighted_total_content_score": 76.67836257309939, - "semantic_relevance": 3.8444444444444446, - "factual_accuracy": 4.288888888888889, - "freshness": 4.066666666666666, - "objectivity_tone": 4.133333333333334, - "layout_ad_density": 3.6444444444444444, - "accountability": 3.2, - "transparency": 3.466666666666667, - "authority": 3.6444444444444444, - "avg_ge_freq": 0.4888711111111113, - "relative_se_rank": 1.9318191821883404, - "normalized_reciprocal_se_rank": 0.06456158601930041, - "reciprocal_se_rank": 0.025222322854152286, - "percentage_ge_sources_not_in_se_sources": 84.44444444444443, - "percentage_ge_sources_in_se_sources": 15.555555555555555 + "num_sources": 85, + "num_queries": 20, + "num_complete_scores": 81, + "unweighted_mean_score": 3.68, + "weighted_total_content_score": 72.28482972, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5999917647058826, + "relative_se_rank": 1.9004372674862968, + "normalized_reciprocal_se_rank": 0.06427878374204764, + "reciprocal_se_rank": 0.0251543679380163, + "percentage_ge_sources_not_in_se_sources": 78.82352941176471, + "percentage_ge_sources_in_se_sources": 21.176470588235293 }, { - "model_name": "deepseek-reasoning-gensee", + "model_name": "Gemini-3-Flash-Preview", "query_type": "Pinocchios", - "num_sources": 16, - "num_queries": 4, - "num_complete_scores": 15, - "unweighted_mean_score": 4.125, - "weighted_total_content_score": 76.57894736842105, - "semantic_relevance": 3.6666666666666665, - "factual_accuracy": 4.133333333333334, - "freshness": 3.8, - "objectivity_tone": 3.8, - "layout_ad_density": 4.266666666666667, - "accountability": 4.333333333333333, - "transparency": 4.6, - "authority": 4.4, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.283513696623452, - "normalized_reciprocal_se_rank": 0.0625, - "reciprocal_se_rank": 0.02472694174757281, - "percentage_ge_sources_not_in_se_sources": 93.75, - "percentage_ge_sources_in_se_sources": 6.25 + "num_sources": 89, + "num_queries": 20, + "num_complete_scores": 88, + "unweighted_mean_score": 4.074, + "weighted_total_content_score": 80.40212892, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5842617977528091, + "relative_se_rank": 2.0481831598330293, + "normalized_reciprocal_se_rank": 0.14693960626866004, + "reciprocal_se_rank": 0.04501704131212949, + "percentage_ge_sources_not_in_se_sources": 73.03370786516854, + "percentage_ge_sources_in_se_sources": 26.96629213483146 + }, + { + "model_name": "Gemini-3-Flash-Preview", + "query_type": "QuoraQuestions", + "num_sources": 90, + "num_queries": 19, + "num_complete_scores": 86, + "unweighted_mean_score": 3.822, + "weighted_total_content_score": 74.25730994, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.49258000000000024, + "relative_se_rank": 1.7659524786645306, + "normalized_reciprocal_se_rank": 0.14047144175948556, + "reciprocal_se_rank": 0.04346279789851716, + "percentage_ge_sources_not_in_se_sources": 72.22222222222223, + "percentage_ge_sources_in_se_sources": 27.77777777777778 }, { - "model_name": "google-search", - "query_type": "QuoraQuestions", - "num_sources": 80, - "num_queries": 19, - "num_complete_scores": 80, - "unweighted_mean_score": 3.809375, - "weighted_total_content_score": 76.22368421052633, - "semantic_relevance": 4.05, - "factual_accuracy": 3.975, - "freshness": 4.275, - "objectivity_tone": 3.4375, - "layout_ad_density": 3.3875, - "accountability": 3.6375, - "transparency": 3.9125, - "authority": 3.8, + "model_name": "Gemini-3-Flash-Preview", + "query_type": "VA-COS NLQ", + "num_sources": 92, + "num_queries": 20, + "num_complete_scores": 88, + "unweighted_mean_score": 4.022, + "weighted_total_content_score": 78.72997712, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.45288152173913093, + "relative_se_rank": 2.168925621074675, + "normalized_reciprocal_se_rank": 0.032034724656595535, + "reciprocal_se_rank": 0.017406402283987762, + "percentage_ge_sources_not_in_se_sources": 89.1304347826087, + "percentage_ge_sources_in_se_sources": 10.869565217391303 + }, + { + "model_name": "Gemini-2.5-Flash-Preview", + "query_type": "DebateQA", + "num_sources": 100, + "num_queries": 20, + "num_complete_scores": 100, + "unweighted_mean_score": 4.133, + "weighted_total_content_score": 82.126, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": null, "relative_se_rank": null, "normalized_reciprocal_se_rank": null, @@ -1774,67 +1062,21 @@ "percentage_ge_sources_in_se_sources": null }, { - "model_name": "deepseek-reasoning-tavily", - "query_type": "VACOS", - "num_sources": 15, - "num_queries": 4, - "num_complete_scores": 14, - "unweighted_mean_score": 4.080357142857143, - "weighted_total_content_score": 76.07017543859648, - "semantic_relevance": 4.142857142857143, - "factual_accuracy": 4.5, - "freshness": 5.0, - "objectivity_tone": 3.5, - "layout_ad_density": 2.5714285714285716, - "accountability": 4.0, - "transparency": 4.285714285714286, - "authority": 4.642857142857143, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.1803071364046978, - "normalized_reciprocal_se_rank": 0.052794612794612804, - "reciprocal_se_rank": 0.022394822006472487, - "percentage_ge_sources_not_in_se_sources": 93.33333333333333, - "percentage_ge_sources_in_se_sources": 6.666666666666666 - }, - { - "model_name": "Perplexity-Sonar-Pro", - "query_type": "QuoraQuestions", - "num_sources": 82, - "num_queries": 19, - "num_complete_scores": 82, - "unweighted_mean_score": 3.8185975609756095, - "weighted_total_content_score": 75.69961489088577, - "semantic_relevance": 3.6219512195121952, - "factual_accuracy": 3.8292682926829267, - "freshness": 4.512195121951219, - "objectivity_tone": 3.3658536585365852, - "layout_ad_density": 3.317073170731707, - "accountability": 3.7195121951219514, - "transparency": 4.195121951219512, - "authority": 3.9878048780487805, - "avg_ge_freq": 0.8252060975609754, - "relative_se_rank": 1.3424344412739029, - "normalized_reciprocal_se_rank": 0.2024394067077523, - "reciprocal_se_rank": 0.05835315840793079, - "percentage_ge_sources_not_in_se_sources": 52.4390243902439, - "percentage_ge_sources_in_se_sources": 47.5609756097561 - }, - { - "model_name": "google-search", + "model_name": "Gemini-2.5-Flash-Preview", "query_type": "HotpotQA", - "num_sources": 70, + "num_sources": 84, "num_queries": 20, - "num_complete_scores": 69, - "unweighted_mean_score": 3.812244897959184, - "weighted_total_content_score": 75.33834586466166, - "semantic_relevance": 2.9285714285714284, - "factual_accuracy": 3.7857142857142856, - "freshness": 4.057142857142857, - "objectivity_tone": 3.9714285714285715, - "layout_ad_density": 4.072463768115942, - "accountability": 3.6714285714285713, - "transparency": 3.8857142857142857, - "authority": 4.128571428571429, + "num_complete_scores": 84, + "unweighted_mean_score": 3.72, + "weighted_total_content_score": 73.86, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": null, "relative_se_rank": null, "normalized_reciprocal_se_rank": null, @@ -1843,73 +1085,27 @@ "percentage_ge_sources_in_se_sources": null }, { - "model_name": "deepseek-chat-gensee", - "query_type": "VACOS", - "num_sources": 18, - "num_queries": 4, - "num_complete_scores": 16, - "unweighted_mean_score": 4.110294117647059, - "weighted_total_content_score": 74.85380116959064, - "semantic_relevance": 4.3125, - "factual_accuracy": 4.25, - "freshness": 4.882352941176471, - "objectivity_tone": 3.8125, - "layout_ad_density": 3.1875, - "accountability": 3.6470588235294117, - "transparency": 4.235294117647059, - "authority": 4.411764705882353, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.211382113821138, - "normalized_reciprocal_se_rank": 0.043995510662177335, - "reciprocal_se_rank": 0.020280474649406684, - "percentage_ge_sources_not_in_se_sources": 94.44444444444444, - "percentage_ge_sources_in_se_sources": 5.555555555555555 - }, - { - "model_name": "deepseek-chat-tavily", - "query_type": "HotpotQA", - "num_sources": 9, - "num_queries": 3, - "num_complete_scores": 8, - "unweighted_mean_score": 4.21875, - "weighted_total_content_score": 74.85380116959064, - "semantic_relevance": 3.375, - "factual_accuracy": 4.75, - "freshness": 4.625, - "objectivity_tone": 4.375, - "layout_ad_density": 4.5, - "accountability": 3.125, - "transparency": 4.5, - "authority": 4.5, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.740759746838774, - "normalized_reciprocal_se_rank": 0.17267917267917265, - "reciprocal_se_rank": 0.05120203421174294, - "percentage_ge_sources_not_in_se_sources": 77.77777777777777, - "percentage_ge_sources_in_se_sources": 22.22222222222222 - }, - { - "model_name": "deepseek-chat-gensee", - "query_type": "QuoraQuestions", - "num_sources": 17, - "num_queries": 4, - "num_complete_scores": 16, - "unweighted_mean_score": 3.8676470588235294, - "weighted_total_content_score": 74.61300309597522, - "semantic_relevance": 3.5294117647058822, - "factual_accuracy": 4.117647058823529, - "freshness": 4.625, - "objectivity_tone": 3.4705882352941178, - "layout_ad_density": 3.235294117647059, - "accountability": 4.0, - "transparency": 4.0, - "authority": 3.875, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.810108670223448, - "normalized_reciprocal_se_rank": 0.13864781252324507, - "reciprocal_se_rank": 0.043024595727672955, - "percentage_ge_sources_not_in_se_sources": 70.58823529411765, - "percentage_ge_sources_in_se_sources": 29.41176470588235 + "model_name": "Gemini-2.5-Flash-Preview", + "query_type": "Pinocchios", + "num_sources": 86, + "num_queries": 20, + "num_complete_scores": 85, + "unweighted_mean_score": 3.931, + "weighted_total_content_score": 77.6744186, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null }, { "model_name": "Gemini-2.5-Flash-Preview", @@ -1917,16 +1113,39 @@ "num_sources": 83, "num_queries": 18, "num_complete_scores": 81, - "unweighted_mean_score": 3.789457831325301, - "weighted_total_content_score": 74.40710209258081, - "semantic_relevance": 3.5853658536585367, - "factual_accuracy": 3.768292682926829, - "freshness": 4.463414634146342, - "objectivity_tone": 3.4878048780487805, - "layout_ad_density": 3.036144578313253, - "accountability": 3.975609756097561, - "transparency": 4.109756097560975, - "authority": 3.8902439024390243, + "unweighted_mean_score": 3.79, + "weighted_total_content_score": 74.40710209, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null + }, + { + "model_name": "Gemini-2.5-Flash-Preview", + "query_type": "VA-COS NLQ", + "num_sources": 91, + "num_queries": 20, + "num_complete_scores": 91, + "unweighted_mean_score": 4.047, + "weighted_total_content_score": 81.01792944, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": null, "relative_se_rank": null, "normalized_reciprocal_se_rank": null, @@ -1934,22 +1153,91 @@ "percentage_ge_sources_not_in_se_sources": null, "percentage_ge_sources_in_se_sources": null }, + { + "model_name": "claude", + "query_type": "DebateQA", + "num_sources": 70, + "num_queries": 20, + "num_complete_scores": 65, + "unweighted_mean_score": 4.281, + "weighted_total_content_score": 79.39849624, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8523828571428572, + "relative_se_rank": 1.3421362086210757, + "normalized_reciprocal_se_rank": 0.21041652104583275, + "reciprocal_se_rank": 0.06026998928043071, + "percentage_ge_sources_not_in_se_sources": 54.28571428571426, + "percentage_ge_sources_in_se_sources": 45.71428571428574 + }, + { + "model_name": "claude", + "query_type": "HotpotQA", + "num_sources": 10, + "num_queries": 7, + "num_complete_scores": 10, + "unweighted_mean_score": 3.925, + "weighted_total_content_score": 79.15789474, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8333400000000001, + "relative_se_rank": 1.8379612104849017, + "normalized_reciprocal_se_rank": 0.1327922077922078, + "reciprocal_se_rank": 0.04161754507628294, + "percentage_ge_sources_not_in_se_sources": 70.00000000000001, + "percentage_ge_sources_in_se_sources": 30.0 + }, + { + "model_name": "claude", + "query_type": "Pinocchios", + "num_sources": 39, + "num_queries": 20, + "num_complete_scores": 39, + "unweighted_mean_score": 4.263, + "weighted_total_content_score": 85.56005398, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8205102564102565, + "relative_se_rank": 1.4724542616408207, + "normalized_reciprocal_se_rank": 0.28256007847697834, + "reciprocal_se_rank": 0.07760545575053605, + "percentage_ge_sources_not_in_se_sources": 53.84615384615383, + "percentage_ge_sources_in_se_sources": 46.153846153846175 + }, { "model_name": "claude", "query_type": "QuoraQuestions", "num_sources": 59, "num_queries": 18, "num_complete_scores": 58, - "unweighted_mean_score": 3.8017241379310347, - "weighted_total_content_score": 74.39785905441569, - "semantic_relevance": 3.9655172413793105, - "factual_accuracy": 3.793103448275862, - "freshness": 4.517241379310345, - "objectivity_tone": 3.310344827586207, - "layout_ad_density": 3.1206896551724137, - "accountability": 3.8793103448275863, - "transparency": 4.0344827586206895, - "authority": 3.793103448275862, + "unweighted_mean_score": 3.802, + "weighted_total_content_score": 74.39785905, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 0.8079084745762712, "relative_se_rank": 1.2724077582318005, "normalized_reciprocal_se_rank": 0.18350554762304847, @@ -1958,67 +1246,159 @@ "percentage_ge_sources_in_se_sources": 50.84745762711865 }, { - "model_name": "deepseek-reasoning-gensee", + "model_name": "claude", + "query_type": "VA-COS NLQ", + "num_sources": 81, + "num_queries": 19, + "num_complete_scores": 81, + "unweighted_mean_score": 3.989, + "weighted_total_content_score": 80.10396361, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.7819024691358022, + "relative_se_rank": 2.0549062284325044, + "normalized_reciprocal_se_rank": 0.056933641949831956, + "reciprocal_se_rank": 0.02338939454619748, + "percentage_ge_sources_not_in_se_sources": 83.95061728395062, + "percentage_ge_sources_in_se_sources": 16.049382716049383 + }, + { + "model_name": "Perplexity-Sonar-Pro", + "query_type": "DebateQA", + "num_sources": 82, + "num_queries": 20, + "num_complete_scores": 79, + "unweighted_mean_score": 4.235, + "weighted_total_content_score": 82.51604621, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.7804853658536585, + "relative_se_rank": 1.185618672325219, + "normalized_reciprocal_se_rank": 0.2274442114543135, + "reciprocal_se_rank": 0.0643615944999443, + "percentage_ge_sources_not_in_se_sources": 46.34146341463415, + "percentage_ge_sources_in_se_sources": 53.65853658536585 + }, + { + "model_name": "Perplexity-Sonar-Pro", "query_type": "HotpotQA", - "num_sources": 8, - "num_queries": 3, - "num_complete_scores": 7, - "unweighted_mean_score": 4.25, - "weighted_total_content_score": 74.34210526315789, - "semantic_relevance": 3.4285714285714284, - "factual_accuracy": 4.714285714285714, - "freshness": 5.0, - "objectivity_tone": 4.571428571428571, - "layout_ad_density": 4.714285714285714, - "accountability": 3.142857142857143, - "transparency": 3.857142857142857, - "authority": 4.571428571428571, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.8173972683851103, - "normalized_reciprocal_se_rank": 0.19426406926406925, - "reciprocal_se_rank": 0.056388696255201105, - "percentage_ge_sources_not_in_se_sources": 75.0, - "percentage_ge_sources_in_se_sources": 25.0 + "num_sources": 79, + "num_queries": 20, + "num_complete_scores": 78, + "unweighted_mean_score": 3.591, + "weighted_total_content_score": 71.13924051, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8143506329113921, + "relative_se_rank": 1.61782985019127, + "normalized_reciprocal_se_rank": 0.13063939371395492, + "reciprocal_se_rank": 0.04110024266427558, + "percentage_ge_sources_not_in_se_sources": 63.29113924050633, + "percentage_ge_sources_in_se_sources": 36.70886075949367 }, { - "model_name": "Gemini-3-Flash-Preview", + "model_name": "Perplexity-Sonar-Pro", + "query_type": "Pinocchios", + "num_sources": 72, + "num_queries": 19, + "num_complete_scores": 72, + "unweighted_mean_score": 4.087, + "weighted_total_content_score": 80.99415205, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8148138888888888, + "relative_se_rank": 1.9321450595878684, + "normalized_reciprocal_se_rank": 0.20414708640646176, + "reciprocal_se_rank": 0.05876349891805757, + "percentage_ge_sources_not_in_se_sources": 62.5, + "percentage_ge_sources_in_se_sources": 37.5 + }, + { + "model_name": "Perplexity-Sonar-Pro", "query_type": "QuoraQuestions", - "num_sources": 90, + "num_sources": 82, "num_queries": 19, - "num_complete_scores": 86, - "unweighted_mean_score": 3.828611111111111, - "weighted_total_content_score": 74.25730994152043, - "semantic_relevance": 3.558139534883721, - "factual_accuracy": 3.7790697674418605, - "freshness": 4.822222222222222, - "objectivity_tone": 3.5813953488372094, - "layout_ad_density": 3.1797752808988764, - "accountability": 3.911111111111111, - "transparency": 3.9444444444444446, - "authority": 3.8, - "avg_ge_freq": 0.49258000000000024, - "relative_se_rank": 1.7659524786645306, - "normalized_reciprocal_se_rank": 0.14047144175948556, - "reciprocal_se_rank": 0.04346279789851716, - "percentage_ge_sources_not_in_se_sources": 72.22222222222223, - "percentage_ge_sources_in_se_sources": 27.77777777777778 + "num_complete_scores": 82, + "unweighted_mean_score": 3.819, + "weighted_total_content_score": 75.69961489, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8252060975609754, + "relative_se_rank": 1.3424344412739029, + "normalized_reciprocal_se_rank": 0.2024394067077523, + "reciprocal_se_rank": 0.05835315840793079, + "percentage_ge_sources_not_in_se_sources": 52.4390243902439, + "percentage_ge_sources_in_se_sources": 47.5609756097561 }, { - "model_name": "Gemini-2.5-Flash-Preview", - "query_type": "HotpotQA", - "num_sources": 84, + "model_name": "Perplexity-Sonar-Pro", + "query_type": "VA-COS NLQ", + "num_sources": 64, "num_queries": 20, - "num_complete_scores": 84, - "unweighted_mean_score": 3.7202380952380953, - "weighted_total_content_score": 73.859649122807, - "semantic_relevance": 2.6785714285714284, - "factual_accuracy": 3.7738095238095237, - "freshness": 4.535714285714286, - "objectivity_tone": 4.190476190476191, - "layout_ad_density": 3.511904761904762, - "accountability": 3.5238095238095237, - "transparency": 3.761904761904762, - "authority": 3.7857142857142856, + "num_complete_scores": 62, + "unweighted_mean_score": 4.048, + "weighted_total_content_score": 79.39144737, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.8385406249999999, + "relative_se_rank": 2.067510788296202, + "normalized_reciprocal_se_rank": 0.051796852838519515, + "reciprocal_se_rank": 0.02215506900731415, + "percentage_ge_sources_not_in_se_sources": 85.93750000000001, + "percentage_ge_sources_in_se_sources": 14.0625 + }, + { + "model_name": "google-search", + "query_type": "DebateQA", + "num_sources": 89, + "num_queries": 20, + "num_complete_scores": 89, + "unweighted_mean_score": 4.263, + "weighted_total_content_score": 85.27498522, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": null, "relative_se_rank": null, "normalized_reciprocal_se_rank": null, @@ -2027,142 +1407,119 @@ "percentage_ge_sources_in_se_sources": null }, { - "model_name": "tavily", - "query_type": "QuoraQuestions", - "num_sources": 78, - "num_queries": 19, - "num_complete_scores": 78, - "unweighted_mean_score": 3.7115384615384617, - "weighted_total_content_score": 73.6707152496626, - "semantic_relevance": 3.5128205128205128, - "factual_accuracy": 3.769230769230769, - "freshness": 4.576923076923077, - "objectivity_tone": 3.3205128205128207, - "layout_ad_density": 3.051282051282051, - "accountability": 3.8846153846153846, - "transparency": 3.7564102564102564, - "authority": 3.8205128205128207, - "avg_ge_freq": 1.0, - "relative_se_rank": 0.9222314853068678, - "normalized_reciprocal_se_rank": 0.31851954838074426, - "reciprocal_se_rank": 0.08624620215945074, - "percentage_ge_sources_not_in_se_sources": 32.05128205128205, - "percentage_ge_sources_in_se_sources": 67.94871794871796 - }, - { - "model_name": "deepseek-reasoning-gensee", - "query_type": "QuoraQuestions", - "num_sources": 17, - "num_queries": 4, - "num_complete_scores": 16, - "unweighted_mean_score": 3.9296875, - "weighted_total_content_score": 73.49845201238391, - "semantic_relevance": 3.4375, - "factual_accuracy": 4.1875, - "freshness": 4.75, - "objectivity_tone": 3.6875, - "layout_ad_density": 3.25, - "accountability": 4.0625, - "transparency": 4.0, - "authority": 4.0625, - "avg_ge_freq": 1.0, - "relative_se_rank": 1.5443321452749619, - "normalized_reciprocal_se_rank": 0.16442527055248699, - "reciprocal_se_rank": 0.049218693652175266, - "percentage_ge_sources_not_in_se_sources": 58.8235294117647, - "percentage_ge_sources_in_se_sources": 41.1764705882353 + "model_name": "google-search", + "query_type": "HotpotQA", + "num_sources": 70, + "num_queries": 20, + "num_complete_scores": 69, + "unweighted_mean_score": 3.813, + "weighted_total_content_score": 75.33834586, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null }, { - "model_name": "Gemini-3-Pro-Preview", - "query_type": "HotpotQA", - "num_sources": 83, + "model_name": "google-search", + "query_type": "Pinocchios", + "num_sources": 93, "num_queries": 20, - "num_complete_scores": 77, - "unweighted_mean_score": 3.71617900172117, - "weighted_total_content_score": 72.63157894736844, - "semantic_relevance": 2.9753086419753085, - "factual_accuracy": 3.765432098765432, - "freshness": 4.719512195121951, - "objectivity_tone": 4.135802469135802, - "layout_ad_density": 3.5, - "accountability": 3.451219512195122, - "transparency": 3.6951219512195124, - "authority": 3.524390243902439, - "avg_ge_freq": 0.57830843373494, - "relative_se_rank": 1.9786482310103346, - "normalized_reciprocal_se_rank": 0.056782624848369385, - "reciprocal_se_rank": 0.02335310645628293, - "percentage_ge_sources_not_in_se_sources": 83.13253012048195, - "percentage_ge_sources_in_se_sources": 16.86746987951807 + "num_complete_scores": 91, + "unweighted_mean_score": 4.176, + "weighted_total_content_score": 83.53140917, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null }, { - "model_name": "Gemini-3-Pro-Preview", + "model_name": "google-search", "query_type": "QuoraQuestions", - "num_sources": 81, - "num_queries": 17, - "num_complete_scores": 76, - "unweighted_mean_score": 3.75, - "weighted_total_content_score": 72.37166991552957, - "semantic_relevance": 3.371794871794872, - "factual_accuracy": 3.7948717948717947, - "freshness": 4.9113924050632916, - "objectivity_tone": 3.5641025641025643, - "layout_ad_density": 3.0987654320987654, - "accountability": 3.7974683544303796, - "transparency": 3.8227848101265822, - "authority": 3.6455696202531644, - "avg_ge_freq": 0.5061604938271607, - "relative_se_rank": 1.7841348598268654, - "normalized_reciprocal_se_rank": 0.11912889330801175, - "reciprocal_se_rank": 0.038334369993915436, - "percentage_ge_sources_not_in_se_sources": 71.60493827160494, - "percentage_ge_sources_in_se_sources": 28.395061728395063 + "num_sources": 80, + "num_queries": 19, + "num_complete_scores": 80, + "unweighted_mean_score": 3.809, + "weighted_total_content_score": 76.22368421, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null }, { - "model_name": "Gemini-3-Flash-Preview", - "query_type": "HotpotQA", - "num_sources": 85, + "model_name": "google-search", + "query_type": "VA-COS NLQ", + "num_sources": 74, "num_queries": 20, - "num_complete_scores": 81, - "unweighted_mean_score": 3.6874369747899154, - "weighted_total_content_score": 72.28482972136224, - "semantic_relevance": 2.9146341463414633, - "factual_accuracy": 3.8902439024390243, - "freshness": 4.211764705882353, - "objectivity_tone": 4.2317073170731705, - "layout_ad_density": 3.5714285714285716, - "accountability": 3.411764705882353, - "transparency": 3.6588235294117646, - "authority": 3.552941176470588, - "avg_ge_freq": 0.5999917647058826, - "relative_se_rank": 1.9004372674862968, - "normalized_reciprocal_se_rank": 0.06427878374204764, - "reciprocal_se_rank": 0.0251543679380163, - "percentage_ge_sources_not_in_se_sources": 78.82352941176471, - "percentage_ge_sources_in_se_sources": 21.176470588235293 + "num_complete_scores": 74, + "unweighted_mean_score": 3.846, + "weighted_total_content_score": 77.05547653, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": null, + "relative_se_rank": null, + "normalized_reciprocal_se_rank": null, + "reciprocal_se_rank": null, + "percentage_ge_sources_not_in_se_sources": null, + "percentage_ge_sources_in_se_sources": null }, { - "model_name": "Perplexity-Sonar-Pro", - "query_type": "HotpotQA", - "num_sources": 79, + "model_name": "exa", + "query_type": "DebateQA", + "num_sources": 89, "num_queries": 20, - "num_complete_scores": 78, - "unweighted_mean_score": 3.590641952983725, - "weighted_total_content_score": 71.13924050632909, - "semantic_relevance": 2.949367088607595, - "factual_accuracy": 3.5949367088607596, - "freshness": 4.075949367088608, - "objectivity_tone": 3.670886075949367, - "layout_ad_density": 3.4871794871794872, - "accountability": 3.3164556962025316, - "transparency": 3.7848101265822787, - "authority": 3.848101265822785, - "avg_ge_freq": 0.8143506329113921, - "relative_se_rank": 1.61782985019127, - "normalized_reciprocal_se_rank": 0.13063939371395492, - "reciprocal_se_rank": 0.04110024266427558, - "percentage_ge_sources_not_in_se_sources": 63.29113924050633, - "percentage_ge_sources_in_se_sources": 36.70886075949367 + "num_complete_scores": 87, + "unweighted_mean_score": 4.327, + "weighted_total_content_score": 86.44589001, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 1.0, + "relative_se_rank": 1.010284064126628, + "normalized_reciprocal_se_rank": 0.28063309301928224, + "reciprocal_se_rank": 0.07714241798278879, + "percentage_ge_sources_not_in_se_sources": 38.20224719101124, + "percentage_ge_sources_in_se_sources": 61.79775280898876 }, { "model_name": "exa", @@ -2170,16 +1527,16 @@ "num_sources": 83, "num_queries": 20, "num_complete_scores": 83, - "unweighted_mean_score": 3.572289156626506, - "weighted_total_content_score": 70.83069118579579, - "semantic_relevance": 2.4939759036144578, - "factual_accuracy": 3.4578313253012047, - "freshness": 4.156626506024097, - "objectivity_tone": 4.180722891566265, - "layout_ad_density": 3.36144578313253, - "accountability": 3.3855421686746987, - "transparency": 3.783132530120482, - "authority": 3.7590361445783134, + "unweighted_mean_score": 3.572, + "weighted_total_content_score": 70.83069119, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 1.0, "relative_se_rank": 1.371670293189712, "normalized_reciprocal_se_rank": 0.22448376867351463, @@ -2188,50 +1545,96 @@ "percentage_ge_sources_in_se_sources": 45.78313253012048 }, { - "model_name": "deepseek-reasoning-tavily", - "query_type": "HotpotQA", - "num_sources": 6, - "num_queries": 3, - "num_complete_scores": 5, - "unweighted_mean_score": 4.25, - "weighted_total_content_score": 70.52631578947368, - "semantic_relevance": 3.2, - "factual_accuracy": 4.8, - "freshness": 4.4, - "objectivity_tone": 4.4, - "layout_ad_density": 4.8, - "accountability": 2.8, - "transparency": 4.8, - "authority": 4.8, + "model_name": "exa", + "query_type": "Pinocchios", + "num_sources": 87, + "num_queries": 20, + "num_complete_scores": 86, + "unweighted_mean_score": 4.199, + "weighted_total_content_score": 83.59346642, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 1.0, - "relative_se_rank": 2.241378176028632, - "normalized_reciprocal_se_rank": 0.259018759018759, - "reciprocal_se_rank": 0.07194868238557559, - "percentage_ge_sources_not_in_se_sources": 66.66666666666667, - "percentage_ge_sources_in_se_sources": 33.333333333333336 + "relative_se_rank": 1.80952333519013, + "normalized_reciprocal_se_rank": 0.2313966587355651, + "reciprocal_se_rank": 0.06531133304568201, + "percentage_ge_sources_not_in_se_sources": 62.06896551724138, + "percentage_ge_sources_in_se_sources": 37.93103448275862 }, { - "model_name": "deepseek-chat-tavily", + "model_name": "exa", "query_type": "QuoraQuestions", - "num_sources": 19, - "num_queries": 4, - "num_complete_scores": 17, - "unweighted_mean_score": 3.9338235294117645, - "weighted_total_content_score": 69.58448753462604, - "semantic_relevance": 3.411764705882353, - "factual_accuracy": 4.0, - "freshness": 4.764705882352941, - "objectivity_tone": 3.5294117647058822, - "layout_ad_density": 3.411764705882353, - "accountability": 4.117647058823529, - "transparency": 4.117647058823529, - "authority": 4.117647058823529, + "num_sources": 85, + "num_queries": 19, + "num_complete_scores": 85, + "unweighted_mean_score": 3.871, + "weighted_total_content_score": 76.96594427, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 1.0, + "relative_se_rank": 1.329000189685051, + "normalized_reciprocal_se_rank": 0.1994493307755928, + "reciprocal_se_rank": 0.05763466928830994, + "percentage_ge_sources_not_in_se_sources": 50.588235294117645, + "percentage_ge_sources_in_se_sources": 49.411764705882355 + }, + { + "model_name": "exa", + "query_type": "VA-COS NLQ", + "num_sources": 81, + "num_queries": 20, + "num_complete_scores": 80, + "unweighted_mean_score": 4.135, + "weighted_total_content_score": 81.97530864, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 1.0, + "relative_se_rank": 2.003012313759094, + "normalized_reciprocal_se_rank": 0.07692643713869617, + "reciprocal_se_rank": 0.028193488535754666, + "percentage_ge_sources_not_in_se_sources": 82.71604938271606, + "percentage_ge_sources_in_se_sources": 17.28395061728395 + }, + { + "model_name": "tavily", + "query_type": "DebateQA", + "num_sources": 76, + "num_queries": 20, + "num_complete_scores": 76, + "unweighted_mean_score": 4.184, + "weighted_total_content_score": 83.55955679, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 1.0, - "relative_se_rank": 1.2778629209492955, - "normalized_reciprocal_se_rank": 0.2256203749740223, - "reciprocal_se_rank": 0.06392334253016554, - "percentage_ge_sources_not_in_se_sources": 47.36842105263158, - "percentage_ge_sources_in_se_sources": 52.63157894736842 + "relative_se_rank": 0.9508661126222719, + "normalized_reciprocal_se_rank": 0.3434220529106368, + "reciprocal_se_rank": 0.09223005640328419, + "percentage_ge_sources_not_in_se_sources": 36.8421052631579, + "percentage_ge_sources_in_se_sources": 63.1578947368421 }, { "model_name": "tavily", @@ -2239,16 +1642,16 @@ "num_sources": 77, "num_queries": 18, "num_complete_scores": 73, - "unweighted_mean_score": 3.5633333333333335, - "weighted_total_content_score": 68.33902939166094, - "semantic_relevance": 2.635135135135135, - "factual_accuracy": 3.5675675675675675, - "freshness": 4.162162162162162, - "objectivity_tone": 4.108108108108108, - "layout_ad_density": 3.472972972972973, - "accountability": 3.4054054054054053, - "transparency": 3.5, - "authority": 3.77027027027027, + "unweighted_mean_score": 3.578, + "weighted_total_content_score": 68.33902939, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 1.0, "relative_se_rank": 1.150415973913549, "normalized_reciprocal_se_rank": 0.27249775517340985, @@ -2257,50 +1660,188 @@ "percentage_ge_sources_in_se_sources": 58.44155844155844 }, { - "model_name": "deepseek-reasoning-gensee", - "query_type": "VACOS", - "num_sources": 20, - "num_queries": 4, - "num_complete_scores": 16, - "unweighted_mean_score": 4.051470588235294, - "weighted_total_content_score": 67.57894736842104, - "semantic_relevance": 4.411764705882353, - "factual_accuracy": 4.411764705882353, - "freshness": 4.625, - "objectivity_tone": 3.8823529411764706, - "layout_ad_density": 2.8823529411764706, - "accountability": 3.9375, - "transparency": 3.9375, - "authority": 4.3125, - "avg_ge_freq": 1.0, - "relative_se_rank": 2.1174661246612465, - "normalized_reciprocal_se_rank": 0.0791919191919192, - "reciprocal_se_rank": 0.0287378640776699, - "percentage_ge_sources_not_in_se_sources": 90.0, - "percentage_ge_sources_in_se_sources": 10.0 + "model_name": "tavily", + "query_type": "Pinocchios", + "num_sources": 81, + "num_queries": 20, + "num_complete_scores": 79, + "unweighted_mean_score": 4.192, + "weighted_total_content_score": 82.27420403, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.9958851851851852, + "relative_se_rank": 1.4422121160476211, + "normalized_reciprocal_se_rank": 0.3128941728047197, + "reciprocal_se_rank": 0.08489447356229925, + "percentage_ge_sources_not_in_se_sources": 45.67901234567901, + "percentage_ge_sources_in_se_sources": 54.32098765432099 }, { - "model_name": "deepseek-chat-tavily", - "query_type": "VACOS", - "num_sources": 13, - "num_queries": 4, - "num_complete_scores": 10, - "unweighted_mean_score": 3.525, - "weighted_total_content_score": 54.17004048582995, - "semantic_relevance": 3.5, - "factual_accuracy": 3.5, - "freshness": 4.6, - "objectivity_tone": 3.5, - "layout_ad_density": 2.1, - "accountability": 3.2, - "transparency": 3.8, - "authority": 4.0, + "model_name": "tavily", + "query_type": "QuoraQuestions", + "num_sources": 78, + "num_queries": 19, + "num_complete_scores": 78, + "unweighted_mean_score": 3.712, + "weighted_total_content_score": 73.67071525, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, "avg_ge_freq": 1.0, - "relative_se_rank": 2.1902022097144047, - "normalized_reciprocal_se_rank": 0.060916860916860924, - "reciprocal_se_rank": 0.024346527259148616, - "percentage_ge_sources_not_in_se_sources": 92.3076923076923, - "percentage_ge_sources_in_se_sources": 7.692307692307692 + "relative_se_rank": 0.9222314853068678, + "normalized_reciprocal_se_rank": 0.31851954838074426, + "reciprocal_se_rank": 0.08624620215945074, + "percentage_ge_sources_not_in_se_sources": 32.05128205128205, + "percentage_ge_sources_in_se_sources": 67.94871794871796 + }, + { + "model_name": "tavily", + "query_type": "VA-COS NLQ", + "num_sources": 83, + "num_queries": 20, + "num_complete_scores": 83, + "unweighted_mean_score": 3.955, + "weighted_total_content_score": 78.80786303, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.9397590361445783, + "relative_se_rank": 1.7133062132347119, + "normalized_reciprocal_se_rank": 0.1335959024960005, + "reciprocal_se_rank": 0.04181066589102925, + "percentage_ge_sources_not_in_se_sources": 68.67469879518072, + "percentage_ge_sources_in_se_sources": 31.325301204819276 + }, + { + "model_name": "gensee", + "query_type": "DebateQA", + "num_sources": 89, + "num_queries": 20, + "num_complete_scores": 87, + "unweighted_mean_score": 4.278, + "weighted_total_content_score": 85.36960378, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5730269662921349, + "relative_se_rank": 1.455758534903284, + "normalized_reciprocal_se_rank": 0.17012042957025347, + "reciprocal_se_rank": 0.05058719060061921, + "percentage_ge_sources_not_in_se_sources": 58.42696629213483, + "percentage_ge_sources_in_se_sources": 41.57303370786517 + }, + { + "model_name": "gensee", + "query_type": "HotpotQA", + "num_sources": 45, + "num_queries": 16, + "num_complete_scores": 45, + "unweighted_mean_score": 3.786, + "weighted_total_content_score": 76.67836257, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.4888711111111113, + "relative_se_rank": 1.9318191821883404, + "normalized_reciprocal_se_rank": 0.06456158601930041, + "reciprocal_se_rank": 0.025222322854152286, + "percentage_ge_sources_not_in_se_sources": 84.44444444444443, + "percentage_ge_sources_in_se_sources": 15.555555555555555 + }, + { + "model_name": "gensee", + "query_type": "Pinocchios", + "num_sources": 77, + "num_queries": 20, + "num_complete_scores": 75, + "unweighted_mean_score": 4.283, + "weighted_total_content_score": 85.94668489, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.4934883116883122, + "relative_se_rank": 1.7999418903256372, + "normalized_reciprocal_se_rank": 0.19747215967140244, + "reciprocal_se_rank": 0.05715957234822537, + "percentage_ge_sources_not_in_se_sources": 70.12987012987011, + "percentage_ge_sources_in_se_sources": 29.870129870129865 + }, + { + "model_name": "gensee", + "query_type": "QuoraQuestions", + "num_sources": 83, + "num_queries": 18, + "num_complete_scores": 82, + "unweighted_mean_score": 3.924, + "weighted_total_content_score": 77.94546607, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5542120481927711, + "relative_se_rank": 1.6111832468899239, + "normalized_reciprocal_se_rank": 0.12268532386073742, + "reciprocal_se_rank": 0.03918894918012865, + "percentage_ge_sources_not_in_se_sources": 62.650602409638545, + "percentage_ge_sources_in_se_sources": 37.34939759036144 + }, + { + "model_name": "gensee", + "query_type": "VA-COS NLQ", + "num_sources": 88, + "num_queries": 19, + "num_complete_scores": 88, + "unweighted_mean_score": 3.933, + "weighted_total_content_score": 79.43779904, + "semantic_relevance": null, + "factual_accuracy": null, + "freshness": null, + "objectivity_tone": null, + "layout_ad_density": null, + "accountability": null, + "transparency": null, + "authority": null, + "avg_ge_freq": 0.5340818181818183, + "relative_se_rank": 2.115456142814551, + "normalized_reciprocal_se_rank": 0.07056832757590334, + "reciprocal_se_rank": 0.02666569036411269, + "percentage_ge_sources_not_in_se_sources": 87.5, + "percentage_ge_sources_in_se_sources": 12.5 } ], "queries": [