""" Model comparison: DeepSeek-V3.1 vs GPT-oss-120b Configurations: section/hybrid and md_recursive/hybrid With and without cross-encoder reranking Queries: 3 hardest questions from ablation (ones that caused faithfulness < 1) Runs 2 models × 2 strategies × 2 rerank modes × 3 queries = 24 evaluations. Results saved to data/model_comparison.csv """ import os import sys import time import pandas as pd sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from src.retrieval.query import load_model, init_pinecone, retrieve from src.retrieval.hybrid import build_bm25_index, load_reranker, hybrid_retrieve from src.generation.generate import generate_answer from src.evaluation.evaluate import ( compute_faithfulness, compute_relevancy, build_faithfulness_context, _load_cache, _save_cache, CACHE_PATH, ) BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) OUTPUT_PATH = os.path.join(BASE_DIR, "data", "model_comparison.csv") # 3 hardest questions — these caused faithfulness < 1 in the ablation study HARD_QUERIES = [ "Based strictly on AAR-00/03 for TWA Flight 800, which debris field was the smallest and what were the exact fuselage station markers for the wreckage it contained?", "Based strictly on AAR-00/01 for Korean Air Flight 801, what was the exact decision height for the ILS approach at Guam and at what altitude did the crew first receive a GPWS warning?", "Based strictly on AAR-14/01 for Asiana Airlines Flight 214, state the exact CVR timestamp in hours, minutes, and seconds when the stick shaker first activated, the exact indicated airspeed at that moment, and the exact radio altitude recorded simultaneously by the FDR.", ] STRATEGIES = ["section", "md_recursive"] MODELS = ["gpt", "deepseek"] RERANK_MODES = [True, False] # True = with cross-encoder, False = RRF only TOP_K = 15 def run_one(query, strategy, llm_model, use_reranker, jina_model, index, bm25_cache, reranker, cache): t_total = time.perf_counter() bm25, chunks = bm25_cache[strategy] t_ret = time.perf_counter() if use_reranker: matches = hybrid_retrieve( query, strategy, top_k=TOP_K, bm25=bm25, chunks=chunks, reranker=reranker, model=jina_model, index=index, ) else: matches = hybrid_retrieve( query, strategy, top_k=TOP_K, bm25=bm25, chunks=chunks, reranker=None, model=jina_model, index=index, ) retrieval_time = round(time.perf_counter() - t_ret, 3) faithfulness_context = build_faithfulness_context(query, matches) t_gen = time.perf_counter() answer = generate_answer(query, matches, llm_provider=llm_model) generation_time = round(time.perf_counter() - t_gen, 3) total_time = round(time.perf_counter() - t_total, 3) faith_score, _ = compute_faithfulness(answer, [faithfulness_context], query=query, cache=cache) rel_score, _ = compute_relevancy(query, answer, jina_model, cache=cache) return { "query": query[:80] + "..." if len(query) > 80 else query, "strategy": strategy, "llm_model": llm_model, "cross_encoder": use_reranker, "faithfulness": round(faith_score, 3), "relevancy": round(rel_score, 3), "retrieval_time": retrieval_time, "generation_time": generation_time, "total_time": total_time, "answer_snippet": answer[:300].replace("\n", " "), } def main(): print("Loading models...") jina_model = load_model() index = init_pinecone() reranker = load_reranker() cache = _load_cache(CACHE_PATH) bm25_cache = {} for s in STRATEGIES: print(f"Building BM25 index for {s}...") bm25_cache[s] = build_bm25_index(s) results = [] total = len(HARD_QUERIES) * len(STRATEGIES) * len(MODELS) * len(RERANK_MODES) done = 0 for query in HARD_QUERIES: for strategy in STRATEGIES: for llm_model in MODELS: for use_reranker in RERANK_MODES: done += 1 rerank_label = "w/ cross-encoder" if use_reranker else "RRF only" print(f" [{done:>2}/{total}] {strategy} | {llm_model} | {rerank_label}") try: row = run_one(query, strategy, llm_model, use_reranker, jina_model, index, bm25_cache, reranker, cache) results.append(row) except Exception as e: print(f" ERROR: {e}") _save_cache(cache, CACHE_PATH) df = pd.DataFrame(results) df.to_csv(OUTPUT_PATH, index=False) print(f"\nResults saved to {OUTPUT_PATH}") # Summary table summary = df.groupby(["strategy", "llm_model", "cross_encoder"])[ ["faithfulness", "relevancy", "retrieval_time", "generation_time"] ].mean().round(3) print("\n=== MODEL COMPARISON SUMMARY ===") print(summary.to_string()) if __name__ == "__main__": main()