"""Baseline RAG eval — runs with OLD pipeline (no reranking, no headers, no query expansion, sentence-aware chunking).

The script indexes a small built-in sample document into a freshly created
notebook, runs a fixed set of queries through ``retrieve`` and scores the
results with keyword-based proxies for relevance (precision@k, recall@k, MRR).
Aggregated and per-query metrics are written to ``tmp_eval_baseline.json``
next to this file.

Importing this module has side effects: it extends ``sys.path``, sets
environment variables and replaces ``notebooklm_clone.retrieval._rerank``
with a pass-through.
"""

import json
import os
import sys
import textwrap
import time
from pathlib import Path

try:
    from dotenv import load_dotenv

    load_dotenv(Path(__file__).resolve().parent / ".env")
except ImportError:
    # dotenv is optional: without it the process environment is used as-is.
    pass

SRC_DIR = Path(__file__).resolve().parent / "src"
sys.path.insert(0, str(SRC_DIR))

# Fall back to a local scratch directory when no data root is configured.
if not os.environ.get("NOTEBOOKLM_DATA_ROOT"):
    _root = Path(__file__).resolve().parent / "tmp_eval_data"
    _root.mkdir(exist_ok=True)
    os.environ["NOTEBOOKLM_DATA_ROOT"] = str(_root)

# Force baseline settings
os.environ["NOTEBOOKLM_QUERY_EXPANSION"] = "off"

# Project imports must come after the sys.path / environment setup above.
from ingestion.chunking import sentence_aware_chunk  # noqa: E402
from ingestion.embedder import embed_texts  # noqa: E402
from ingestion.indexer import upsert_chunks  # noqa: E402
from notebooklm_clone.notebooks import create_notebook  # noqa: E402
from notebooklm_clone import retrieval as retrieval_mod  # noqa: E402

# Monkey-patch _rerank to be a no-op (just return candidates as-is truncated to k)
# The original is kept reachable under this name; this script never calls it.
_original_rerank = retrieval_mod._rerank


def _noop_rerank(query, candidates, k):
    """Return the first *k* candidates unchanged, ignoring *query*."""
    return candidates[:k]


retrieval_mod._rerank = _noop_rerank

from notebooklm_clone.retrieval import retrieve  # noqa: E402

# Value passed as ``total_relevant`` to ``recall_at_k`` for every query.
# NOTE(review): presumably the expected number of relevant chunks per query —
# confirm against the chunking of SAMPLE_DOCUMENT.
ASSUMED_RELEVANT_PER_QUERY = 2

# Relevance is judged by case-insensitive substring match (see _keyword_hit),
# so multi-word keywords such as "great red spot" or "light energy" are kept on
# a single line here in case retrieved chunk text preserves line breaks.
SAMPLE_DOCUMENT = textwrap.dedent("""\
    The Solar System consists of the Sun and the objects that orbit it, whether they
    orbit it directly or indirectly. Of the objects that orbit the Sun directly, the
    largest are the eight planets. The four smaller inner system planets, Mercury,
    Venus, Earth, and Mars, are terrestrial planets, composed primarily of rock and
    metal. The four outer system planets are giant planets, being substantially more
    massive than the terrestrials. The two largest, Jupiter and Saturn, are gas giants,
    composed mainly of hydrogen and helium. The two outermost planets, Uranus and
    Neptune, are ice giants, composed mainly of substances with relatively high melting
    points compared with hydrogen and helium, called volatiles, such as water, ammonia,
    and methane.

    Earth is the third planet from the Sun and the only astronomical object known to
    harbor life. About 71% of Earth's surface is made up of the ocean, dwarfing Earth's
    polar ice, lakes, and rivers. The remaining 29% of Earth's surface is land,
    consisting of continents and islands.

    Mars is the fourth planet and has a thin atmosphere composed primarily of
    carbon dioxide. Mars has two small moons, Phobos and Deimos, which are thought to
    be captured asteroids. Mars is often called the "Red Planet" because iron oxide
    prevalent on its surface gives it a reddish appearance.

    Jupiter is the largest planet in the Solar System, with a mass more than two and a
    half times that of all the other planets combined. Jupiter has at least 95 known
    moons, including the four large Galilean moons discovered by Galileo Galilei in
    1610. The Great Red Spot is a persistent high-pressure region in the atmosphere of
    Jupiter, producing an anticyclonic storm that is the largest in the Solar System.
    It has been continuously observed since 1830.

    Photosynthesis is a process used by plants and other organisms to convert
    light energy, normally from the Sun, into chemical energy that can later be
    released to fuel the organisms' activities. In most cases, oxygen is also released
    as a waste product. Most plants, algae, and cyanobacteria perform photosynthesis.
    Such organisms are called photoautotrophs.

    The water cycle, also known as the hydrological cycle, describes the continuous
    movement of water within the Earth and atmosphere. Water evaporates from the
    surface of the ocean, rises into the atmosphere, cools, condenses into rain or snow
    in clouds, and falls again to the surface as precipitation. About 90% of the water
    in the atmosphere comes from the evaporation of ocean water.
""")

# Each entry pairs a query with the keywords that mark a retrieved chunk as
# relevant; "topic" is the label used in the per-query report.
EVAL_QUERIES = [
    {
        "query": "What are the inner planets of the solar system?",
        "relevant_keywords": ["mercury", "venus", "earth", "mars", "terrestrial", "inner"],
        "topic": "Inner planets",
    },
    {
        "query": "What is the Great Red Spot?",
        "relevant_keywords": ["jupiter", "great red spot", "anticyclonic", "storm", "high-pressure"],
        "topic": "Jupiter's GRS",
    },
    {
        "query": "How does photosynthesis work?",
        "relevant_keywords": ["photosynthesis", "light energy", "chemical energy", "oxygen", "plants"],
        "topic": "Photosynthesis",
    },
    {
        "query": "Describe the water cycle.",
        "relevant_keywords": ["water cycle", "hydrological", "evaporat", "precipitation", "condens"],
        "topic": "Water cycle",
    },
    {
        "query": "What is the atmosphere of Mars like?",
        "relevant_keywords": ["mars", "atmosphere", "carbon dioxide", "thin"],
        "topic": "Mars atmosphere",
    },
    {
        "query": "Which planets are gas giants?",
        "relevant_keywords": ["jupiter", "saturn", "gas giant", "hydrogen", "helium"],
        "topic": "Gas giants",
    },
    {
        "query": "What percentage of Earth's surface is ocean?",
        "relevant_keywords": ["71%", "ocean", "earth", "surface"],
        "topic": "Earth's ocean",
    },
    {
        "query": "What moons does Mars have?",
        "relevant_keywords": ["phobos", "deimos", "mars", "moons", "asteroid"],
        "topic": "Mars moons",
    },
]


def _keyword_hit(text, keywords):
    """Return True if any keyword occurs in *text*, ignoring case."""
    text_lower = text.lower()
    return any(kw.lower() in text_lower for kw in keywords)


def _hits_in_top_k(results, keywords, k):
    """Return ``(hits, considered)`` for the first *k* results.

    A non-positive *k* yields ``(0, 0)`` rather than letting a negative slice
    bound silently drop items from the end of *results*.
    """
    top_k = results[:k] if k > 0 else []
    hits = sum(1 for r in top_k if _keyword_hit(r["text"], keywords))
    return hits, len(top_k)


def precision_at_k(results, keywords, k):
    """Return the fraction of the first *k* results that contain a keyword.

    The denominator is the number of results actually available (at most *k*).
    Returns 0.0 when there are no results to consider.
    """
    hits, considered = _hits_in_top_k(results, keywords, k)
    if not considered:
        return 0.0
    return hits / considered


def recall_at_k(results, keywords, k, total_relevant):
    """Return keyword hits in the first *k* results over *total_relevant*.

    The value is capped at 1.0. When *total_relevant* is 0 the result is 1.0.
    """
    if total_relevant == 0:
        return 1.0
    hits, _ = _hits_in_top_k(results, keywords, k)
    return min(hits / total_relevant, 1.0)


def reciprocal_rank(results, keywords):
    """Return 1/rank of the first result containing a keyword, or 0.0 if none."""
    for rank, r in enumerate(results, 1):
        if _keyword_hit(r["text"], keywords):
            return 1.0 / rank
    return 0.0


def _mean(rows, key):
    """Return the average of ``row[key]`` over *rows*, or 0.0 if *rows* is empty."""
    if not rows:
        return 0.0
    return sum(row[key] for row in rows) / len(rows)


def _index_sample_document(eval_user, notebook_id):
    """Chunk, embed and index SAMPLE_DOCUMENT; return the upsert summary."""
    # OLD pipeline: sentence_aware_chunk, NO header
    # NOTE(review): 1200/200 are presumably chunk size and overlap — confirm
    # against sentence_aware_chunk's signature.
    chunks = sentence_aware_chunk(SAMPLE_DOCUMENT, 1200, 200)
    embeddings = embed_texts([c["chunk_text"] for c in chunks])
    location_hints = [
        {"start_char": c["start_char"], "end_char": c["end_char"]} for c in chunks
    ]
    return upsert_chunks(
        username=eval_user,
        notebook_id=notebook_id,
        source_id="eval_source_001",
        chunks=chunks,
        embeddings=embeddings,
        meta={"source_name": "sample_article.txt", "location_hints": location_hints},
    )


def _evaluate_queries(eval_user, notebook_id, retrieval_k):
    """Run every query in EVAL_QUERIES and return one metrics dict per query."""
    results_per_query = []
    for q in EVAL_QUERIES:
        keywords = q["relevant_keywords"]
        # Only the retrieve() call is timed; scoring is excluded from latency.
        t0 = time.perf_counter()
        results = retrieve(eval_user, notebook_id, q["query"], k=retrieval_k)
        latency = (time.perf_counter() - t0) * 1000
        results_per_query.append({
            "topic": q["topic"],
            "P@1": precision_at_k(results, keywords, 1),
            "P@3": precision_at_k(results, keywords, 3),
            "P@5": precision_at_k(results, keywords, 5),
            "MRR": reciprocal_rank(results, keywords),
            "Recall@5": recall_at_k(
                results, keywords, retrieval_k, ASSUMED_RELEVANT_PER_QUERY
            ),
            "latency_ms": latency,
        })
    return results_per_query


def _summarize(results_per_query):
    """Build the JSON-serializable report from the per-query metrics."""
    return {
        "config": "BASELINE: sentence_aware_chunk(1200/200), no headers, no reranking, no query expansion",
        "retrieval_metrics": {
            "avg_MRR": round(_mean(results_per_query, "MRR"), 4),
            "avg_P@1": round(_mean(results_per_query, "P@1"), 4),
            "avg_P@5": round(_mean(results_per_query, "P@5"), 4),
            "avg_Recall@5": round(_mean(results_per_query, "Recall@5"), 4),
            "avg_latency_ms": round(_mean(results_per_query, "latency_ms"), 1),
        },
        "per_query": results_per_query,
    }


def main():
    """Run the baseline evaluation end to end and write the JSON report.

    Creates a notebook for a throwaway user, indexes SAMPLE_DOCUMENT, scores
    every query in EVAL_QUERIES, writes ``tmp_eval_baseline.json`` next to this
    file and prints the aggregated metrics.
    """
    print("=== BASELINE EVAL (no reranking, no headers, no query expansion, sentence-aware chunks) ===\n")

    eval_user = "_eval_user_tmp"
    # The time-of-day suffix gives each run a different notebook name.
    nb_name = f"Baseline {time.strftime('%H%M%S')}"
    notebook = create_notebook(eval_user, nb_name)
    notebook_id = notebook["id"]
    print(f"Notebook: {notebook_id}")

    summary = _index_sample_document(eval_user, notebook_id)
    print(f"Indexed {summary['chunk_count']} chunks (sentence-aware, no headers)")

    retrieval_k = 5
    results_per_query = _evaluate_queries(eval_user, notebook_id, retrieval_k)
    output = _summarize(results_per_query)

    out_path = Path(__file__).resolve().parent / "tmp_eval_baseline.json"
    out_path.write_text(json.dumps(output, indent=2, default=str), encoding="utf-8")
    print(f"\nBaseline results saved to: {out_path}")
    print(json.dumps(output["retrieval_metrics"], indent=2))


if __name__ == "__main__":
    main()