# Scraped page header — Spaces: Running · File size: 8,387 Bytes · revision 182e0fa
"""Baseline RAG eval — runs the OLD pipeline: sentence-aware chunking, with no reranking, no headers, and no query expansion."""
import os, sys, json, time
from pathlib import Path
# Directory containing this script; every path below is derived from it.
_SCRIPT_DIR = Path(__file__).resolve().parent

# python-dotenv is optional: when it is installed, load a .env file that sits
# next to this script; when it is missing, carry on with the existing
# environment.
try:
    from dotenv import load_dotenv

    load_dotenv(_SCRIPT_DIR / ".env")
except ImportError:
    pass

# Put the local "src" directory first on sys.path so the project imports that
# follow resolve against it.
SRC_DIR = _SCRIPT_DIR / "src"
sys.path.insert(0, str(SRC_DIR))

# Fall back to a scratch data directory beside the script when the data root
# is unset or empty.
if not os.getenv("NOTEBOOKLM_DATA_ROOT"):
    _root = _SCRIPT_DIR / "tmp_eval_data"
    _root.mkdir(exist_ok=True)
    os.environ["NOTEBOOKLM_DATA_ROOT"] = str(_root)

# Force baseline settings
os.environ["NOTEBOOKLM_QUERY_EXPANSION"] = "off"
from ingestion.chunking import sentence_aware_chunk
from ingestion.embedder import embed_texts
from ingestion.indexer import upsert_chunks
from notebooklm_clone.notebooks import create_notebook
from notebooklm_clone import retrieval as retrieval_mod
# Baseline runs must not rerank: replace the retrieval module's ``_rerank``
# with a pass-through. The original callable is kept in ``_original_rerank``;
# nothing in this script restores it.
_original_rerank = retrieval_mod._rerank


def _noop_rerank(query, candidates, k):
    """Return the first ``k`` candidates in their existing order.

    ``query`` is accepted only to match the signature of the function being
    replaced; it is not used.
    """
    head = candidates[:k]
    return head


retrieval_mod._rerank = _noop_rerank
from notebooklm_clone.retrieval import retrieve
import textwrap
# Fixed evaluation corpus: short passages on the Solar System, Earth, Mars,
# Jupiter, photosynthesis, and the water cycle. main() chunks, embeds, and
# indexes this text, and the EVAL_QUERIES keywords are matched against the
# retrieved chunks, so the wording must stay exactly as written.
SAMPLE_DOCUMENT = textwrap.dedent("""\
The Solar System consists of the Sun and the objects that orbit it, whether
they orbit it directly or indirectly. Of the objects that orbit the Sun
directly, the largest are the eight planets. The four smaller inner system
planets, Mercury, Venus, Earth, and Mars, are terrestrial planets, composed
primarily of rock and metal. The four outer system planets are giant planets,
being substantially more massive than the terrestrials. The two largest,
Jupiter and Saturn, are gas giants, composed mainly of hydrogen and helium.
The two outermost planets, Uranus and Neptune, are ice giants, composed
mainly of substances with relatively high melting points compared with
hydrogen and helium, called volatiles, such as water, ammonia, and methane.
Earth is the third planet from the Sun and the only astronomical object
known to harbor life. About 71% of Earth's surface is made up of the
ocean, dwarfing Earth's polar ice, lakes, and rivers. The remaining 29%
of Earth's surface is land, consisting of continents and islands.
Mars is the fourth planet and has a thin atmosphere composed primarily of
carbon dioxide. Mars has two small moons, Phobos and Deimos, which are
thought to be captured asteroids. Mars is often called the "Red Planet"
because iron oxide prevalent on its surface gives it a reddish appearance.
Jupiter is the largest planet in the Solar System, with a mass more than
two and a half times that of all the other planets combined. Jupiter has
at least 95 known moons, including the four large Galilean moons discovered
by Galileo Galilei in 1610. The Great Red Spot is a persistent high-pressure
region in the atmosphere of Jupiter, producing an anticyclonic storm that is
the largest in the Solar System. It has been continuously observed since 1830.
Photosynthesis is a process used by plants and other organisms to convert
light energy, normally from the Sun, into chemical energy that can later be
released to fuel the organisms' activities. In most cases, oxygen is also
released as a waste product. Most plants, algae, and cyanobacteria perform
photosynthesis. Such organisms are called photoautotrophs.
The water cycle, also known as the hydrological cycle, describes the
continuous movement of water within the Earth and atmosphere. Water
evaporates from the surface of the ocean, rises into the atmosphere,
cools, condenses into rain or snow in clouds, and falls again to the
surface as precipitation. About 90% of the water in the atmosphere comes
from the evaporation of ocean water.
""")
# Evaluation cases. Each entry carries the question sent to the retriever
# ("query"), the substrings that mark a retrieved chunk as relevant
# ("relevant_keywords", matched case-insensitively), and a short label used
# in the per-query report ("topic").
EVAL_QUERIES = [
    {
        "query": "What are the inner planets of the solar system?",
        "relevant_keywords": [
            "mercury", "venus", "earth", "mars", "terrestrial", "inner",
        ],
        "topic": "Inner planets",
    },
    {
        "query": "What is the Great Red Spot?",
        "relevant_keywords": [
            "jupiter", "great red spot", "anticyclonic", "storm",
            "high-pressure",
        ],
        "topic": "Jupiter's GRS",
    },
    {
        "query": "How does photosynthesis work?",
        "relevant_keywords": [
            "photosynthesis", "light energy", "chemical energy", "oxygen",
            "plants",
        ],
        "topic": "Photosynthesis",
    },
    {
        "query": "Describe the water cycle.",
        "relevant_keywords": [
            "water cycle", "hydrological", "evaporat", "precipitation",
            "condens",
        ],
        "topic": "Water cycle",
    },
    {
        "query": "What is the atmosphere of Mars like?",
        "relevant_keywords": ["mars", "atmosphere", "carbon dioxide", "thin"],
        "topic": "Mars atmosphere",
    },
    {
        "query": "Which planets are gas giants?",
        "relevant_keywords": [
            "jupiter", "saturn", "gas giant", "hydrogen", "helium",
        ],
        "topic": "Gas giants",
    },
    {
        "query": "What percentage of Earth's surface is ocean?",
        "relevant_keywords": ["71%", "ocean", "earth", "surface"],
        "topic": "Earth's ocean",
    },
    {
        "query": "What moons does Mars have?",
        "relevant_keywords": ["phobos", "deimos", "mars", "moons", "asteroid"],
        "topic": "Mars moons",
    },
]
def _keyword_hit(text, keywords):
    """Return True if ``text`` contains at least one keyword, ignoring case.

    Matching is a plain substring test on the lowercased text, so a keyword
    such as ``"evaporat"`` matches both "evaporates" and "evaporation".

    Empty keywords are skipped: the empty string is a substring of every
    string, so a single ``""`` entry would otherwise mark every text as a
    hit and inflate every metric built on this helper.

    Args:
        text: The text to search.
        keywords: Iterable of keyword strings.

    Returns:
        True when any non-empty keyword occurs in ``text``; False otherwise,
        including when ``keywords`` is empty.
    """
    haystack = text.lower()
    return any(kw and kw.lower() in haystack for kw in keywords)
def precision_at_k(results, keywords, k):
    """Return the fraction of the top ``k`` results that contain a keyword.

    The denominator is the number of results actually inspected, i.e.
    ``min(k, len(results))``, not ``k`` itself, so a retriever that returns
    fewer than ``k`` results is not penalised for the shortfall.

    Args:
        results: Ranked sequence of dicts, each with a ``"text"`` entry.
        keywords: Keywords that mark a result as relevant.
        k: Cut-off rank. Values of zero or below yield 0.0; without this
            guard a negative ``k`` would slice from the end of ``results``
            and score the wrong set.

    Returns:
        A float in ``[0.0, 1.0]``; 0.0 when there is nothing to score.
    """
    if k <= 0:
        return 0.0
    top_k = results[:k]
    if not top_k:
        return 0.0
    hits = sum(1 for r in top_k if _keyword_hit(r["text"], keywords))
    return hits / len(top_k)
def recall_at_k(results, keywords, k, total_relevant):
    """Return keyword-based recall over the top ``k`` results, capped at 1.0.

    Recall is the number of top-``k`` results containing a keyword divided by
    ``total_relevant``. The cap is needed because ``total_relevant`` is
    supplied by the caller and may be smaller than the number of hits.

    Args:
        results: Ranked sequence of dicts, each with a ``"text"`` entry.
        keywords: Keywords that mark a result as relevant.
        k: Cut-off rank. Values of zero or below yield 0.0 rather than
            slicing from the end of ``results``.
        total_relevant: Number of relevant items assumed to exist. Zero or a
            negative value is treated as "nothing to find" and yields 1.0,
            which keeps the result inside ``[0.0, 1.0]``.

    Returns:
        A float in ``[0.0, 1.0]``.
    """
    if total_relevant <= 0:
        return 1.0
    if k <= 0:
        return 0.0
    hits = sum(1 for r in results[:k] if _keyword_hit(r["text"], keywords))
    return min(hits / total_relevant, 1.0)
def reciprocal_rank(results, keywords):
    """Return ``1 / rank`` of the first result containing a keyword.

    Ranks are 1-based and follow the order of ``results``. Returns 0.0 when
    no result matches or ``results`` is empty.
    """
    first_hit_rank = next(
        (
            rank
            for rank, item in enumerate(results, start=1)
            if _keyword_hit(item["text"], keywords)
        ),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank
def main():
    """Run the baseline retrieval evaluation and save a JSON report.

    Steps, in order:
      1. Create a throwaway notebook for a fixed evaluation user.
      2. Chunk ``SAMPLE_DOCUMENT`` with ``sentence_aware_chunk``, embed the
         chunk texts, and index them under a single source id.
      3. Run every ``EVAL_QUERIES`` entry through ``retrieve`` and score the
         results with the keyword-based metrics, timing each call.
      4. Average the metrics, write the report to ``tmp_eval_baseline.json``
         next to this script, and print the averaged metrics.
    """
    print("=== BASELINE EVAL (no reranking, no headers, no query expansion, sentence-aware chunks) ===\n")

    user = "_eval_user_tmp"
    # The time-of-day suffix keeps notebook names from repeated runs distinct.
    notebook = create_notebook(user, f"Baseline {time.strftime('%H%M%S')}")
    notebook_id = notebook["id"]
    print(f"Notebook: {notebook_id}")

    # OLD pipeline: sentence_aware_chunk, NO header.
    # 1200 / 200 are presumably chunk size and overlap (the report labels the
    # run "1200/200") — confirm against ingestion.chunking.
    chunks = sentence_aware_chunk(SAMPLE_DOCUMENT, 1200, 200)
    chunk_texts = [chunk["chunk_text"] for chunk in chunks]
    embeddings = embed_texts(chunk_texts)
    location_hints = [
        {"start_char": chunk["start_char"], "end_char": chunk["end_char"]}
        for chunk in chunks
    ]
    index_meta = {
        "source_name": "sample_article.txt",
        "location_hints": location_hints,
    }
    summary = upsert_chunks(
        username=user,
        notebook_id=notebook_id,
        source_id="eval_source_001",
        chunks=chunks,
        embeddings=embeddings,
        meta=index_meta,
    )
    print(f"Indexed {summary['chunk_count']} chunks (sentence-aware, no headers)")

    retrieval_k = 5
    per_query = []
    for case in EVAL_QUERIES:
        started = time.perf_counter()
        hits = retrieve(user, notebook_id, case["query"], k=retrieval_k)
        elapsed_ms = (time.perf_counter() - started) * 1000

        keywords = case["relevant_keywords"]
        row = {
            "topic": case["topic"],
            "P@1": precision_at_k(hits, keywords, 1),
            "P@3": precision_at_k(hits, keywords, 3),
            "P@5": precision_at_k(hits, keywords, 5),
            "MRR": reciprocal_rank(hits, keywords),
            # Recall assumes exactly 2 relevant chunks exist for every query.
            "Recall@5": recall_at_k(hits, keywords, retrieval_k, 2),
            "latency_ms": elapsed_ms,
        }
        per_query.append(row)

    def mean_of(metric):
        """Average one per-query metric across all evaluated queries."""
        total = sum(row[metric] for row in per_query)
        return total / len(per_query)

    retrieval_metrics = {
        "avg_MRR": round(mean_of("MRR"), 4),
        "avg_P@1": round(mean_of("P@1"), 4),
        "avg_P@5": round(mean_of("P@5"), 4),
        "avg_Recall@5": round(mean_of("Recall@5"), 4),
        "avg_latency_ms": round(mean_of("latency_ms"), 1),
    }
    output = {
        "config": "BASELINE: sentence_aware_chunk(1200/200), no headers, no reranking, no query expansion",
        "retrieval_metrics": retrieval_metrics,
        "per_query": per_query,
    }

    out_path = Path(__file__).resolve().parent / "tmp_eval_baseline.json"
    report = json.dumps(output, indent=2, default=str)
    out_path.write_text(report, encoding="utf-8")
    print(f"\nBaseline results saved to: {out_path}")
    print(json.dumps(output["retrieval_metrics"], indent=2))


if __name__ == "__main__":
    main()
|