Spaces:

abiju
/

notebook_lm_clone

Running

notebook_lm_clone / tmp_eval_baseline.py

Abhinav Biju

Deploying RAG pipeline changes (excluding binary data)

182e0fa 3 months ago

8.39 kB

	"""Baseline RAG eval — runs with OLD pipeline (no reranking, no headers, no query expansion, sentence-aware chunking)."""
	import os, sys, json, time
	from pathlib import Path

	# Optional convenience: when python-dotenv is installed, load variables from
	# a ``.env`` file sitting next to this script. A missing package is not an
	# error. Only the import is guarded, so an ImportError raised while actually
	# loading the file is not silently swallowed.
	try:
	    from dotenv import load_dotenv
	except ImportError:
	    pass
	else:
	    load_dotenv(Path(__file__).resolve().parent / ".env")

	# Put the sibling ``src`` directory at the front of the import path so the
	# project packages imported below resolve from this checkout.
	SRC_DIR = Path(__file__).resolve().parent.joinpath("src")
	sys.path[:0] = [str(SRC_DIR)]

	# When no data root is configured (unset or empty), fall back to a
	# ``tmp_eval_data`` directory beside this script, creating it if needed.
	if not os.environ.get("NOTEBOOKLM_DATA_ROOT"):
	    _root = SRC_DIR.parent / "tmp_eval_data"
	    _root.mkdir(exist_ok=True)
	    os.environ.update({"NOTEBOOKLM_DATA_ROOT": str(_root)})

	# Force baseline settings: query expansion is always switched off here,
	# overriding whatever the environment said.
	os.environ.update({"NOTEBOOKLM_QUERY_EXPANSION": "off"})

	from ingestion.chunking import sentence_aware_chunk
	from ingestion.embedder import embed_texts
	from ingestion.indexer import upsert_chunks
	from notebooklm_clone.notebooks import create_notebook
	from notebooklm_clone import retrieval as retrieval_mod

	# The baseline must not rerank. Replace the retrieval module's ``_rerank``
	# with a pass-through that only truncates; the replaced callable stays
	# reachable as ``_original_rerank``.
	def _noop_rerank(query, candidates, k):
	    """Return the first ``k`` candidates in their given order; ``query`` is ignored."""
	    head = candidates[:k]
	    return head


	_original_rerank = retrieval_mod._rerank
	setattr(retrieval_mod, "_rerank", _noop_rerank)

	from notebooklm_clone.retrieval import retrieve

	import textwrap

	# Evaluation corpus: six short paragraphs (Solar System overview, Earth,
	# Mars, Jupiter, photosynthesis, water cycle). main() chunks and indexes this
	# text, and the keyword lists in EVAL_QUERIES are matched against the
	# retrieved chunk text. textwrap.dedent strips the common leading whitespace.
	SAMPLE_DOCUMENT = textwrap.dedent("""\
	The Solar System consists of the Sun and the objects that orbit it, whether
	they orbit it directly or indirectly. Of the objects that orbit the Sun
	directly, the largest are the eight planets. The four smaller inner system
	planets, Mercury, Venus, Earth, and Mars, are terrestrial planets, composed
	primarily of rock and metal. The four outer system planets are giant planets,
	being substantially more massive than the terrestrials. The two largest,
	Jupiter and Saturn, are gas giants, composed mainly of hydrogen and helium.
	The two outermost planets, Uranus and Neptune, are ice giants, composed
	mainly of substances with relatively high melting points compared with
	hydrogen and helium, called volatiles, such as water, ammonia, and methane.

	Earth is the third planet from the Sun and the only astronomical object
	known to harbor life. About 71% of Earth's surface is made up of the
	ocean, dwarfing Earth's polar ice, lakes, and rivers. The remaining 29%
	of Earth's surface is land, consisting of continents and islands.

	Mars is the fourth planet and has a thin atmosphere composed primarily of
	carbon dioxide. Mars has two small moons, Phobos and Deimos, which are
	thought to be captured asteroids. Mars is often called the "Red Planet"
	because iron oxide prevalent on its surface gives it a reddish appearance.

	Jupiter is the largest planet in the Solar System, with a mass more than
	two and a half times that of all the other planets combined. Jupiter has
	at least 95 known moons, including the four large Galilean moons discovered
	by Galileo Galilei in 1610. The Great Red Spot is a persistent high-pressure
	region in the atmosphere of Jupiter, producing an anticyclonic storm that is
	the largest in the Solar System. It has been continuously observed since 1830.

	Photosynthesis is a process used by plants and other organisms to convert
	light energy, normally from the Sun, into chemical energy that can later be
	released to fuel the organisms' activities. In most cases, oxygen is also
	released as a waste product. Most plants, algae, and cyanobacteria perform
	photosynthesis. Such organisms are called photoautotrophs.

	The water cycle, also known as the hydrological cycle, describes the
	continuous movement of water within the Earth and atmosphere. Water
	evaporates from the surface of the ocean, rises into the atmosphere,
	cools, condenses into rain or snow in clouds, and falls again to the
	surface as precipitation. About 90% of the water in the atmosphere comes
	from the evaporation of ocean water.
	""")

	# Evaluation set. Each entry holds the query sent to retrieval, the
	# substrings that mark a retrieved chunk as relevant (matched
	# case-insensitively), and a short topic label used in the per-query report.
	EVAL_QUERIES = [
	    {
	        "query": "What are the inner planets of the solar system?",
	        "relevant_keywords": ["mercury", "venus", "earth", "mars", "terrestrial", "inner"],
	        "topic": "Inner planets",
	    },
	    {
	        "query": "What is the Great Red Spot?",
	        "relevant_keywords": ["jupiter", "great red spot", "anticyclonic", "storm", "high-pressure"],
	        "topic": "Jupiter's GRS",
	    },
	    {
	        "query": "How does photosynthesis work?",
	        "relevant_keywords": ["photosynthesis", "light energy", "chemical energy", "oxygen", "plants"],
	        "topic": "Photosynthesis",
	    },
	    {
	        "query": "Describe the water cycle.",
	        "relevant_keywords": ["water cycle", "hydrological", "evaporat", "precipitation", "condens"],
	        "topic": "Water cycle",
	    },
	    {
	        "query": "What is the atmosphere of Mars like?",
	        "relevant_keywords": ["mars", "atmosphere", "carbon dioxide", "thin"],
	        "topic": "Mars atmosphere",
	    },
	    {
	        "query": "Which planets are gas giants?",
	        "relevant_keywords": ["jupiter", "saturn", "gas giant", "hydrogen", "helium"],
	        "topic": "Gas giants",
	    },
	    {
	        "query": "What percentage of Earth's surface is ocean?",
	        "relevant_keywords": ["71%", "ocean", "earth", "surface"],
	        "topic": "Earth's ocean",
	    },
	    {
	        "query": "What moons does Mars have?",
	        "relevant_keywords": ["phobos", "deimos", "mars", "moons", "asteroid"],
	        "topic": "Mars moons",
	    },
	]

	def _keyword_hit(text, keywords):
	text_lower = text.lower()
	return any(kw.lower() in text_lower for kw in keywords)

	def precision_at_k(results, keywords, k):
	    """Share of the first ``k`` results whose ``"text"`` contains a keyword.

	    The denominator is the number of results actually present in the top-``k``
	    slice (which may be fewer than ``k``). Returns 0.0 when that slice is empty.
	    """
	    window = results[:k]
	    if not window:
	        return 0.0
	    hits = 0
	    for item in window:
	        if _keyword_hit(item["text"], keywords):
	            hits += 1
	    return hits / len(window)

	def recall_at_k(results, keywords, k, total_relevant):
	    """Keyword-matching results in the top ``k`` divided by ``total_relevant``.

	    The ratio is capped at 1.0. When ``total_relevant`` is 0 the function
	    returns 1.0 without looking at ``results``.
	    """
	    if total_relevant == 0:
	        return 1.0
	    matched = [item for item in results[:k] if _keyword_hit(item["text"], keywords)]
	    ratio = len(matched) / total_relevant
	    return min(ratio, 1.0)

	def reciprocal_rank(results, keywords):
	    """Return ``1 / rank`` of the first keyword-matching result (1-based), or 0.0 if none match."""
	    matching_ranks = (
	        rank
	        for rank, item in enumerate(results, start=1)
	        if _keyword_hit(item["text"], keywords)
	    )
	    first = next(matching_ranks, None)
	    return 0.0 if first is None else 1.0 / first

	def _index_sample_document(username, notebook_id):
	    """Chunk, embed and index SAMPLE_DOCUMENT for the notebook; return the upsert summary."""
	    # OLD pipeline: sentence_aware_chunk, NO header.
	    # 1200/200 are passed positionally; presumably chunk size and overlap —
	    # confirm against ingestion.chunking.
	    chunks = sentence_aware_chunk(SAMPLE_DOCUMENT, 1200, 200)
	    embeddings = embed_texts([c["chunk_text"] for c in chunks])
	    location_hints = [
	        {"start_char": c["start_char"], "end_char": c["end_char"]} for c in chunks
	    ]
	    return upsert_chunks(
	        username=username,
	        notebook_id=notebook_id,
	        source_id="eval_source_001",
	        chunks=chunks,
	        embeddings=embeddings,
	        meta={"source_name": "sample_article.txt", "location_hints": location_hints},
	    )


	def _score_query(username, notebook_id, spec, retrieval_k):
	    """Run one EVAL_QUERIES entry through retrieve() and return its metric row."""
	    started = time.perf_counter()
	    results = retrieve(username, notebook_id, spec["query"], k=retrieval_k)
	    latency_ms = (time.perf_counter() - started) * 1000
	    keywords = spec["relevant_keywords"]
	    return {
	        "topic": spec["topic"],
	        "P@1": precision_at_k(results, keywords, 1),
	        "P@3": precision_at_k(results, keywords, 3),
	        "P@5": precision_at_k(results, keywords, 5),
	        "MRR": reciprocal_rank(results, keywords),
	        # NOTE(review): total_relevant is fixed at 2 for every query —
	        # presumably an estimate of relevant chunks per topic; confirm.
	        "Recall@5": recall_at_k(results, keywords, retrieval_k, 2),
	        "latency_ms": latency_ms,
	    }


	def _mean(rows, key):
	    """Average ``row[key]`` over ``rows``; 0.0 for an empty list rather than ZeroDivisionError."""
	    if not rows:
	        return 0.0
	    return sum(row[key] for row in rows) / len(rows)


	def main():
	    """Run the baseline retrieval evaluation end to end.

	    Creates a notebook, indexes SAMPLE_DOCUMENT into it, runs every entry of
	    EVAL_QUERIES through ``retrieve`` with k=5, and computes P@1/P@3/P@5, MRR,
	    Recall@5 and latency per query. The averaged metrics plus the per-query
	    rows are written to ``tmp_eval_baseline.json`` next to this script, and
	    the averaged metrics are printed.

	    Returns:
	        None.
	    """
	    print("=== BASELINE EVAL (no reranking, no headers, no query expansion, sentence-aware chunks) ===\n")

	    eval_user = "_eval_user_tmp"
	    # Time-stamped name so each run gets a distinct notebook name.
	    nb_name = f"Baseline {time.strftime('%H%M%S')}"
	    notebook = create_notebook(eval_user, nb_name)
	    notebook_id = notebook["id"]
	    print(f"Notebook: {notebook_id}")

	    summary = _index_sample_document(eval_user, notebook_id)
	    print(f"Indexed {summary['chunk_count']} chunks (sentence-aware, no headers)")

	    retrieval_k = 5
	    results_per_query = [
	        _score_query(eval_user, notebook_id, q, retrieval_k) for q in EVAL_QUERIES
	    ]

	    output = {
	        "config": "BASELINE: sentence_aware_chunk(1200/200), no headers, no reranking, no query expansion",
	        "retrieval_metrics": {
	            "avg_MRR": round(_mean(results_per_query, "MRR"), 4),
	            "avg_P@1": round(_mean(results_per_query, "P@1"), 4),
	            "avg_P@5": round(_mean(results_per_query, "P@5"), 4),
	            "avg_Recall@5": round(_mean(results_per_query, "Recall@5"), 4),
	            "avg_latency_ms": round(_mean(results_per_query, "latency_ms"), 1),
	        },
	        "per_query": results_per_query,
	    }

	    out_path = Path(__file__).resolve().parent / "tmp_eval_baseline.json"
	    out_path.write_text(json.dumps(output, indent=2, default=str), encoding="utf-8")
	    print(f"\nBaseline results saved to: {out_path}")
	    print(json.dumps(output["retrieval_metrics"], indent=2))

	# Run the evaluation only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()