File size: 8,387 Bytes
182e0fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""Baseline RAG eval — runs with OLD pipeline (no reranking, no headers, no query expansion, sentence-aware chunking)."""
import os, sys, json, time
from pathlib import Path

# Load environment variables from a ".env" file located next to this script.
# python-dotenv is treated as optional: if it is not installed, the script
# continues with whatever is already in the process environment.
try:
    from dotenv import load_dotenv
    load_dotenv(Path(__file__).resolve().parent / ".env")
except ImportError:
    pass

# Put the sibling "src" directory first on sys.path so the project imports
# further down (ingestion.*, notebooklm_clone.*) can be resolved from it.
SRC_DIR = Path(__file__).resolve().parent / "src"
sys.path.insert(0, str(SRC_DIR))

# When no data root is configured (unset or empty), fall back to a
# "tmp_eval_data" directory beside this script, creating it if needed.
if not os.environ.get("NOTEBOOKLM_DATA_ROOT"):
    _root = Path(__file__).resolve().parent / "tmp_eval_data"
    _root.mkdir(exist_ok=True)
    os.environ["NOTEBOOKLM_DATA_ROOT"] = str(_root)

# Force baseline settings
# NOTE(review): this is set before the project modules are imported below;
# presumably they read the variable at import or call time — confirm.
os.environ["NOTEBOOKLM_QUERY_EXPANSION"] = "off"

from ingestion.chunking import sentence_aware_chunk
from ingestion.embedder import embed_texts
from ingestion.indexer import upsert_chunks
from notebooklm_clone.notebooks import create_notebook
from notebooklm_clone import retrieval as retrieval_mod

# Monkey-patch _rerank to be a no-op (just return candidates as-is truncated to k)
# The original callable is kept in _original_rerank; nothing in this script
# restores it afterwards.
_original_rerank = retrieval_mod._rerank
def _noop_rerank(query, candidates, k):
    """Return the first ``k`` candidates in their existing order; ``query`` is ignored."""
    return candidates[:k]
# NOTE(review): the patch only takes effect if the retrieval code looks up
# `_rerank` through the module's globals at call time and calls it with this
# (query, candidates, k) signature — confirm against notebooklm_clone.retrieval.
retrieval_mod._rerank = _noop_rerank

from notebooklm_clone.retrieval import retrieve

import textwrap

# Fixture text that main() chunks, embeds and indexes before running the
# evaluation queries. It has six paragraphs: Solar System overview, Earth,
# Mars, Jupiter, photosynthesis and the water cycle. textwrap.dedent removes
# the common leading indentation; the trailing backslash after the opening
# quotes keeps the string from starting with an empty line.
SAMPLE_DOCUMENT = textwrap.dedent("""\
    The Solar System consists of the Sun and the objects that orbit it, whether
    they orbit it directly or indirectly. Of the objects that orbit the Sun
    directly, the largest are the eight planets. The four smaller inner system
    planets, Mercury, Venus, Earth, and Mars, are terrestrial planets, composed
    primarily of rock and metal. The four outer system planets are giant planets,
    being substantially more massive than the terrestrials. The two largest,
    Jupiter and Saturn, are gas giants, composed mainly of hydrogen and helium.
    The two outermost planets, Uranus and Neptune, are ice giants, composed
    mainly of substances with relatively high melting points compared with
    hydrogen and helium, called volatiles, such as water, ammonia, and methane.

    Earth is the third planet from the Sun and the only astronomical object
    known to harbor life. About 71% of Earth's surface is made up of the
    ocean, dwarfing Earth's polar ice, lakes, and rivers. The remaining 29%
    of Earth's surface is land, consisting of continents and islands.

    Mars is the fourth planet and has a thin atmosphere composed primarily of
    carbon dioxide. Mars has two small moons, Phobos and Deimos, which are
    thought to be captured asteroids. Mars is often called the "Red Planet"
    because iron oxide prevalent on its surface gives it a reddish appearance.

    Jupiter is the largest planet in the Solar System, with a mass more than
    two and a half times that of all the other planets combined. Jupiter has
    at least 95 known moons, including the four large Galilean moons discovered
    by Galileo Galilei in 1610. The Great Red Spot is a persistent high-pressure
    region in the atmosphere of Jupiter, producing an anticyclonic storm that is
    the largest in the Solar System. It has been continuously observed since 1830.

    Photosynthesis is a process used by plants and other organisms to convert
    light energy, normally from the Sun, into chemical energy that can later be
    released to fuel the organisms' activities. In most cases, oxygen is also
    released as a waste product. Most plants, algae, and cyanobacteria perform
    photosynthesis. Such organisms are called photoautotrophs.

    The water cycle, also known as the hydrological cycle, describes the
    continuous movement of water within the Earth and atmosphere. Water
    evaporates from the surface of the ocean, rises into the atmosphere,
    cools, condenses into rain or snow in clouds, and falls again to the
    surface as precipitation. About 90% of the water in the atmosphere comes
    from the evaporation of ocean water.
""")

# Evaluation set. Each entry holds:
#   "query"             - the question passed to the retriever,
#   "relevant_keywords" - substrings used to judge relevance: a retrieved chunk
#                         counts as a hit when its text contains any of them,
#                         compared case-insensitively (partial stems such as
#                         "evaporat" therefore match several word forms),
#   "topic"             - a short label copied into the per-query report.
EVAL_QUERIES = [
    {
        "query": "What are the inner planets of the solar system?",
        "relevant_keywords": ["mercury", "venus", "earth", "mars", "terrestrial", "inner"],
        "topic": "Inner planets",
    },
    {
        "query": "What is the Great Red Spot?",
        "relevant_keywords": ["jupiter", "great red spot", "anticyclonic", "storm", "high-pressure"],
        "topic": "Jupiter's GRS",
    },
    {
        "query": "How does photosynthesis work?",
        "relevant_keywords": ["photosynthesis", "light energy", "chemical energy", "oxygen", "plants"],
        "topic": "Photosynthesis",
    },
    {
        "query": "Describe the water cycle.",
        "relevant_keywords": ["water cycle", "hydrological", "evaporat", "precipitation", "condens"],
        "topic": "Water cycle",
    },
    {
        "query": "What is the atmosphere of Mars like?",
        "relevant_keywords": ["mars", "atmosphere", "carbon dioxide", "thin"],
        "topic": "Mars atmosphere",
    },
    {
        "query": "Which planets are gas giants?",
        "relevant_keywords": ["jupiter", "saturn", "gas giant", "hydrogen", "helium"],
        "topic": "Gas giants",
    },
    {
        "query": "What percentage of Earth's surface is ocean?",
        "relevant_keywords": ["71%", "ocean", "earth", "surface"],
        "topic": "Earth's ocean",
    },
    {
        "query": "What moons does Mars have?",
        "relevant_keywords": ["phobos", "deimos", "mars", "moons", "asteroid"],
        "topic": "Mars moons",
    },
]

def _keyword_hit(text, keywords):
    text_lower = text.lower()
    return any(kw.lower() in text_lower for kw in keywords)

def precision_at_k(results, keywords, k):
    """Return the share of the first ``k`` results that match a keyword.

    A result matches when ``_keyword_hit`` accepts its ``"text"`` field. The
    denominator is the number of results actually present in the top-``k``
    slice, so a shorter result list is not penalised. Returns 0.0 when that
    slice is empty.
    """
    window = results[:k]
    if not window:
        return 0.0
    matched = 0
    for item in window:
        if _keyword_hit(item["text"], keywords):
            matched += 1
    return matched / len(window)

def recall_at_k(results, keywords, k, total_relevant):
    """Return matched results in the top ``k`` divided by ``total_relevant``.

    A result matches when ``_keyword_hit`` accepts its ``"text"`` field. The
    ratio is capped at 1.0. When ``total_relevant`` is 0 the function returns
    1.0 without inspecting the results.
    """
    if total_relevant == 0:
        return 1.0
    matched = len([item for item in results[:k] if _keyword_hit(item["text"], keywords)])
    ratio = matched / total_relevant
    return min(ratio, 1.0)

def reciprocal_rank(results, keywords):
    """Return 1/rank of the first result that matches a keyword.

    Ranks are 1-based and follow the order of ``results``. A result matches
    when ``_keyword_hit`` accepts its ``"text"`` field. Returns 0.0 when no
    result matches.
    """
    scores = (
        1.0 / position
        for position, item in enumerate(results, start=1)
        if _keyword_hit(item["text"], keywords)
    )
    # The generator is lazy, so scanning stops at the first matching result.
    return next(scores, 0.0)

def main():
    """Run the baseline retrieval evaluation and save the metrics.

    Steps, in order: create a notebook for a fixed evaluation user, split
    ``SAMPLE_DOCUMENT`` with ``sentence_aware_chunk``, embed and index the
    chunks, send every entry of ``EVAL_QUERIES`` through ``retrieve`` while
    timing the call, score the results with the keyword-based metrics, then
    write the full report to ``tmp_eval_baseline.json`` beside this script
    and print the aggregate metrics.
    """
    print("=== BASELINE EVAL (no reranking, no headers, no query expansion, sentence-aware chunks) ===\n")

    user = "_eval_user_tmp"
    # The notebook name carries the current wall-clock time as HHMMSS.
    stamp = time.strftime('%H%M%S')
    created = create_notebook(user, f"Baseline {stamp}")
    nb_id = created["id"]
    print(f"Notebook: {nb_id}")

    # OLD pipeline: sentence_aware_chunk, NO header
    pieces = sentence_aware_chunk(SAMPLE_DOCUMENT, 1200, 200)
    texts = [piece["chunk_text"] for piece in pieces]
    vectors = embed_texts(texts)
    hints = []
    for piece in pieces:
        hints.append({"start_char": piece["start_char"], "end_char": piece["end_char"]})
    meta = {"source_name": "sample_article.txt", "location_hints": hints}
    summary = upsert_chunks(
        username=user,
        notebook_id=nb_id,
        source_id="eval_source_001",
        chunks=pieces,
        embeddings=vectors,
        meta=meta,
    )
    print(f"Indexed {summary['chunk_count']} chunks (sentence-aware, no headers)")

    top_k = 5
    rows = []
    for spec in EVAL_QUERIES:
        # Only the retrieve() call is timed; scoring happens afterwards.
        started = time.perf_counter()
        hits = retrieve(user, nb_id, spec["query"], k=top_k)
        elapsed_ms = (time.perf_counter() - started) * 1000

        keywords = spec["relevant_keywords"]
        row = {
            "topic": spec["topic"],
            "P@1": precision_at_k(hits, keywords, 1),
            "P@3": precision_at_k(hits, keywords, 3),
            "P@5": precision_at_k(hits, keywords, 5),
            "MRR": reciprocal_rank(hits, keywords),
            # NOTE(review): recall uses a fixed total of 2 relevant chunks for
            # every query rather than a per-query count — confirm intended.
            "Recall@5": recall_at_k(hits, keywords, top_k, 2),
            "latency_ms": elapsed_ms,
        }
        rows.append(row)

    def mean_of(metric):
        """Arithmetic mean of one metric column across all evaluated queries."""
        total = sum(row[metric] for row in rows)
        return total / len(rows)

    aggregate = {
        "avg_MRR": round(mean_of("MRR"), 4),
        "avg_P@1": round(mean_of("P@1"), 4),
        "avg_P@5": round(mean_of("P@5"), 4),
        "avg_Recall@5": round(mean_of("Recall@5"), 4),
        "avg_latency_ms": round(mean_of("latency_ms"), 1),
    }
    report = {
        "config": "BASELINE: sentence_aware_chunk(1200/200), no headers, no reranking, no query expansion",
        "retrieval_metrics": aggregate,
        "per_query": rows,
    }

    script_dir = Path(__file__).resolve().parent
    out_path = script_dir / "tmp_eval_baseline.json"
    serialized = json.dumps(report, indent=2, default=str)
    out_path.write_text(serialized, encoding="utf-8")
    print(f"\nBaseline results saved to: {out_path}")
    print(json.dumps(report["retrieval_metrics"], indent=2))

# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()