JackSparrow89 committed on
Commit
1bbc850
·
verified ·
1 Parent(s): 4f7687e

Update searcher/sparse_retriever.py

Browse files
Files changed (1) hide show
  1. searcher/sparse_retriever.py +127 -117
searcher/sparse_retriever.py CHANGED
@@ -1,117 +1,127 @@
1
- # searcher/sparse_retriever.py
2
-
3
- import sqlite3
4
- import math
5
- import yaml
6
- from collections import defaultdict
7
-
8
-
9
- class SparseRetriever:
10
- """
11
- BM25 (Okapi BM25) lexical retrieval over the SQLite chunk store.
12
-
13
- Why BM25 alongside semantic search?
14
- - Dense retrieval can miss exact keyword matches (product codes, names, IDs)
15
- - BM25 is great for rare/specific terms that embeddings smooth over
16
- - Hybrid = best of both worlds
17
-
18
- BM25 formula:
19
- score(q, d) = Σ IDF(t) × (tf × (k1+1)) / (tf + k1 × (1 - b + b × dl/avgdl))
20
- """
21
-
22
- def __init__(self, config_path="config.yaml"):
23
- with open(config_path) as f:
24
- config = yaml.safe_load(f)
25
-
26
- self.db_path = f"{config['data_dir']}/metadata.db"
27
- self.k1 = 1.5 # term frequency saturation
28
- self.b = 0.75 # length normalisation
29
-
30
- # Build in-memory BM25 index from SQLite on startup
31
- self._corpus = [] # list of (chunk_id, token_list)
32
- self._avgdl = 0.0
33
- self._df = defaultdict(int) # term doc frequency
34
- self._build_index()
35
-
36
- def _build_index(self):
37
- """Load all chunks from SQLite and compute BM25 statistics."""
38
- conn = sqlite3.connect(self.db_path)
39
- rows = conn.execute("SELECT id, chunk_text FROM chunks").fetchall()
40
- conn.close()
41
-
42
- total_len = 0
43
- for chunk_id, text in rows:
44
- tokens = text.lower().split()
45
- self._corpus.append((chunk_id, tokens))
46
- total_len += len(tokens)
47
- for token in set(tokens):
48
- self._df[token] += 1
49
-
50
- self._avgdl = total_len / len(rows) if rows else 1.0
51
- self._N = len(rows)
52
-
53
- def _idf(self, term: str) -> float:
54
- """Inverse document frequency for a term."""
55
- df = self._df.get(term, 0)
56
- return math.log((self._N - df + 0.5) / (df + 0.5) + 1)
57
-
58
- def retrieve(self, query: str, top_k: int = 20) -> list[dict]:
59
- """
60
- Run BM25 retrieval over the corpus.
61
-
62
- Args:
63
- query (str) raw or rewritten query (NOT expanded BM25 is lexical)
64
- top_k (int) — number of results to return
65
-
66
- Returns:
67
- list[dict] with chunk_id and sparse_score, sorted descending
68
- """
69
- query_terms = query.lower().split()
70
- scores = {}
71
-
72
- for chunk_id, tokens in self._corpus:
73
- dl = len(tokens)
74
- score = 0.0
75
- tf_map = defaultdict(int)
76
- for t in tokens:
77
- tf_map[t] += 1
78
-
79
- for term in query_terms:
80
- if term not in tf_map:
81
- continue
82
- tf = tf_map[term]
83
- idf = self._idf(term)
84
- numerator = tf * (self.k1 + 1)
85
- denominator = tf + self.k1 * (1 - self.b + self.b * dl / self._avgdl)
86
- score += idf * numerator / denominator
87
-
88
- if score > 0:
89
- scores[chunk_id] = score
90
-
91
- sorted_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
92
-
93
- # Fetch text for top results
94
- conn = sqlite3.connect(self.db_path)
95
- results = []
96
- for chunk_id, score in sorted_results:
97
- row = conn.execute(
98
- "SELECT chunk_text, filepath, chunk_index FROM chunks WHERE id = ?",
99
- (chunk_id,)
100
- ).fetchone()
101
- if row:
102
- results.append({
103
- "chunk_id": chunk_id,
104
- "chunk_text": row[0],
105
- "filepath": row[1],
106
- "chunk_index": row[2],
107
- "sparse_score": score,
108
- })
109
- conn.close()
110
- return results
111
-
112
-
113
- if __name__ == "__main__":
114
- sr = SparseRetriever()
115
- results = sr.retrieve("quarterly budget", top_k=5)
116
- for r in results:
117
- print(f"[{r['sparse_score']:.4f}] {r['filepath']} → {r['chunk_text'][:80]}")
 
 
 
 
 
 
 
 
 
 
 
1
+ # searcher/sparse_retriever.py
2
+
3
+ import os
4
+ import sqlite3
5
+ import math
6
+ import yaml
7
+ from collections import defaultdict
8
+
9
+
10
+ class SparseRetriever:
11
+ """
12
+ BM25 (Okapi BM25) lexical retrieval over the SQLite chunk store.
13
+
14
+ Why BM25 alongside semantic search?
15
+ - Dense retrieval can miss exact keyword matches (product codes, names, IDs)
16
+ - BM25 is great for rare/specific terms that embeddings smooth over
17
+ - Hybrid = best of both worlds
18
+
19
+ BM25 formula:
20
+ score(q, d) = Σ IDF(t) × (tf × (k1+1)) / (tf + k1 × (1 - b + b × dl/avgdl))
21
+ """
22
+
23
+ def __init__(self, config_path="config.yaml"):
24
+ with open(config_path) as f:
25
+ config = yaml.safe_load(f)
26
+
27
+ self.data_dir = config["data_dir"]
28
+ self.db_path = f"{self.data_dir}/metadata.db"
29
+ self.k1 = 1.5 # term frequency saturation
30
+ self.b = 0.75 # length normalisation
31
+
32
+ # Build in-memory BM25 index from SQLite on startup
33
+ self._corpus = [] # list of (chunk_id, token_list)
34
+ self._avgdl = 0.0
35
+ self._N = 0
36
+ self._df = defaultdict(int) # term → doc frequency
37
+ self._build_index()
38
+
39
+ def _build_index(self):
40
+ """Load all chunks from SQLite and compute BM25 statistics."""
41
+ os.makedirs(self.data_dir, exist_ok=True)
42
+ conn = sqlite3.connect(self.db_path)
43
+ try:
44
+ rows = conn.execute("SELECT id, chunk_text FROM chunks").fetchall()
45
+ except sqlite3.OperationalError:
46
+ rows = []
47
+ conn.close()
48
+
49
+ total_len = 0
50
+ for chunk_id, text in rows:
51
+ tokens = text.lower().split()
52
+ self._corpus.append((chunk_id, tokens))
53
+ total_len += len(tokens)
54
+ for token in set(tokens):
55
+ self._df[token] += 1
56
+
57
+ self._avgdl = total_len / len(rows) if rows else 1.0
58
+ self._N = len(rows)
59
+
60
+ def _idf(self, term: str) -> float:
61
+ """Inverse document frequency for a term."""
62
+ df = self._df.get(term, 0)
63
+ return math.log((self._N - df + 0.5) / (df + 0.5) + 1)
64
+
65
+ def retrieve(self, query: str, top_k: int = 20) -> list[dict]:
66
+ """
67
+ Run BM25 retrieval over the corpus.
68
+
69
+ Args:
70
+ query (str) — raw or rewritten query (NOT expanded — BM25 is lexical)
71
+ top_k (int) — number of results to return
72
+
73
+ Returns:
74
+ list[dict] with chunk_id and sparse_score, sorted descending
75
+ """
76
+ if not self._corpus:
77
+ return []
78
+
79
+ query_terms = query.lower().split()
80
+ scores = {}
81
+
82
+ for chunk_id, tokens in self._corpus:
83
+ dl = len(tokens)
84
+ score = 0.0
85
+ tf_map = defaultdict(int)
86
+ for t in tokens:
87
+ tf_map[t] += 1
88
+
89
+ for term in query_terms:
90
+ if term not in tf_map:
91
+ continue
92
+ tf = tf_map[term]
93
+ idf = self._idf(term)
94
+ numerator = tf * (self.k1 + 1)
95
+ denominator = tf + self.k1 * (1 - self.b + self.b * dl / self._avgdl)
96
+ score += idf * numerator / denominator
97
+
98
+ if score > 0:
99
+ scores[chunk_id] = score
100
+
101
+ sorted_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
102
+
103
+ # Fetch text for top results
104
+ conn = sqlite3.connect(self.db_path)
105
+ results = []
106
+ for chunk_id, score in sorted_results:
107
+ row = conn.execute(
108
+ "SELECT chunk_text, filepath, chunk_index FROM chunks WHERE id = ?",
109
+ (chunk_id,)
110
+ ).fetchone()
111
+ if row:
112
+ results.append({
113
+ "chunk_id": chunk_id,
114
+ "chunk_text": row[0],
115
+ "filepath": row[1],
116
+ "chunk_index": row[2],
117
+ "sparse_score": score,
118
+ })
119
+ conn.close()
120
+ return results
121
+
122
+
123
+ if __name__ == "__main__":
124
+ sr = SparseRetriever()
125
+ results = sr.retrieve("quarterly budget", top_k=5)
126
+ for r in results:
127
+ print(f"[{r['sparse_score']:.4f}] {r['filepath']} → {r['chunk_text'][:80]}")