redhairedshanks1 committed on
Commit
03ba8f7
Β·
verified Β·
1 Parent(s): 69542d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -33
app.py CHANGED
@@ -1,3 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
  import docx
@@ -24,38 +196,37 @@ except Exception:
24
 
25
 
26
  # ------------------------------
27
- # Core Classes with Snippet Support
28
  # ------------------------------
29
  class MultiVectorDocument:
30
  def __init__(self, doc_id: str, vectors: List[np.ndarray], texts: List[str], metadata: Dict = None):
31
  self.doc_id = doc_id
32
  self.vectors = vectors # list of embeddings
33
- self.texts = texts # original paragraphs/chunks
34
  self.metadata = metadata or {}
35
 
36
 
37
  class SingleVectorIndex:
 
38
  def __init__(self, dim: int):
39
  self.dim = dim
40
- self.docs = {} # doc_id β†’ vector
41
- self.texts = {} # doc_id β†’ snippet preview
42
 
43
  def add_document(self, doc: MultiVectorDocument):
44
  centroid = np.mean(doc.vectors, axis=0)
45
  self.docs[doc.doc_id] = centroid / np.linalg.norm(centroid)
46
- # preview: first couple of paragraphs
47
- self.texts[doc.doc_id] = " | ".join(doc.texts[:2])
48
 
49
  def search(self, query_vec: np.ndarray, top_k=3):
50
  qn = query_vec / np.linalg.norm(query_vec)
51
- scores = [(doc_id,
52
- self.texts[doc_id],
53
- float(np.dot(qn, vec)))
54
  for doc_id, vec in self.docs.items()]
55
  return sorted(scores, key=lambda x: -x[2])[:top_k]
56
 
57
 
58
  class MuVERAIndex:
 
59
  def __init__(self, dim: int):
60
  self.dim = dim
61
  self.corpus = {}
@@ -66,26 +237,28 @@ class MuVERAIndex:
66
  centroid = np.mean(doc.vectors, axis=0)
67
  self.global_centroids[doc.doc_id] = centroid / np.linalg.norm(centroid)
68
 
69
- def search(self, query_vec: np.ndarray, top_k: int = 3):
70
  qn = query_vec / np.linalg.norm(query_vec)
71
- # Step 1: shortlist by centroid
 
72
  scores = [(doc_id, float(np.dot(qn, cent)))
73
  for doc_id, cent in self.global_centroids.items()]
74
- shortlist = sorted(scores, key=lambda x: -x[1])[: top_k * 2]
75
 
76
- # Step 2: fine-grained on passages
77
  reranked = []
78
  for doc_id, _ in shortlist:
79
  doc = self.corpus[doc_id]
80
- sims = [np.dot(qn, v/np.linalg.norm(v)) for v in doc.vectors]
81
- best_idx = int(np.argmax(sims))
82
- reranked.append((doc_id, doc.texts[best_idx], float(sims[best_idx])))
83
 
84
- return sorted(reranked, key=lambda x: -x[2])[:top_k]
 
85
 
86
 
87
  # ------------------------------
88
- # File Loaders (docx, txt)
89
  # ------------------------------
90
  def load_docx(path: str):
91
  doc = docx.Document(path)
@@ -104,12 +277,16 @@ def load_txt(path: str):
104
 
105
 
106
  # ------------------------------
107
- # App Initialization
108
  # ------------------------------
109
  dim = EMBEDDING_DIM
110
  single_index = SingleVectorIndex(dim)
111
  muvera_index = MuVERAIndex(dim)
112
 
 
 
 
 
113
  def add_files(files):
114
  added = []
115
  for f in files:
@@ -131,25 +308,27 @@ def query(q: str, top_k: int = 3):
131
 
132
  q_vec = embed_text(q)
133
  single_results = single_index.search(q_vec, top_k)
134
- muvera_results = muvera_index.search(q_vec, top_k)
135
 
136
- def fmt(results):
137
  if not results:
138
  return "No results yet. Upload docs first."
139
- return "\n\n".join([
140
- f"{rank+1}. πŸ“„ {doc_id}\n ✨ Snippet: {snippet}\n πŸ”Ή Score={score:.3f}"
141
- for rank, (doc_id, snippet, score) in enumerate(results)
142
- ])
 
 
143
 
144
- return fmt(single_results), fmt(muvera_results)
145
 
146
 
147
  # ------------------------------
148
- # Gradio Interface
149
  # ------------------------------
150
  with gr.Blocks() as demo:
151
- gr.Markdown("## πŸ”Ž MuVERA Demo: Multi-Vector Retrieval vs Single Vector Search")
152
- gr.Markdown("Upload `.docx` or `.txt` files (small text docs), then compare retrieval methods.")
153
 
154
  with gr.Row():
155
  uploader = gr.File(file_types=[".docx", ".txt"], file_count="multiple")
@@ -157,12 +336,12 @@ with gr.Blocks() as demo:
157
 
158
  uploader.upload(add_files, uploader, status)
159
 
160
- q_box = gr.Textbox(label="Enter query", placeholder="Search something like: efficient retrieval methods...")
161
- topk_slider = gr.Slider(1, 5, value=3, step=1, label="Top-k Results")
162
 
163
  with gr.Row():
164
- out_single = gr.Textbox(label="Single-Vector Results", lines=12)
165
- out_muvera = gr.Textbox(label="MuVERA Results", lines=12)
166
 
167
  btn = gr.Button("Search πŸ”")
168
  btn.click(query, [q_box, topk_slider], [out_single, out_muvera])
 
1
+ # import gradio as gr
2
+ # import numpy as np
3
+ # import docx
4
+ # from typing import List, Tuple, Dict
5
+
6
+ # # ------------------------------
7
+ # # Embedding: Real SentenceTransformer (preferred), fallback to dummy
8
+ # # ------------------------------
9
+ # try:
10
+ # from sentence_transformers import SentenceTransformer
11
+ # _embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
12
+ # EMBEDDING_DIM = _embedding_model.get_sentence_embedding_dimension()
13
+
14
+ # def embed_text(text: str) -> np.ndarray:
15
+ # return _embedding_model.encode(text, normalize_embeddings=True)
16
+
17
+ # USING_REAL = True
18
+ # except Exception:
19
+ # EMBEDDING_DIM = 32
20
+ # def embed_text(text: str) -> np.ndarray:
21
+ # np.random.seed(abs(hash(text)) % (10**6))
22
+ # return np.random.randn(EMBEDDING_DIM)
23
+ # USING_REAL = False
24
+
25
+
26
+ # # ------------------------------
27
+ # # Core Classes with Snippet Support
28
+ # # ------------------------------
29
+ # class MultiVectorDocument:
30
+ # def __init__(self, doc_id: str, vectors: List[np.ndarray], texts: List[str], metadata: Dict = None):
31
+ # self.doc_id = doc_id
32
+ # self.vectors = vectors # list of embeddings
33
+ # self.texts = texts # original paragraphs/chunks
34
+ # self.metadata = metadata or {}
35
+
36
+
37
+ # class SingleVectorIndex:
38
+ # def __init__(self, dim: int):
39
+ # self.dim = dim
40
+ # self.docs = {} # doc_id β†’ vector
41
+ # self.texts = {} # doc_id β†’ snippet preview
42
+
43
+ # def add_document(self, doc: MultiVectorDocument):
44
+ # centroid = np.mean(doc.vectors, axis=0)
45
+ # self.docs[doc.doc_id] = centroid / np.linalg.norm(centroid)
46
+ # # preview: first couple of paragraphs
47
+ # self.texts[doc.doc_id] = " | ".join(doc.texts[:2])
48
+
49
+ # def search(self, query_vec: np.ndarray, top_k=3):
50
+ # qn = query_vec / np.linalg.norm(query_vec)
51
+ # scores = [(doc_id,
52
+ # self.texts[doc_id],
53
+ # float(np.dot(qn, vec)))
54
+ # for doc_id, vec in self.docs.items()]
55
+ # return sorted(scores, key=lambda x: -x[2])[:top_k]
56
+
57
+
58
+ # class MuVERAIndex:
59
+ # def __init__(self, dim: int):
60
+ # self.dim = dim
61
+ # self.corpus = {}
62
+ # self.global_centroids = {}
63
+
64
+ # def add_document(self, doc: MultiVectorDocument):
65
+ # self.corpus[doc.doc_id] = doc
66
+ # centroid = np.mean(doc.vectors, axis=0)
67
+ # self.global_centroids[doc.doc_id] = centroid / np.linalg.norm(centroid)
68
+
69
+ # def search(self, query_vec: np.ndarray, top_k: int = 3):
70
+ # qn = query_vec / np.linalg.norm(query_vec)
71
+ # # Step 1: shortlist by centroid
72
+ # scores = [(doc_id, float(np.dot(qn, cent)))
73
+ # for doc_id, cent in self.global_centroids.items()]
74
+ # shortlist = sorted(scores, key=lambda x: -x[1])[: top_k * 2]
75
+
76
+ # # Step 2: fine-grained on passages
77
+ # reranked = []
78
+ # for doc_id, _ in shortlist:
79
+ # doc = self.corpus[doc_id]
80
+ # sims = [np.dot(qn, v/np.linalg.norm(v)) for v in doc.vectors]
81
+ # best_idx = int(np.argmax(sims))
82
+ # reranked.append((doc_id, doc.texts[best_idx], float(sims[best_idx])))
83
+
84
+ # return sorted(reranked, key=lambda x: -x[2])[:top_k]
85
+
86
+
87
+ # # ------------------------------
88
+ # # File Loaders (docx, txt)
89
+ # # ------------------------------
90
+ # def load_docx(path: str):
91
+ # doc = docx.Document(path)
92
+ # texts, vectors = [], []
93
+ # for para in doc.paragraphs:
94
+ # if para.text.strip():
95
+ # texts.append(para.text.strip())
96
+ # vectors.append(embed_text(para.text.strip()))
97
+ # return MultiVectorDocument(doc_id=path.split("/")[-1], vectors=vectors, texts=texts)
98
+
99
+ # def load_txt(path: str):
100
+ # with open(path, "r", encoding="utf-8") as f:
101
+ # lines = [line.strip() for line in f if line.strip()]
102
+ # vectors = [embed_text(line) for line in lines]
103
+ # return MultiVectorDocument(doc_id=path.split("/")[-1], vectors=vectors, texts=lines)
104
+
105
+
106
+ # # ------------------------------
107
+ # # App Initialization
108
+ # # ------------------------------
109
+ # dim = EMBEDDING_DIM
110
+ # single_index = SingleVectorIndex(dim)
111
+ # muvera_index = MuVERAIndex(dim)
112
+
113
+ # def add_files(files):
114
+ # added = []
115
+ # for f in files:
116
+ # if f.name.endswith(".docx"):
117
+ # doc = load_docx(f.name)
118
+ # elif f.name.endswith(".txt"):
119
+ # doc = load_txt(f.name)
120
+ # else:
121
+ # continue
122
+ # single_index.add_document(doc)
123
+ # muvera_index.add_document(doc)
124
+ # added.append(doc.doc_id)
125
+ # return f"βœ… Indexed: {', '.join(added)}" if added else "⚠️ No valid docs uploaded."
126
+
127
+
128
+ # def query(q: str, top_k: int = 3):
129
+ # if not q.strip():
130
+ # return "Please enter a query", "Please enter a query"
131
+
132
+ # q_vec = embed_text(q)
133
+ # single_results = single_index.search(q_vec, top_k)
134
+ # muvera_results = muvera_index.search(q_vec, top_k)
135
+
136
+ # def fmt(results):
137
+ # if not results:
138
+ # return "No results yet. Upload docs first."
139
+ # return "\n\n".join([
140
+ # f"{rank+1}. πŸ“„ {doc_id}\n ✨ Snippet: {snippet}\n πŸ”Ή Score={score:.3f}"
141
+ # for rank, (doc_id, snippet, score) in enumerate(results)
142
+ # ])
143
+
144
+ # return fmt(single_results), fmt(muvera_results)
145
+
146
+
147
+ # # ------------------------------
148
+ # # Gradio Interface
149
+ # # ------------------------------
150
+ # with gr.Blocks() as demo:
151
+ # gr.Markdown("## πŸ”Ž MuVERA Demo: Multi-Vector Retrieval vs Single Vector Search")
152
+ # gr.Markdown("Upload `.docx` or `.txt` files (small text docs), then compare retrieval methods.")
153
+
154
+ # with gr.Row():
155
+ # uploader = gr.File(file_types=[".docx", ".txt"], file_count="multiple")
156
+ # status = gr.Textbox(label="Index status")
157
+
158
+ # uploader.upload(add_files, uploader, status)
159
+
160
+ # q_box = gr.Textbox(label="Enter query", placeholder="Search something like: efficient retrieval methods...")
161
+ # topk_slider = gr.Slider(1, 5, value=3, step=1, label="Top-k Results")
162
+
163
+ # with gr.Row():
164
+ # out_single = gr.Textbox(label="Single-Vector Results", lines=12)
165
+ # out_muvera = gr.Textbox(label="MuVERA Results", lines=12)
166
+
167
+ # btn = gr.Button("Search πŸ”")
168
+ # btn.click(query, [q_box, topk_slider], [out_single, out_muvera])
169
+
170
+ # demo.launch()
171
+
172
+
173
  import gradio as gr
174
  import numpy as np
175
  import docx
 
196
 
197
 
198
  # ------------------------------
199
+ # Core Classes
200
  # ------------------------------
201
class MultiVectorDocument:
    """A document represented by one embedding per passage.

    Keeps the raw passages alongside their vectors so that search results
    can show a human-readable snippet next to each score.
    """

    def __init__(self, doc_id: str, vectors: List[np.ndarray], texts: List[str], metadata: Dict = None):
        # Identifier used as the key in every index.
        self.doc_id = doc_id
        # Parallel lists: vectors[i] is the embedding of texts[i].
        self.vectors = vectors
        self.texts = texts
        # Never store None so callers can read/mutate metadata freely.
        self.metadata = metadata or {}
207
 
208
 
209
class SingleVectorIndex:
    """Naive single-vector index: each document is collapsed to one centroid.

    Stores, per document id, a unit-normalized centroid of all passage
    embeddings plus a short text preview used when rendering results.
    """

    def __init__(self, dim: int):
        self.dim = dim
        self.docs = {}    # doc_id -> unit-normalized centroid vector
        self.texts = {}   # doc_id -> snippet preview (first passages)

    def add_document(self, doc: "MultiVectorDocument") -> None:
        """Index *doc* under its passage centroid.

        Raises:
            ValueError: if the document carries no vectors (np.mean over an
                empty list would otherwise emit NaNs silently).
        """
        if not len(doc.vectors):
            raise ValueError(f"Document {doc.doc_id!r} has no vectors to index.")
        centroid = np.mean(doc.vectors, axis=0)
        norm = np.linalg.norm(centroid)
        # Guard the zero-centroid edge case: dividing by 0 would poison
        # every subsequent score with NaN.
        self.docs[doc.doc_id] = centroid / norm if norm > 0 else centroid
        # Preview: first couple of passages.
        self.texts[doc.doc_id] = " | ".join(doc.texts[:2])

    def search(self, query_vec: np.ndarray, top_k: int = 3):
        """Return up to *top_k* (doc_id, preview, cosine_score) tuples, best first."""
        norm = np.linalg.norm(query_vec)
        qn = query_vec / norm if norm > 0 else query_vec
        scores = [(doc_id, self.texts[doc_id], float(np.dot(qn, vec)))
                  for doc_id, vec in self.docs.items()]
        return sorted(scores, key=lambda x: -x[2])[:top_k]
226
 
227
 
228
  class MuVERAIndex:
229
+ """ Multi-vector index with centroid prefilter, but returns best-N snippets across docs. """
230
  def __init__(self, dim: int):
231
  self.dim = dim
232
  self.corpus = {}
 
237
  centroid = np.mean(doc.vectors, axis=0)
238
  self.global_centroids[doc.doc_id] = centroid / np.linalg.norm(centroid)
239
 
240
+ def search(self, query_vec: np.ndarray, top_k=3, per_doc_hits=2):
241
  qn = query_vec / np.linalg.norm(query_vec)
242
+
243
+ # Step 1: shortlist docs by centroid
244
  scores = [(doc_id, float(np.dot(qn, cent)))
245
  for doc_id, cent in self.global_centroids.items()]
246
+ shortlist = sorted(scores, key=lambda x: -x[1])[: top_k * 3]
247
 
248
+ # Step 2: evaluate ALL passages in shortlisted docs
249
  reranked = []
250
  for doc_id, _ in shortlist:
251
  doc = self.corpus[doc_id]
252
+ for passage, vec in zip(doc.texts, doc.vectors):
253
+ sim = np.dot(qn, vec/np.linalg.norm(vec))
254
+ reranked.append((doc_id, passage, float(sim)))
255
 
256
+ # Step 3: return globally best passages across docs
257
+ return sorted(reranked, key=lambda x: -x[2])[: top_k * per_doc_hits]
258
 
259
 
260
  # ------------------------------
261
+ # File Loaders
262
  # ------------------------------
263
  def load_docx(path: str):
264
  doc = docx.Document(path)
 
277
 
278
 
279
  # ------------------------------
280
+ # App State
281
  # ------------------------------
282
  dim = EMBEDDING_DIM
283
  single_index = SingleVectorIndex(dim)
284
  muvera_index = MuVERAIndex(dim)
285
 
286
+
287
+ # ------------------------------
288
+ # Functions for Gradio
289
+ # ------------------------------
290
  def add_files(files):
291
  added = []
292
  for f in files:
 
308
 
309
  q_vec = embed_text(q)
310
  single_results = single_index.search(q_vec, top_k)
311
+ muvera_results = muvera_index.search(q_vec, top_k, per_doc_hits=2)
312
 
313
+ def fmt(results, mode="doc"):
314
  if not results:
315
  return "No results yet. Upload docs first."
316
+ formatted = []
317
+ for rank, (doc_id, snippet, score) in enumerate(results):
318
+ formatted.append(
319
+ f"{rank+1}. πŸ“„ {doc_id}\n ✨ Snippet: {snippet}\n πŸ”Ή Score={score:.3f}"
320
+ )
321
+ return "\n\n".join(formatted)
322
 
323
+ return fmt(single_results, "doc"), fmt(muvera_results, "snippet")
324
 
325
 
326
  # ------------------------------
327
+ # Gradio UI
328
  # ------------------------------
329
  with gr.Blocks() as demo:
330
+ gr.Markdown("## πŸ”Ž MuVERA Demo: Multi-Vector Retrieval vs Single-Vector Search")
331
+ gr.Markdown("Upload `.docx` or `.txt` files, then compare retrieval systems.")
332
 
333
  with gr.Row():
334
  uploader = gr.File(file_types=[".docx", ".txt"], file_count="multiple")
 
336
 
337
  uploader.upload(add_files, uploader, status)
338
 
339
+ q_box = gr.Textbox(label="Query", placeholder="Example: efficient retrieval methods")
340
+ topk_slider = gr.Slider(1, 5, value=3, step=1, label="Top-k Docs to Consider")
341
 
342
  with gr.Row():
343
+ out_single = gr.Textbox(label="Single-Vector Results (Doc-level)", lines=10)
344
+ out_muvera = gr.Textbox(label="MuVERA Results (Top Snippets)", lines=15)
345
 
346
  btn = gr.Button("Search πŸ”")
347
  btn.click(query, [q_box, topk_slider], [out_single, out_muvera])