RikkaBotan committed on
Commit
f347c89
·
verified ·
1 Parent(s): e364ce1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -38
app.py CHANGED
@@ -3,6 +3,7 @@ import torch
3
  from sentence_transformers import SentenceTransformer
4
  from ddgs import DDGS
5
  import time
 
6
 
7
  # Load Model
8
  model = SentenceTransformer(
@@ -56,19 +57,14 @@ def semantic_web_search(query):
56
  for i, (score, d) in enumerate(ranked):
57
  md += f"""
58
  #### 💎 Rank {i+1}
59
-
60
  [{d['title']}]({d['href']})
61
-
62
  **Score:** `{score:.4f}`
63
-
64
  {d['body']}
65
-
66
  ---
67
  """
68
  return md
69
 
70
 
71
- # Progressive Threshold Search with progress
72
  def progressive_search(query, threshold=0.7, step=50, max_cap=999):
73
  if query.strip() == "":
74
  yield "Please enter a search query."
@@ -76,6 +72,12 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
76
 
77
  current_k = step
78
 
 
 
 
 
 
 
79
  while current_k <= max_cap:
80
  try:
81
  docs = web_search(query, max_results=current_k)
@@ -89,7 +91,20 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
89
  current_k += step
90
  continue
91
 
92
- texts = [d["title"] + " " + d["body"] for d in docs]
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  with torch.no_grad():
95
  embeddings = model.encode(
@@ -100,15 +115,30 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
100
 
101
  query_emb = embeddings[0]
102
  doc_embs = embeddings[1:]
103
- scores = (query_emb @ doc_embs.T).cpu().numpy()
104
- best_score = float(scores.max())
105
 
106
- md = f"### Searching…\n- Documents examined: `{len(docs)}`\n- Best score so far: `{best_score:.4f}`\n"
 
 
 
 
 
 
 
 
 
 
 
107
  yield md
108
 
109
  if best_score >= threshold:
110
- ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
111
- md = f"### Threshold reached!\n"
 
 
 
 
 
 
112
  for i, (score, d) in enumerate(ranked):
113
  md += f"""
114
  #### Rank {i+1}
@@ -126,10 +156,17 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
126
 
127
  current_k += step
128
  time.sleep(1)
129
- ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
130
- md = f"### Threshold not reached in max search range.\n"
131
- for i, (score, d) in enumerate(ranked):
132
- md += f"""
 
 
 
 
 
 
 
133
  #### Rank {i+1}
134
 
135
  [{d['title']}]({d['href']})
@@ -140,8 +177,8 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
140
 
141
  ---
142
  """
143
- yield md
144
 
 
145
 
146
 
147
  # UI
@@ -149,7 +186,6 @@ pastel_css = """
149
  body {
150
  background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
151
  }
152
-
153
  /* gradient headings */
154
  h1, h2, h3, h4 {
155
  background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
@@ -159,18 +195,14 @@ h1, h2, h3, h4 {
159
  letter-spacing: 0.4px;
160
  padding: 4px;
161
  }
162
-
163
  /* optional: slightly softer subtitle tone */
164
  h2, h3 {
165
  opacity: 0.9;
166
  }
167
-
168
-
169
  .gradio-container {
170
  font-family: 'Helvetica Neue', sans-serif;
171
  color: #1e3a8a;
172
  }
173
-
174
  /* model card */
175
  .model-card {
176
  background: #ffffff;
@@ -180,7 +212,6 @@ h2, h3 {
180
  box-shadow: 0 12px 20px rgba(60,120,255,0.18);
181
  margin-bottom: 20px;
182
  }
183
-
184
  /* result card */
185
  .result-card {
186
  background: #ffffff;
@@ -189,51 +220,42 @@ h2, h3 {
189
  border: 1px solid #dbeafe;
190
  box-shadow: 0 12px 20px rgba(60,120,255,0.18);
191
  }
192
-
193
  .gr-markdown, .prose {
194
  border: none !important;
195
  box-shadow: none !important;
196
  padding: 0 !important;
197
  color: #1e3a8a !important;
198
  }
199
-
200
  .model-card, .result-card {
201
  background: #ffffff;
202
  color: #1e3a8a;
203
  }
204
-
205
  @media (prefers-color-scheme: dark) {
206
  body {
207
  background: linear-gradient(180deg, #0f172a 0%, #1e293b 40%, #334155 100%);
208
  }
209
-
210
  .gradio-container {
211
  color: #dbeafe;
212
  }
213
-
214
  .gr-markdown, .prose {
215
  color: #dbeafe !important;
216
  }
217
-
218
  .model-card, .result-card {
219
  background: #1a1a1a;
220
  color: #dbeafe;
221
  border: 1px solid #3b82f6;
222
  box-shadow: 0 12px 20px rgba(60,120,255,0.18);
223
  }
224
-
225
  .gr-markdown, .prose {
226
  color: #dbeafe !important;
227
  }
228
  }
229
-
230
  textarea, input {
231
  border-radius: 12px !important;
232
  border: 1px solid #c7ddff !important;
233
  background-color: #f5f9ff !important;
234
  color: #1e3a8a !important;
235
  }
236
-
237
  button {
238
  background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
239
  color: #ffffff !important;
@@ -241,14 +263,11 @@ button {
241
  border: 1px solid #93c5fd !important;
242
  font-weight: 600;
243
  letter-spacing: 0.3px;
244
-
245
  box-shadow:
246
  0 6px 14px rgba(60,120,255,0.28),
247
  inset 0 1px 0 rgba(255,255,255,0.6);
248
-
249
  transition: all 0.25s ease;
250
  }
251
-
252
  button:hover {
253
  background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
254
  box-shadow:
@@ -256,14 +275,12 @@ button:hover {
256
  inset 0 1px 0 rgba(255,255,255,0.7);
257
  transform: translateY(-1px);
258
  }
259
-
260
  button:active {
261
  transform: translateY(1px);
262
  box-shadow:
263
  0 3px 8px rgba(60,120,255,0.2),
264
  inset 0 2px 4px rgba(0,0,0,0.08);
265
  }
266
-
267
  """
268
 
269
  with gr.Blocks(css=pastel_css) as demo:
@@ -275,11 +292,9 @@ with gr.Blocks(css=pastel_css) as demo:
275
  gr.Markdown("""
276
  ## About this Model
277
  **RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
278
-
279
  ### Performance
280
  - **NanoBEIR NDCG@10 = 0.5124**
281
  - Higher than other static embedding models
282
-
283
  ### Efficiency
284
  - 512 dimensions
285
  - ~2× faster retrieval
 
3
  from sentence_transformers import SentenceTransformer
4
  from ddgs import DDGS
5
  import time
6
+ import numpy as np
7
 
8
  # Load Model
9
  model = SentenceTransformer(
 
57
  for i, (score, d) in enumerate(ranked):
58
  md += f"""
59
  #### 💎 Rank {i+1}
 
60
  [{d['title']}]({d['href']})
 
61
  **Score:** `{score:.4f}`
 
62
  {d['body']}
 
63
  ---
64
  """
65
  return md
66
 
67
 
 
68
  def progressive_search(query, threshold=0.7, step=50, max_cap=999):
69
  if query.strip() == "":
70
  yield "Please enter a search query."
 
72
 
73
  current_k = step
74
 
75
+ scores_last = []
76
+ docs_last = []
77
+
78
+ seen_urls = set()
79
+ total_examined = 0
80
+
81
  while current_k <= max_cap:
82
  try:
83
  docs = web_search(query, max_results=current_k)
 
91
  current_k += step
92
  continue
93
 
94
+ total_examined += len(docs)
95
+
96
+ new_docs = []
97
+ for d in docs:
98
+ url = d["href"]
99
+ if url not in seen_urls:
100
+ seen_urls.add(url)
101
+ new_docs.append(d)
102
+
103
+ if len(new_docs) == 0:
104
+ current_k += step
105
+ continue
106
+
107
+ texts = [d["title"] + " " + d["body"] for d in new_docs]
108
 
109
  with torch.no_grad():
110
  embeddings = model.encode(
 
115
 
116
  query_emb = embeddings[0]
117
  doc_embs = embeddings[1:]
 
 
118
 
119
+ scores = (query_emb @ doc_embs.T).cpu().numpy().flatten()
120
+
121
+ scores_last.extend(scores.tolist())
122
+ docs_last.extend(new_docs)
123
+
124
+ best_score = float(np.max(scores_last))
125
+
126
+ md = (
127
+ f"### Searching…\n"
128
+ f"- Documents examined (with duplicates): `{total_examined}`\n"
129
+ f"- Best score so far: `{best_score:.4f}`\n"
130
+ )
131
  yield md
132
 
133
  if best_score >= threshold:
134
+ ranked = sorted(
135
+ zip(scores_last, docs_last),
136
+ key=lambda x: x[0],
137
+ reverse=True
138
+ )[:5]
139
+
140
+ md = "### Threshold reached!\n"
141
+
142
  for i, (score, d) in enumerate(ranked):
143
  md += f"""
144
  #### Rank {i+1}
 
156
 
157
  current_k += step
158
  time.sleep(1)
159
+
160
+ ranked = sorted(
161
+ zip(scores_last, docs_last),
162
+ key=lambda x: x[0],
163
+ reverse=True
164
+ )[:5]
165
+
166
+ md = "### Threshold not reached in max search range.\n"
167
+
168
+ for i, (score, d) in enumerate(ranked):
169
+ md += f"""
170
  #### Rank {i+1}
171
 
172
  [{d['title']}]({d['href']})
 
177
 
178
  ---
179
  """
 
180
 
181
+ yield md
182
 
183
 
184
  # UI
 
186
  body {
187
  background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
188
  }
 
189
  /* gradient headings */
190
  h1, h2, h3, h4 {
191
  background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
 
195
  letter-spacing: 0.4px;
196
  padding: 4px;
197
  }
 
198
  /* optional: slightly softer subtitle tone */
199
  h2, h3 {
200
  opacity: 0.9;
201
  }
 
 
202
  .gradio-container {
203
  font-family: 'Helvetica Neue', sans-serif;
204
  color: #1e3a8a;
205
  }
 
206
  /* model card */
207
  .model-card {
208
  background: #ffffff;
 
212
  box-shadow: 0 12px 20px rgba(60,120,255,0.18);
213
  margin-bottom: 20px;
214
  }
 
215
  /* result card */
216
  .result-card {
217
  background: #ffffff;
 
220
  border: 1px solid #dbeafe;
221
  box-shadow: 0 12px 20px rgba(60,120,255,0.18);
222
  }
 
223
  .gr-markdown, .prose {
224
  border: none !important;
225
  box-shadow: none !important;
226
  padding: 0 !important;
227
  color: #1e3a8a !important;
228
  }
 
229
  .model-card, .result-card {
230
  background: #ffffff;
231
  color: #1e3a8a;
232
  }
 
233
  @media (prefers-color-scheme: dark) {
234
  body {
235
  background: linear-gradient(180deg, #0f172a 0%, #1e293b 40%, #334155 100%);
236
  }
 
237
  .gradio-container {
238
  color: #dbeafe;
239
  }
 
240
  .gr-markdown, .prose {
241
  color: #dbeafe !important;
242
  }
 
243
  .model-card, .result-card {
244
  background: #1a1a1a;
245
  color: #dbeafe;
246
  border: 1px solid #3b82f6;
247
  box-shadow: 0 12px 20px rgba(60,120,255,0.18);
248
  }
 
249
  .gr-markdown, .prose {
250
  color: #dbeafe !important;
251
  }
252
  }
 
253
  textarea, input {
254
  border-radius: 12px !important;
255
  border: 1px solid #c7ddff !important;
256
  background-color: #f5f9ff !important;
257
  color: #1e3a8a !important;
258
  }
 
259
  button {
260
  background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
261
  color: #ffffff !important;
 
263
  border: 1px solid #93c5fd !important;
264
  font-weight: 600;
265
  letter-spacing: 0.3px;
 
266
  box-shadow:
267
  0 6px 14px rgba(60,120,255,0.28),
268
  inset 0 1px 0 rgba(255,255,255,0.6);
 
269
  transition: all 0.25s ease;
270
  }
 
271
  button:hover {
272
  background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
273
  box-shadow:
 
275
  inset 0 1px 0 rgba(255,255,255,0.7);
276
  transform: translateY(-1px);
277
  }
 
278
  button:active {
279
  transform: translateY(1px);
280
  box-shadow:
281
  0 3px 8px rgba(60,120,255,0.2),
282
  inset 0 2px 4px rgba(0,0,0,0.08);
283
  }
 
284
  """
285
 
286
  with gr.Blocks(css=pastel_css) as demo:
 
292
  gr.Markdown("""
293
  ## About this Model
294
  **RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
 
295
  ### Performance
296
  - **NanoBEIR NDCG@10 = 0.5124**
297
  - Higher than other static embedding models
 
298
  ### Efficiency
299
  - 512 dimensions
300
  - ~2× faster retrieval