RikkaBotan committed on
Commit
577d2bb
·
verified ·
1 Parent(s): 13f7edd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +307 -304
app.py CHANGED
@@ -1,304 +1,307 @@
1
- import gradio as gr
2
- import torch
3
- from sentence_transformers import SentenceTransformer
4
- from ddgs import DDGS
5
-
6
- # Load Model
7
- model = SentenceTransformer(
8
- "RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en",
9
- trust_remote_code=True,
10
- device="cuda" if torch.cuda.is_available() else "cpu"
11
- )
12
-
13
-
14
- # Web Search
15
- def web_search(query, max_results=100):
16
- results = []
17
- with DDGS() as ddgs:
18
- for r in ddgs.text(query, max_results=max_results):
19
- results.append({
20
- "title": r.get("title", ""),
21
- "body": r.get("body", ""),
22
- "href": r.get("href", "")
23
- })
24
- return results
25
-
26
-
27
- # Standard Semantic Search
28
- def semantic_web_search(query):
29
- if query.strip() == "":
30
- return "Please enter a search query."
31
-
32
- docs = web_search(query, max_results=100)
33
- texts = [d["title"] + " " + d["body"] for d in docs]
34
-
35
- with torch.no_grad():
36
- embeddings = model.encode(
37
- [query] + texts[:256],
38
- convert_to_tensor=True,
39
- normalize_embeddings=True
40
- )
41
-
42
- query_emb = embeddings[0]
43
- doc_embs = embeddings[1:]
44
- scores = (query_emb @ doc_embs.T).cpu().numpy()
45
-
46
- ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:30]
47
-
48
- md = ""
49
- for i, (score, d) in enumerate(ranked):
50
- md += f"""
51
- #### 💎 Rank {i+1}
52
- [{d['title']}]({d['href']})
53
-
54
- **Score:** `{score:.4f}`
55
-
56
- {d['body']}
57
-
58
- ---
59
- """
60
- return md
61
-
62
-
63
- # Progressive Threshold Search
64
- def progressive_search(query, threshold=0.7, step=50, max_cap=999):
65
- if query.strip() == "":
66
- return "Please enter a search query."
67
-
68
- current_k = step
69
- best_score = 0.0
70
-
71
- while current_k <= max_cap:
72
-
73
- docs = web_search(query, max_results=current_k)
74
- texts = [d["title"] + " " + d["body"] for d in docs]
75
-
76
- with torch.no_grad():
77
- embeddings = model.encode(
78
- [query] + texts[:256],
79
- convert_to_tensor=True,
80
- normalize_embeddings=True
81
- )
82
-
83
- query_emb = embeddings[0]
84
- doc_embs = embeddings[1:]
85
- scores = (query_emb @ doc_embs.T).cpu().numpy()
86
-
87
- best_score = float(scores.max())
88
-
89
- if best_score >= threshold:
90
-
91
- ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
92
-
93
- md = f"""
94
- #### Threshold Reached
95
-
96
- - Threshold: `{threshold}`
97
-
98
- - **Best Score:** `{best_score:.4f}`
99
-
100
- - Documents Searched: `{len(docs)}`
101
-
102
- ---
103
- """
104
-
105
- for i, (score, d) in enumerate(ranked):
106
- md += f"""
107
- #### Rank {i+1}
108
-
109
- [{d['title']}]({d['href']})
110
-
111
- **Score:** `{score:.4f}`
112
-
113
- {d['body']}
114
-
115
- ---
116
- """
117
- return md
118
-
119
- current_k += step
120
-
121
- ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
122
- md = f"""
123
- #### Threshold Not Reached ๐·°(৹˃ᗝ˂৹)°·๐
124
-
125
- - Threshold: `{threshold}`
126
-
127
- - **Best Score:** `{best_score:.4f}`
128
-
129
- - Documents Searched: `{current_k}`
130
- """
131
- for i, (score, d) in enumerate(ranked):
132
- md += f"""
133
- #### Rank {i+1}
134
-
135
- [{d['title']}]({d['href']})
136
-
137
- **Score:** `{score:.4f}`
138
-
139
- {d['body']}
140
-
141
- ---
142
- """
143
- return md
144
-
145
-
146
- # UI
147
- pastel_css = """
148
- body {
149
- background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
150
- }
151
-
152
- /* gradient headings */
153
- h1, h2, h3, h4 {
154
- background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
155
- -webkit-background-clip: text;
156
- -webkit-text-fill-color: transparent;
157
- font-weight: 800;
158
- letter-spacing: 0.4px;
159
- padding: 4px;
160
- }
161
-
162
- /* optional: slightly softer subtitle tone */
163
- h2, h3 {
164
- opacity: 0.9;
165
- }
166
-
167
-
168
- .gradio-container {
169
- font-family: 'Helvetica Neue', sans-serif;
170
- color: #1e3a8a;
171
- }
172
-
173
- /* model card */
174
- .model-card {
175
- background: #ffffff;
176
- border-radius: 18px;
177
- padding: 22px;
178
- border: 1px solid #dbeafe;
179
- box-shadow: 0 12px 20px rgba(60,120,255,0.18);
180
- margin-bottom: 20px;
181
- }
182
-
183
- /* result card */
184
- .result-card {
185
- background: #ffffff;
186
- border-radius: 18px;
187
- padding: 22px;
188
- border: 1px solid #dbeafe;
189
- box-shadow: 0 12px 20px rgba(60,120,255,0.18);
190
- }
191
-
192
- .gr-markdown, .prose {
193
- border: none !important;
194
- box-shadow: none !important;
195
- padding: 0 !important;
196
- }
197
-
198
- textarea, input {
199
- border-radius: 12px !important;
200
- border: 1px solid #c7ddff !important;
201
- background-color: #f5f9ff !important;
202
- color: #1e3a8a !important;
203
- }
204
-
205
- button {
206
- background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
207
- color: #ffffff !important;
208
- border-radius: 14px !important;
209
- border: 1px solid #93c5fd !important;
210
- font-weight: 600;
211
- letter-spacing: 0.3px;
212
-
213
- box-shadow:
214
- 0 6px 14px rgba(60,120,255,0.28),
215
- inset 0 1px 0 rgba(255,255,255,0.6);
216
-
217
- transition: all 0.25s ease;
218
- }
219
-
220
- button:hover {
221
- background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
222
- box-shadow:
223
- 0 8px 18px rgba(60,120,255,0.35),
224
- inset 0 1px 0 rgba(255,255,255,0.7);
225
- transform: translateY(-1px);
226
- }
227
-
228
- button:active {
229
- transform: translateY(1px);
230
- box-shadow:
231
- 0 3px 8px rgba(60,120,255,0.2),
232
- inset 0 2px 4px rgba(0,0,0,0.08);
233
- }
234
-
235
- """
236
-
237
- with gr.Blocks(css=pastel_css) as demo:
238
-
239
- gr.Markdown('# Semantic Web Search and Deep Web Search')
240
- gr.Markdown('## Fast Retrieval with Stable Static Embedding')
241
-
242
- with gr.Column(elem_classes="model-card"):
243
- gr.Markdown("""
244
- ## About this Model
245
- **RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
246
-
247
- ### Performance
248
- - **NanoBEIR NDCG@10 = 0.5124**
249
- - Higher than other static embedding models
250
-
251
- ### Efficiency
252
- - 512 dimensions
253
- - ~2× faster retrieval
254
- - Separable Dynamic Tanh normalization
255
- """)
256
-
257
- with gr.Tabs():
258
-
259
- # Standard
260
- with gr.Tab("Standard Search"):
261
-
262
- query1 = gr.Textbox(
263
- value="What is Stable Static Embedding?",
264
- label="Enter your search query"
265
- )
266
-
267
- btn1 = gr.Button("Search")
268
-
269
- with gr.Column(elem_classes="result-card"):
270
- out1 = gr.Markdown()
271
-
272
- btn1.click(
273
- semantic_web_search,
274
- inputs=query1,
275
- outputs=out1
276
- )
277
-
278
- # deep
279
- with gr.Tab("Deep Search"):
280
-
281
- query2 = gr.Textbox(
282
- value="What is Stable Static Embedding?",
283
- label="Enter your search query"
284
- )
285
-
286
- threshold = gr.Slider(
287
- 0.3, 0.95, value=0.7, step=0.05,
288
- label="Score Threshold"
289
- )
290
-
291
- btn2 = gr.Button("Run Deep Search")
292
-
293
- with gr.Column(elem_classes="result-card"):
294
- out2 = gr.Markdown()
295
-
296
- btn2.click(
297
- progressive_search,
298
- inputs=[query2, threshold],
299
- outputs=out2
300
- )
301
-
302
- gr.Markdown("© 2026 Rikka Botan")
303
-
304
- demo.launch()
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from sentence_transformers import SentenceTransformer
4
+ from ddgs import DDGS
5
+ import time
6
+
7
+ # Load Model
8
+ model = SentenceTransformer(
9
+ "RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en",
10
+ trust_remote_code=True,
11
+ device="cuda" if torch.cuda.is_available() else "cpu"
12
+ )
13
+
14
+
15
+ # Web Search with error handling
16
def web_search(query, max_results=100):
    """Fetch text search results for ``query`` through ``DDGS``.

    This is a best-effort helper: any failure while creating the client,
    running the search, or reading a single result is printed and skipped
    instead of being raised, so callers always get a (possibly empty) list.

    Args:
        query: Search string passed to ``DDGS.text``.
        max_results: Upper bound on results requested from ``DDGS.text``.

    Returns:
        A list of dicts with the string keys ``"title"``, ``"body"`` and
        ``"href"``. Missing or null fields are normalised to ``""`` so callers
        can concatenate them safely.
    """
    results = []
    try:
        # The client is created inside the ``try`` so that a failure in the
        # constructor or in context entry is handled like any other batch
        # failure instead of escaping to the caller.
        with DDGS() as ddgs:
            hits = ddgs.text(query, max_results=max_results)
            for i, r in enumerate(hits, start=1):
                try:
                    results.append({
                        # ``or ""`` also covers keys that are present but None.
                        "title": r.get("title") or "",
                        "body": r.get("body") or "",
                        "href": r.get("href") or "",
                    })
                except Exception as e:
                    # Deliberate best-effort: one malformed result must not
                    # discard the rest of the batch.
                    print(f"Skip doc {i}: {e}")
    except Exception as e:
        # Deliberate best-effort boundary: keep whatever was collected so far.
        print(f"Skip web batch (max={max_results}): {e}")
    return results
32
+
33
+
34
+ # Standard Semantic Search
35
def semantic_web_search(query):
    """Run one web search and rank the results by embedding similarity.

    Fetches up to 100 results with ``web_search``, embeds the query together
    with each result's ``title + " " + body`` using the module-level
    ``model``, scores every result by the dot product of the normalised
    embeddings, and renders the 30 best as markdown.

    Args:
        query: Search text entered by the user.

    Returns:
        A markdown string: either the ranked results or a short message when
        the query is blank or nothing could be retrieved.
    """
    # ``not query`` also guards against a None value, which has no ``strip``.
    if not query or not query.strip():
        return "Please enter a search query."

    docs = web_search(query, max_results=100)
    if not docs:
        # Without this guard the function returned an empty string, leaving
        # the result panel blank with no explanation.
        return "No results found. Please try a different query or try again later."

    texts = [d["title"] + " " + d["body"] for d in docs]

    with torch.no_grad():
        embeddings = model.encode(
            [query] + texts[:256],
            convert_to_tensor=True,
            normalize_embeddings=True,
        )

    # Row 0 is the query; the remaining rows line up with ``docs``.
    query_emb = embeddings[0]
    doc_embs = embeddings[1:]
    scores = (query_emb @ doc_embs.T).cpu().numpy()

    # ``zip`` stops at the shorter input, so only scored documents are ranked.
    ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:30]

    sections = [
        f"\n#### 💎 Rank {rank}\n"
        f"\n[{d['title']}]({d['href']})\n"
        f"\n**Score:** `{score:.4f}`\n"
        f"\n{d['body']}\n"
        f"\n---\n"
        for rank, (score, d) in enumerate(ranked, start=1)
    ]
    return "".join(sections)
69
+
70
+
71
+ # Progressive Threshold Search with progress
72
def progressive_search(query, threshold=0.7, step=50, max_cap=999):
    """Widen the web search step by step until a result clears ``threshold``.

    This is a generator: every yielded string is a complete markdown snapshot
    (progress line, skip notice, or final ranking).

    Each round requests ``current_k`` results (starting at ``step`` and
    growing by ``step`` while ``current_k <= max_cap``), embeds the query and
    the result texts with the module-level ``model``, and scores them by dot
    product of the normalised embeddings. The search stops as soon as the
    best score reaches ``threshold``; otherwise the top results of the last
    successfully scored batch are reported.

    Args:
        query: Search text entered by the user.
        threshold: Minimum best score that ends the search early.
        step: Initial request size and per-round increment.
        max_cap: Largest request size that will be attempted.

    Yields:
        Markdown strings describing progress and, last, the outcome.
    """

    def _render(ranked):
        # One markdown section per (score, doc) pair, ranks starting at 1.
        return "".join(
            f"\n#### Rank {rank}\n"
            f"\n[{d['title']}]({d['href']})\n"
            f"\n**Score:** `{score:.4f}`\n"
            f"\n{d['body']}\n"
            f"\n---\n"
            for rank, (score, d) in enumerate(ranked, start=1)
        )

    # ``not query`` also guards against a None value, which has no ``strip``.
    if not query or not query.strip():
        yield "Please enter a search query."
        return

    current_k = step
    # Last batch that was actually scored. Kept separate from the per-round
    # fetch so a later empty or failed batch cannot discard earlier results,
    # and so the fallback below never touches an unbound name.
    scores, docs = [], []
    first_round = True

    while current_k <= max_cap:
        # Pause between consecutive requests, including after a failed or
        # empty batch, but never before the first request or after the last.
        if not first_round:
            time.sleep(1)
        first_round = False

        try:
            batch = web_search(query, max_results=current_k)
        except Exception as e:
            yield f"Skipped batch {current_k} due to error: {e}"
            current_k += step
            continue

        if not batch:
            yield f"No documents fetched for {current_k} results"
            current_k += step
            continue

        # Only the first 256 documents are embedded, so keep exactly those;
        # the reported count then matches what was really scored.
        docs = batch[:256]
        texts = [d["title"] + " " + d["body"] for d in docs]

        with torch.no_grad():
            embeddings = model.encode(
                [query] + texts,
                convert_to_tensor=True,
                normalize_embeddings=True,
            )

        # Row 0 is the query; the remaining rows line up with ``docs``.
        query_emb = embeddings[0]
        doc_embs = embeddings[1:]
        scores = (query_emb @ doc_embs.T).cpu().numpy()
        best_score = float(scores.max())

        yield (
            f"### Searching…\n- Documents examined: `{len(docs)}`\n"
            f"- Best score so far: `{best_score:.4f}`\n"
        )

        if best_score >= threshold:
            ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
            yield "### Threshold reached!\n" + _render(ranked)
            return

        current_k += step

    if not docs:
        # Nothing was ever scored (every batch failed or came back empty, or
        # the loop never ran because step > max_cap).
        yield "No documents could be retrieved, so no ranking is available. Please try again later."
        return

    ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
    yield "### Threshold not reached in max search range.\n" + _render(ranked)
144
+
145
+
146
+
147
+ # UI
148
+ pastel_css = """
149
+ body {
150
+ background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
151
+ }
152
+
153
+ /* gradient headings */
154
+ h1, h2, h3, h4 {
155
+ background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
156
+ -webkit-background-clip: text;
157
+ -webkit-text-fill-color: transparent;
158
+ font-weight: 800;
159
+ letter-spacing: 0.4px;
160
+ padding: 4px;
161
+ }
162
+
163
+ /* optional: slightly softer subtitle tone */
164
+ h2, h3 {
165
+ opacity: 0.9;
166
+ }
167
+
168
+
169
+ .gradio-container {
170
+ font-family: 'Helvetica Neue', sans-serif;
171
+ color: #1e3a8a;
172
+ }
173
+
174
+ /* model card */
175
+ .model-card {
176
+ background: #ffffff;
177
+ border-radius: 18px;
178
+ padding: 22px;
179
+ border: 1px solid #dbeafe;
180
+ box-shadow: 0 12px 20px rgba(60,120,255,0.18);
181
+ margin-bottom: 20px;
182
+ }
183
+
184
+ /* result card */
185
+ .result-card {
186
+ background: #ffffff;
187
+ border-radius: 18px;
188
+ padding: 22px;
189
+ border: 1px solid #dbeafe;
190
+ box-shadow: 0 12px 20px rgba(60,120,255,0.18);
191
+ }
192
+
193
+ .gr-markdown, .prose {
194
+ border: none !important;
195
+ box-shadow: none !important;
196
+ padding: 0 !important;
197
+ }
198
+
199
+ textarea, input {
200
+ border-radius: 12px !important;
201
+ border: 1px solid #c7ddff !important;
202
+ background-color: #f5f9ff !important;
203
+ color: #1e3a8a !important;
204
+ }
205
+
206
+ button {
207
+ background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
208
+ color: #ffffff !important;
209
+ border-radius: 14px !important;
210
+ border: 1px solid #93c5fd !important;
211
+ font-weight: 600;
212
+ letter-spacing: 0.3px;
213
+
214
+ box-shadow:
215
+ 0 6px 14px rgba(60,120,255,0.28),
216
+ inset 0 1px 0 rgba(255,255,255,0.6);
217
+
218
+ transition: all 0.25s ease;
219
+ }
220
+
221
+ button:hover {
222
+ background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
223
+ box-shadow:
224
+ 0 8px 18px rgba(60,120,255,0.35),
225
+ inset 0 1px 0 rgba(255,255,255,0.7);
226
+ transform: translateY(-1px);
227
+ }
228
+
229
+ button:active {
230
+ transform: translateY(1px);
231
+ box-shadow:
232
+ 0 3px 8px rgba(60,120,255,0.2),
233
+ inset 0 2px 4px rgba(0,0,0,0.08);
234
+ }
235
+
236
+ """
237
+
238
+ with gr.Blocks(css=pastel_css) as demo:
239
+
240
+ gr.Markdown('# Semantic Web Search and Deep Web Search')
241
+ gr.Markdown('## Fast Retrieval with Stable Static Embedding')
242
+
243
+ with gr.Column(elem_classes="model-card"):
244
+ gr.Markdown("""
245
+ ## About this Model
246
+ **RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
247
+
248
+ ### Performance
249
+ - **NanoBEIR NDCG@10 = 0.5124**
250
+ - Higher than other static embedding models
251
+
252
+ ### Efficiency
253
+ - 512 dimensions
254
+ - ~2× faster retrieval
255
+ - Separable Dynamic Tanh normalization
256
+ """)
257
+
258
+ with gr.Tabs():
259
+
260
+ # Standard
261
+ with gr.Tab("Standard Search"):
262
+
263
+ query1 = gr.Textbox(
264
+ value="What is Stable Static Embedding?",
265
+ label="Enter your search query"
266
+ )
267
+
268
+ btn1 = gr.Button("Search")
269
+
270
+ with gr.Column(elem_classes="result-card"):
271
+ out1 = gr.Markdown()
272
+
273
+ btn1.click(
274
+ semantic_web_search,
275
+ inputs=query1,
276
+ outputs=out1,
277
+
278
+ )
279
+
280
+ # deep
281
+ with gr.Tab("Deep Search"):
282
+
283
+ query2 = gr.Textbox(
284
+ value="What is Stable Static Embedding?",
285
+ label="Enter your search query"
286
+ )
287
+
288
+ threshold = gr.Slider(
289
+ 0.3, 0.95, value=0.7, step=0.05,
290
+ label="Score Threshold"
291
+ )
292
+
293
+ btn2 = gr.Button("Run Deep Search")
294
+
295
+ with gr.Column(elem_classes="result-card"):
296
+ out2 = gr.Markdown()
297
+
298
+ btn2.click(
299
+ progressive_search,
300
+ inputs=[query2, threshold],
301
+ outputs=out2,
302
+ show_progress=True,
303
+ )
304
+
305
+ gr.Markdown("© 2026 Rikka Botan")
306
+
307
+ demo.launch()