RikkaBotan committed on
Commit
02f6fec
·
verified ·
1 Parent(s): 8524691

Upload sse_deep_research.py

Browse files
Files changed (1) hide show
  1. sse_deep_research.py +304 -0
sse_deep_research.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
from sentence_transformers import SentenceTransformer
from ddgs import DDGS

# Load Model
# Static-embedding retrieval model used for all query/document scoring below.
# Runs on GPU when one is available, otherwise CPU.
model = SentenceTransformer(
    "RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en",
    # NOTE(review): trust_remote_code executes code from the Hub repo at load
    # time -- verify the repository before deploying this app.
    trust_remote_code=True,
    device="cuda" if torch.cuda.is_available() else "cpu"
)
12
+
13
+
14
# Web Search
def web_search(query, max_results=100):
    """Fetch up to ``max_results`` DuckDuckGo text hits for ``query``.

    Returns a list of dicts with ``"title"``, ``"body"`` and ``"href"``
    keys; any field missing from a hit defaults to an empty string.
    """
    with DDGS() as ddgs:
        hits = ddgs.text(query, max_results=max_results)
        return [
            {
                "title": hit.get("title", ""),
                "body": hit.get("body", ""),
                "href": hit.get("href", ""),
            }
            for hit in hits
        ]
25
+
26
+
27
# Standard Semantic Search
def semantic_web_search(query):
    """Search the web and re-rank hits by semantic similarity to *query*.

    Fetches up to 100 DuckDuckGo results, embeds the query and each
    result's title+body with the static-embedding model, and returns the
    top 30 hits rendered as a markdown string (or a short message when
    the query is blank or no results were found).
    """
    if query.strip() == "":
        return "Please enter a search query."

    docs = web_search(query, max_results=100)
    # Fix: the original produced an empty markdown string for zero hits,
    # which renders as a blank panel with no explanation.
    if not docs:
        return "No search results were found for this query."

    # Only the first 256 documents are embedded; slice docs to match so
    # zip(scores, docs) below stays strictly one-to-one.
    docs = docs[:256]
    texts = [d["title"] + " " + d["body"] for d in docs]

    with torch.no_grad():
        embeddings = model.encode(
            [query] + texts,
            convert_to_tensor=True,
            normalize_embeddings=True
        )

    query_emb = embeddings[0]
    doc_embs = embeddings[1:]
    # Embeddings are L2-normalized, so the dot product is cosine similarity.
    scores = (query_emb @ doc_embs.T).cpu().numpy()

    # Sort key is the scalar score only, so dicts are never compared.
    ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:30]

    md = ""
    for i, (score, d) in enumerate(ranked):
        md += f"""
#### 💎 Rank {i+1}
[{d['title']}]({d['href']})

**Score:** `{score:.4f}`

{d['body']}

---
"""
    return md
61
+
62
+
63
# Progressive Threshold Search
def progressive_search(query, threshold=0.7, step=50, max_cap=999):
    """Grow the result pool until the best similarity reaches *threshold*.

    Repeatedly fetches ``current_k`` web results (starting at *step*,
    increasing by *step* up to *max_cap*), embeds them, and stops as soon
    as the best cosine score meets *threshold*. Returns a markdown report
    with the top 5 hits, whether or not the threshold was reached.
    """
    if query.strip() == "":
        return "Please enter a search query."

    def _render_ranked(ranked):
        # Render a ranked [(score, doc), ...] list as markdown cards.
        out = ""
        for i, (score, d) in enumerate(ranked):
            out += f"""
#### Rank {i+1}

[{d['title']}]({d['href']})

**Score:** `{score:.4f}`

{d['body']}

---
"""
        return out

    current_k = step
    best_score = 0.0
    # Fix: the original left `scores`/`docs` undefined (NameError) when the
    # loop body never ran (step > max_cap), and `scores.max()` raised
    # ValueError when the search returned zero documents.
    scores = None
    docs = []

    while current_k <= max_cap:
        docs = web_search(query, max_results=current_k)
        # Only the first 256 documents are embedded; slice docs to match so
        # zip(scores, docs) below stays strictly one-to-one.
        docs = docs[:256]
        texts = [d["title"] + " " + d["body"] for d in docs]

        if not texts:
            # No results at all -- asking for a larger k will not help.
            break

        with torch.no_grad():
            embeddings = model.encode(
                [query] + texts,
                convert_to_tensor=True,
                normalize_embeddings=True
            )

        query_emb = embeddings[0]
        doc_embs = embeddings[1:]
        # Normalized embeddings: dot product == cosine similarity.
        scores = (query_emb @ doc_embs.T).cpu().numpy()

        best_score = float(scores.max())

        if best_score >= threshold:
            ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
            md = f"""
#### Threshold Reached

- Threshold: `{threshold}`

- **Best Score:** `{best_score:.4f}`

- Documents Searched: `{len(docs)}`

---
"""
            return md + _render_ranked(ranked)

        current_k += step

    if scores is None:
        return "No search results were found for this query."

    ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
    # Fix: report the number of documents actually fetched; the original
    # reported `current_k`, which had already been incremented past max_cap.
    md = f"""
#### Threshold Not Reached ๐·°(৹˃ᗝ˂৹)°·๐

- Threshold: `{threshold}`

- **Best Score:** `{best_score:.4f}`

- Documents Searched: `{len(docs)}`
"""
    return md + _render_ranked(ranked)
144
+
145
+
146
# UI
# Pastel-blue theme injected into Gradio via Blocks(css=...). The whole
# stylesheet is one Python string literal; selectors target Gradio's
# default DOM plus the custom .model-card / .result-card classes below.
pastel_css = """
body {
    background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
}

/* gradient headings */
h1, h2, h3, h4 {
    background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-weight: 800;
    letter-spacing: 0.4px;
    padding: 4px;
}

/* optional: slightly softer subtitle tone */
h2, h3 {
    opacity: 0.9;
}


.gradio-container {
    font-family: 'Helvetica Neue', sans-serif;
    color: #1e3a8a;
}

/* model card */
.model-card {
    background: #ffffff;
    border-radius: 18px;
    padding: 22px;
    border: 1px solid #dbeafe;
    box-shadow: 0 12px 20px rgba(60,120,255,0.18);
    margin-bottom: 20px;
}

/* result card */
.result-card {
    background: #ffffff;
    border-radius: 18px;
    padding: 22px;
    border: 1px solid #dbeafe;
    box-shadow: 0 12px 20px rgba(60,120,255,0.18);
}

.gr-markdown, .prose {
    border: none !important;
    box-shadow: none !important;
    padding: 0 !important;
}

textarea, input {
    border-radius: 12px !important;
    border: 1px solid #c7ddff !important;
    background-color: #f5f9ff !important;
    color: #1e3a8a !important;
}

button {
    background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
    color: #ffffff !important;
    border-radius: 14px !important;
    border: 1px solid #93c5fd !important;
    font-weight: 600;
    letter-spacing: 0.3px;

    box-shadow:
        0 6px 14px rgba(60,120,255,0.28),
        inset 0 1px 0 rgba(255,255,255,0.6);

    transition: all 0.25s ease;
}

button:hover {
    background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
    box-shadow:
        0 8px 18px rgba(60,120,255,0.35),
        inset 0 1px 0 rgba(255,255,255,0.7);
    transform: translateY(-1px);
}

button:active {
    transform: translateY(1px);
    box-shadow:
        0 3px 8px rgba(60,120,255,0.2),
        inset 0 2px 4px rgba(0,0,0,0.08);
}

"""

with gr.Blocks(css=pastel_css) as demo:

    # Page headings.
    gr.Markdown('# Semantic Web Search and Deep Web Search')
    gr.Markdown('## Fast Retrieval with Stable Static Embedding')

    # Static model-description card (styled by .model-card above).
    with gr.Column(elem_classes="model-card"):
        gr.Markdown("""
## About this Model
**RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**

### Performance
- **NanoBEIR NDCG@10 = 0.5124**
- Higher than other static embedding models

### Efficiency
- 512 dimensions
- ~2× faster retrieval
- Separable Dynamic Tanh normalization
""")

    with gr.Tabs():

        # Standard: single fetch of 100 results, semantic re-rank, top 30.
        with gr.Tab("Standard Search"):

            query1 = gr.Textbox(
                value="What is Stable Static Embedding?",
                label="Enter your search query"
            )

            btn1 = gr.Button("Search")

            with gr.Column(elem_classes="result-card"):
                out1 = gr.Markdown()

            # Wire button -> semantic_web_search; output is markdown.
            btn1.click(
                semantic_web_search,
                inputs=query1,
                outputs=out1
            )

        # deep: progressive search that grows the result pool until the
        # slider's similarity threshold is met, then shows the top 5.
        with gr.Tab("Deep Search"):

            query2 = gr.Textbox(
                value="What is Stable Static Embedding?",
                label="Enter your search query"
            )

            # Slider value is passed as progressive_search's `threshold`.
            threshold = gr.Slider(
                0.3, 0.95, value=0.7, step=0.05,
                label="Score Threshold"
            )

            btn2 = gr.Button("Run Deep Search")

            with gr.Column(elem_classes="result-card"):
                out2 = gr.Markdown()

            btn2.click(
                progressive_search,
                inputs=[query2, threshold],
                outputs=out2
            )

    gr.Markdown("© 2026 Rikka Botan")

# NOTE(review): launches unconditionally on import (no __main__ guard) --
# conventional for Hugging Face Spaces entry points.
demo.launch()