Spaces:

RikkaBotan
/

Stable-Static-Embedding-Semantic-Web-Search

Running

App Files Files Community

RikkaBotan commited on 1 day ago

Commit

577d2bb

verified ·

1 Parent(s): 13f7edd

Update app.py

Browse files

Files changed (1) hide show

app.py +307 -304

app.py CHANGED Viewed

@@ -1,304 +1,307 @@
-import gradio as gr
-import torch
-from sentence_transformers import SentenceTransformer
-from ddgs import DDGS
-# Load Model
-model = SentenceTransformer(
-    "RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en",
-    trust_remote_code=True,
-    device="cuda" if torch.cuda.is_available() else "cpu"
-)
-# Web Search
-def web_search(query, max_results=100):
-    results = []
-    with DDGS() as ddgs:
-        for r in ddgs.text(query, max_results=max_results):
-            results.append({
-                "title": r.get("title", ""),
-                "body": r.get("body", ""),
-                "href": r.get("href", "")
-            })
-    return results
-# Standard Semantic Search
-def semantic_web_search(query):
-    if query.strip() == "":
-        return "Please enter a search query."
-    docs = web_search(query, max_results=100)
-    texts = [d["title"] + " " + d["body"] for d in docs]
-    with torch.no_grad():
-        embeddings = model.encode(
-            [query] + texts[:256],
-            convert_to_tensor=True,
-            normalize_embeddings=True
-        )
-    query_emb = embeddings[0]
-    doc_embs = embeddings[1:]
-    scores = (query_emb @ doc_embs.T).cpu().numpy()
-    ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:30]
-    md = ""
-    for i, (score, d) in enumerate(ranked):
-        md += f"""
-#### 💎 Rank {i+1}
-[{d['title']}]({d['href']})
-**Score:** `{score:.4f}`
-{d['body']}
----
-"""
-    return md
-# Progressive Threshold Search
-def progressive_search(query, threshold=0.7, step=50, max_cap=999):
-    if query.strip() == "":
-        return "Please enter a search query."
-    current_k = step
-    best_score = 0.0
-    while current_k <= max_cap:
-        docs = web_search(query, max_results=current_k)
-        texts = [d["title"] + " " + d["body"] for d in docs]
-        with torch.no_grad():
-            embeddings = model.encode(
-                [query] + texts[:256],
-                convert_to_tensor=True,
-                normalize_embeddings=True
-            )
-        query_emb = embeddings[0]
-        doc_embs = embeddings[1:]
-        scores = (query_emb @ doc_embs.T).cpu().numpy()
-        best_score = float(scores.max())
-        if best_score >= threshold:
-            ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
-            md = f"""
-#### Threshold Reached
-- Threshold: `{threshold}`
-- **Best Score:** `{best_score:.4f}`
-- Documents Searched: `{len(docs)}`
----
-"""
-            for i, (score, d) in enumerate(ranked):
-                md += f"""
-#### Rank {i+1}
-[{d['title']}]({d['href']})
-**Score:** `{score:.4f}`
-{d['body']}
----
-"""
-            return md
-        current_k += step
-        ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
-        md = f"""
-#### Threshold Not Reached ๐·°(৹˃ᗝ˂৹)°·๐
-- Threshold: `{threshold}`
-- **Best Score:** `{best_score:.4f}`
-- Documents Searched: `{current_k}`
-"""
-        for i, (score, d) in enumerate(ranked):
-            md += f"""
-#### Rank {i+1}
-[{d['title']}]({d['href']})
-**Score:** `{score:.4f}`
-{d['body']}
----
-"""
-    return md
-# UI
-pastel_css = """
-body {
-    background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
-}
-/* gradient headings */
-h1, h2, h3, h4 {
-    background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: transparent;
-    font-weight: 800;
-    letter-spacing: 0.4px;
-    padding: 4px;
-}
-/* optional: slightly softer subtitle tone */
-h2, h3 {
-    opacity: 0.9;
-}
-.gradio-container {
-    font-family: 'Helvetica Neue', sans-serif;
-    color: #1e3a8a;
-}
-/* model card */
-.model-card {
-    background: #ffffff;
-    border-radius: 18px;
-    padding: 22px;
-    border: 1px solid #dbeafe;
-    box-shadow: 0 12px 20px rgba(60,120,255,0.18);
-    margin-bottom: 20px;
-}
-/* result card */
-.result-card {
-    background: #ffffff;
-    border-radius: 18px;
-    padding: 22px;
-    border: 1px solid #dbeafe;
-    box-shadow: 0 12px 20px rgba(60,120,255,0.18);
-}
-.gr-markdown, .prose {
-    border: none !important;
-    box-shadow: none !important;
-    padding: 0 !important;
-}
-textarea, input {
-    border-radius: 12px !important;
-    border: 1px solid #c7ddff !important;
-    background-color: #f5f9ff !important;
-    color: #1e3a8a !important;
-}
-button {
-    background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
-    color: #ffffff !important;
-    border-radius: 14px !important;
-    border: 1px solid #93c5fd !important;
-    font-weight: 600;
-    letter-spacing: 0.3px;
-    box-shadow:
-        0 6px 14px rgba(60,120,255,0.28),
-        inset 0 1px 0 rgba(255,255,255,0.6);
-    transition: all 0.25s ease;
-}
-button:hover {
-    background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
-    box-shadow:
-        0 8px 18px rgba(60,120,255,0.35),
-        inset 0 1px 0 rgba(255,255,255,0.7);
-    transform: translateY(-1px);
-}
-button:active {
-    transform: translateY(1px);
-    box-shadow:
-        0 3px 8px rgba(60,120,255,0.2),
-        inset 0 2px 4px rgba(0,0,0,0.08);
-}
-"""
-with gr.Blocks(css=pastel_css) as demo:
-    gr.Markdown('# Semantic Web Search and Deep Web Search')
-    gr.Markdown('## Fast Retrieval with Stable Static Embedding')
-    with gr.Column(elem_classes="model-card"):
-        gr.Markdown("""
-## About this Model
-**RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
-### Performance
-- **NanoBEIR NDCG@10 = 0.5124**
-- Higher than other static embedding models
-### Efficiency
-- 512 dimensions
-- ~2× faster retrieval
-- Separable Dynamic Tanh normalization
-""")
-    with gr.Tabs():
-        # Standard
-        with gr.Tab("Standard Search"):
-            query1 = gr.Textbox(
-                value="What is Stable Static Embedding?",
-                label="Enter your search query"
-            )
-            btn1 = gr.Button("Search")
-            with gr.Column(elem_classes="result-card"):
-                out1 = gr.Markdown()
-            btn1.click(
-                semantic_web_search,
-                inputs=query1,
-                outputs=out1
-            )
-        # deep
-        with gr.Tab("Deep Search"):
-            query2 = gr.Textbox(
-                value="What is Stable Static Embedding?",
-                label="Enter your search query"
-            )
-            threshold = gr.Slider(
-                0.3, 0.95, value=0.7, step=0.05,
-                label="Score Threshold"
-            )
-            btn2 = gr.Button("Run Deep Search")
-            with gr.Column(elem_classes="result-card"):
-                out2 = gr.Markdown()
-            btn2.click(
-                progressive_search,
-                inputs=[query2, threshold],
-                outputs=out2
-            )
-    gr.Markdown("© 2026 Rikka Botan")
-demo.launch()

+import gradio as gr
+import torch
+from sentence_transformers import SentenceTransformer
+from ddgs import DDGS
+import time
+# Load Model
+model = SentenceTransformer(
+    "RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en",
+    trust_remote_code=True,
+    device="cuda" if torch.cuda.is_available() else "cpu"
+)
+# Web Search with error handling
+def web_search(query, max_results=100):
+    results = []
+    with DDGS() as ddgs:
+        try:
+            for i, r in enumerate(ddgs.text(query, max_results=max_results), start=1):
+                try:
+                    results.append({
+                        "title": r.get("title", ""),
+                        "body": r.get("body", ""),
+                        "href": r.get("href", "")
+                    })
+                except Exception as e:
+                    print(f"Skip doc {i}: {e}")
+        except Exception as e:
+            print(f"Skip web batch (max={max_results}): {e}")
+    return results
+# Standard Semantic Search
+def semantic_web_search(query):
+    if query.strip() == "":
+        return "Please enter a search query."
+    docs = web_search(query, max_results=100)
+    texts = [d["title"] + " " + d["body"] for d in docs]
+    with torch.no_grad():
+        embeddings = model.encode(
+            [query] + texts[:256],
+            convert_to_tensor=True,
+            normalize_embeddings=True
+        )
+    query_emb = embeddings[0]
+    doc_embs = embeddings[1:]
+    scores = (query_emb @ doc_embs.T).cpu().numpy()
+    ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:30]
+    md = ""
+    for i, (score, d) in enumerate(ranked):
+        md += f"""
+#### 💎 Rank {i+1}
+[{d['title']}]({d['href']})
+**Score:** `{score:.4f}`
+{d['body']}
+---
+"""
+    return md
+# Progressive Threshold Search with progress
+def progressive_search(query, threshold=0.7, step=50, max_cap=999):
+    if query.strip() == "":
+        yield "Please enter a search query."
+        return
+    current_k = step
+    while current_k <= max_cap:
+        try:
+            docs = web_search(query, max_results=current_k)
+        except Exception as e:
+            yield f"Skipped batch {current_k} due to error: {e}"
+            current_k += step
+            continue
+        if len(docs) == 0:
+            yield f"No documents fetched for {current_k} results"
+            current_k += step
+            continue
+        texts = [d["title"] + " " + d["body"] for d in docs]
+        with torch.no_grad():
+            embeddings = model.encode(
+                [query] + texts[:256],
+                convert_to_tensor=True,
+                normalize_embeddings=True
+            )
+        query_emb = embeddings[0]
+        doc_embs = embeddings[1:]
+        scores = (query_emb @ doc_embs.T).cpu().numpy()
+        best_score = float(scores.max())
+        md = f"### Searching…\n- Documents examined: `{len(docs)}`\n- Best score so far: `{best_score:.4f}`\n"
+        yield md
+        if best_score >= threshold:
+            ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
+            md = f"### Threshold reached!\n"
+            for i, (score, d) in enumerate(ranked):
+                md += f"""
+#### Rank {i+1}
+[{d['title']}]({d['href']})
+**Score:** `{score:.4f}`
+{d['body']}
+---
+"""
+            yield md
+            return
+        current_k += step
+        time.sleep(1)
+        ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
+        md = f"### Threshold not reached in max search range.\n"
+        for i, (score, d) in enumerate(ranked):
+            md += f"""
+#### Rank {i+1}
+[{d['title']}]({d['href']})
+**Score:** `{score:.4f}`
+{d['body']}
+---
+"""
+    yield md
+# UI
+pastel_css = """
+body {
+    background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
+}
+/* gradient headings */
+h1, h2, h3, h4 {
+    background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    font-weight: 800;
+    letter-spacing: 0.4px;
+    padding: 4px;
+}
+/* optional: slightly softer subtitle tone */
+h2, h3 {
+    opacity: 0.9;
+}
+.gradio-container {
+    font-family: 'Helvetica Neue', sans-serif;
+    color: #1e3a8a;
+}
+/* model card */
+.model-card {
+    background: #ffffff;
+    border-radius: 18px;
+    padding: 22px;
+    border: 1px solid #dbeafe;
+    box-shadow: 0 12px 20px rgba(60,120,255,0.18);
+    margin-bottom: 20px;
+}
+/* result card */
+.result-card {
+    background: #ffffff;
+    border-radius: 18px;
+    padding: 22px;
+    border: 1px solid #dbeafe;
+    box-shadow: 0 12px 20px rgba(60,120,255,0.18);
+}
+.gr-markdown, .prose {
+    border: none !important;
+    box-shadow: none !important;
+    padding: 0 !important;
+}
+textarea, input {
+    border-radius: 12px !important;
+    border: 1px solid #c7ddff !important;
+    background-color: #f5f9ff !important;
+    color: #1e3a8a !important;
+}
+button {
+    background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
+    color: #ffffff !important;
+    border-radius: 14px !important;
+    border: 1px solid #93c5fd !important;
+    font-weight: 600;
+    letter-spacing: 0.3px;
+    box-shadow:
+        0 6px 14px rgba(60,120,255,0.28),
+        inset 0 1px 0 rgba(255,255,255,0.6);
+    transition: all 0.25s ease;
+}
+button:hover {
+    background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
+    box-shadow:
+        0 8px 18px rgba(60,120,255,0.35),
+        inset 0 1px 0 rgba(255,255,255,0.7);
+    transform: translateY(-1px);
+}
+button:active {
+    transform: translateY(1px);
+    box-shadow:
+        0 3px 8px rgba(60,120,255,0.2),
+        inset 0 2px 4px rgba(0,0,0,0.08);
+}
+"""
+with gr.Blocks(css=pastel_css) as demo:
+    gr.Markdown('# Semantic Web Search and Deep Web Search')
+    gr.Markdown('## Fast Retrieval with Stable Static Embedding')
+    with gr.Column(elem_classes="model-card"):
+        gr.Markdown("""
+## About this Model
+**RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
+### Performance
+- **NanoBEIR NDCG@10 = 0.5124**
+- Higher than other static embedding models
+### Efficiency
+- 512 dimensions
+- ~2× faster retrieval
+- Separable Dynamic Tanh normalization
+""")
+    with gr.Tabs():
+        # Standard
+        with gr.Tab("Standard Search"):
+            query1 = gr.Textbox(
+                value="What is Stable Static Embedding?",
+                label="Enter your search query"
+            )
+            btn1 = gr.Button("Search")
+            with gr.Column(elem_classes="result-card"):
+                out1 = gr.Markdown()
+            btn1.click(
+                semantic_web_search,
+                inputs=query1,
+                outputs=out1,
+            )
+        # deep
+        with gr.Tab("Deep Search"):
+            query2 = gr.Textbox(
+                value="What is Stable Static Embedding?",
+                label="Enter your search query"
+            )
+            threshold = gr.Slider(
+                0.3, 0.95, value=0.7, step=0.05,
+                label="Score Threshold"
+            )
+            btn2 = gr.Button("Run Deep Search")
+            with gr.Column(elem_classes="result-card"):
+                out2 = gr.Markdown()
+            btn2.click(
+                progressive_search,
+                inputs=[query2, threshold],
+                outputs=out2,
+                show_progress=True,
+            )
+    gr.Markdown("© 2026 Rikka Botan")
+demo.launch()