Stable-Static-Embedding-Semantic-Web-Search-Japanese

Running

App Files Community

RikkaBotan committed on Feb 20

Commit

f347c89

verified ·

1 Parent(s): e364ce1

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -38

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import torch
 from sentence_transformers import SentenceTransformer
 from ddgs import DDGS
 import time
 # Load Model
 model = SentenceTransformer(
@@ -56,19 +57,14 @@ def semantic_web_search(query):
     for i, (score, d) in enumerate(ranked):
         md += f"""
 #### 💎 Rank {i+1}
 [{d['title']}]({d['href']})
 **Score:** `{score:.4f}`
 {d['body']}
 ---
 """
     return md
-# Progressive Threshold Search with progress
 def progressive_search(query, threshold=0.7, step=50, max_cap=999):
     if query.strip() == "":
         yield "Please enter a search query."
@@ -76,6 +72,12 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
     current_k = step
     while current_k <= max_cap:
         try:
             docs = web_search(query, max_results=current_k)
@@ -89,7 +91,20 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
             current_k += step
             continue
-        texts = [d["title"] + " " + d["body"] for d in docs]
         with torch.no_grad():
             embeddings = model.encode(
@@ -100,15 +115,30 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
         query_emb = embeddings[0]
         doc_embs = embeddings[1:]
-        scores = (query_emb @ doc_embs.T).cpu().numpy()
-        best_score = float(scores.max())
-        md = f"### Searching…\n- Documents examined: `{len(docs)}`\n- Best score so far: `{best_score:.4f}`\n"
         yield md
         if best_score >= threshold:
-            ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
-            md = f"### Threshold reached!\n"
             for i, (score, d) in enumerate(ranked):
                 md += f"""
 #### Rank {i+1}
@@ -126,10 +156,17 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
         current_k += step
         time.sleep(1)
-        ranked = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)[:5]
-        md = f"### Threshold not reached in max search range.\n"
-        for i, (score, d) in enumerate(ranked):
-            md += f"""
 #### Rank {i+1}
 [{d['title']}]({d['href']})
@@ -140,8 +177,8 @@ def progressive_search(query, threshold=0.7, step=50, max_cap=999):
 ---
 """
-    yield md
 # UI
@@ -149,7 +186,6 @@ pastel_css = """
 body {
     background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
 }
 /* gradient headings */
 h1, h2, h3, h4 {
     background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
@@ -159,18 +195,14 @@ h1, h2, h3, h4 {
     letter-spacing: 0.4px;
     padding: 4px;
 }
 /* optional: slightly softer subtitle tone */
 h2, h3 {
     opacity: 0.9;
 }
 .gradio-container {
     font-family: 'Helvetica Neue', sans-serif;
     color: #1e3a8a;
 }
 /* model card */
 .model-card {
     background: #ffffff;
@@ -180,7 +212,6 @@ h2, h3 {
     box-shadow: 0 12px 20px rgba(60,120,255,0.18);
     margin-bottom: 20px;
 }
 /* result card */
 .result-card {
     background: #ffffff;
@@ -189,51 +220,42 @@ h2, h3 {
     border: 1px solid #dbeafe;
     box-shadow: 0 12px 20px rgba(60,120,255,0.18);
 }
 .gr-markdown, .prose {
     border: none !important;
     box-shadow: none !important;
     padding: 0 !important;
     color: #1e3a8a !important;
 }
 .model-card, .result-card {
     background: #ffffff;
     color: #1e3a8a;
 }
 @media (prefers-color-scheme: dark) {
     body {
         background: linear-gradient(180deg, #0f172a 0%, #1e293b 40%, #334155 100%);
     }
     .gradio-container {
         color: #dbeafe;
     }
     .gr-markdown, .prose {
         color: #dbeafe !important;
     }
     .model-card, .result-card {
         background: #1a1a1a;
         color: #dbeafe;
         border: 1px solid #3b82f6;
         box-shadow: 0 12px 20px rgba(60,120,255,0.18);
     }
     .gr-markdown, .prose {
         color: #dbeafe !important;
     }
 }
 textarea, input {
     border-radius: 12px !important;
     border: 1px solid #c7ddff !important;
     background-color: #f5f9ff !important;
     color: #1e3a8a !important;
 }
 button {
     background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
     color: #ffffff !important;
@@ -241,14 +263,11 @@ button {
     border: 1px solid #93c5fd !important;
     font-weight: 600;
     letter-spacing: 0.3px;
     box-shadow:
         0 6px 14px rgba(60,120,255,0.28),
         inset 0 1px 0 rgba(255,255,255,0.6);
     transition: all 0.25s ease;
 }
 button:hover {
     background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
     box-shadow:
@@ -256,14 +275,12 @@ button:hover {
         inset 0 1px 0 rgba(255,255,255,0.7);
     transform: translateY(-1px);
 }
 button:active {
     transform: translateY(1px);
     box-shadow:
         0 3px 8px rgba(60,120,255,0.2),
         inset 0 2px 4px rgba(0,0,0,0.08);
 }
 """
 with gr.Blocks(css=pastel_css) as demo:
@@ -275,11 +292,9 @@ with gr.Blocks(css=pastel_css) as demo:
         gr.Markdown("""
 ## About this Model
 **RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
 ### Performance
 - **NanoBEIR NDCG@10 = 0.5124**
 - Higher than other static embedding models
 ### Efficiency
 - 512 dimensions
 - ~2× faster retrieval

 from sentence_transformers import SentenceTransformer
 from ddgs import DDGS
 import time
+import numpy as np
 # Load Model
 model = SentenceTransformer(
     for i, (score, d) in enumerate(ranked):
         md += f"""
 #### 💎 Rank {i+1}
 [{d['title']}]({d['href']})
 **Score:** `{score:.4f}`
 {d['body']}
 ---
 """
     return md
 def progressive_search(query, threshold=0.7, step=50, max_cap=999):
     if query.strip() == "":
         yield "Please enter a search query."
     current_k = step
+    scores_last = []
+    docs_last = []
+    seen_urls = set()
+    total_examined = 0
     while current_k <= max_cap:
         try:
             docs = web_search(query, max_results=current_k)
             current_k += step
             continue
+        total_examined += len(docs)
+        new_docs = []
+        for d in docs:
+            url = d["href"]
+            if url not in seen_urls:
+                seen_urls.add(url)
+                new_docs.append(d)
+        if len(new_docs) == 0:
+            current_k += step
+            continue
+        texts = [d["title"] + " " + d["body"] for d in new_docs]
         with torch.no_grad():
             embeddings = model.encode(
         query_emb = embeddings[0]
         doc_embs = embeddings[1:]
+        scores = (query_emb @ doc_embs.T).cpu().numpy().flatten()
+        scores_last.extend(scores.tolist())
+        docs_last.extend(new_docs)
+        best_score = float(np.max(scores_last))
+        md = (
+            f"### Searching…\n"
+            f"- Documents examined (with duplicates): `{total_examined}`\n"
+            f"- Best score so far: `{best_score:.4f}`\n"
+        )
         yield md
         if best_score >= threshold:
+            ranked = sorted(
+                zip(scores_last, docs_last),
+                key=lambda x: x[0],
+                reverse=True
+            )[:5]
+            md = "### Threshold reached!\n"
             for i, (score, d) in enumerate(ranked):
                 md += f"""
 #### Rank {i+1}
         current_k += step
         time.sleep(1)
+    ranked = sorted(
+        zip(scores_last, docs_last),
+        key=lambda x: x[0],
+        reverse=True
+    )[:5]
+    md = "### Threshold not reached in max search range.\n"
+    for i, (score, d) in enumerate(ranked):
+        md += f"""
 #### Rank {i+1}
 [{d['title']}]({d['href']})
 ---
 """
+    yield md
 # UI
 body {
     background: linear-gradient(180deg, #f5f9ff 0%, #eaf3ff 40%, #dbeafe 100%);
 }
 /* gradient headings */
 h1, h2, h3, h4 {
     background: linear-gradient(135deg, #0b1f5e 0%, #1e3a8a 15%, #3b82f6 30%, #93c5fd 100%);
     letter-spacing: 0.4px;
     padding: 4px;
 }
 /* optional: slightly softer subtitle tone */
 h2, h3 {
     opacity: 0.9;
 }
 .gradio-container {
     font-family: 'Helvetica Neue', sans-serif;
     color: #1e3a8a;
 }
 /* model card */
 .model-card {
     background: #ffffff;
     box-shadow: 0 12px 20px rgba(60,120,255,0.18);
     margin-bottom: 20px;
 }
 /* result card */
 .result-card {
     background: #ffffff;
     border: 1px solid #dbeafe;
     box-shadow: 0 12px 20px rgba(60,120,255,0.18);
 }
 .gr-markdown, .prose {
     border: none !important;
     box-shadow: none !important;
     padding: 0 !important;
     color: #1e3a8a !important;
 }
 .model-card, .result-card {
     background: #ffffff;
     color: #1e3a8a;
 }
 @media (prefers-color-scheme: dark) {
     body {
         background: linear-gradient(180deg, #0f172a 0%, #1e293b 40%, #334155 100%);
     }
     .gradio-container {
         color: #dbeafe;
     }
     .gr-markdown, .prose {
         color: #dbeafe !important;
     }
     .model-card, .result-card {
         background: #1a1a1a;
         color: #dbeafe;
         border: 1px solid #3b82f6;
         box-shadow: 0 12px 20px rgba(60,120,255,0.18);
     }
     .gr-markdown, .prose {
         color: #dbeafe !important;
     }
 }
 textarea, input {
     border-radius: 12px !important;
     border: 1px solid #c7ddff !important;
     background-color: #f5f9ff !important;
     color: #1e3a8a !important;
 }
 button {
     background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 40%, #93c5fd 100%) !important;
     color: #ffffff !important;
     border: 1px solid #93c5fd !important;
     font-weight: 600;
     letter-spacing: 0.3px;
     box-shadow:
         0 6px 14px rgba(60,120,255,0.28),
         inset 0 1px 0 rgba(255,255,255,0.6);
     transition: all 0.25s ease;
 }
 button:hover {
     background: linear-gradient(135deg, #1b3380 0%, #2563eb 40%, #7fb8ff 100%) !important;
     box-shadow:
         inset 0 1px 0 rgba(255,255,255,0.7);
     transform: translateY(-1px);
 }
 button:active {
     transform: translateY(1px);
     box-shadow:
         0 3px 8px rgba(60,120,255,0.2),
         inset 0 2px 4px rgba(0,0,0,0.08);
 }
 """
 with gr.Blocks(css=pastel_css) as demo:
         gr.Markdown("""
 ## About this Model
 **RikkaBotan/stable-static-embedding-fast-retrieval-mrl-en**
 ### Performance
 - **NanoBEIR NDCG@10 = 0.5124**
 - Higher than other static embedding models
 ### Efficiency
 - 512 dimensions
 - ~2× faster retrieval