Spaces:

sentence-transformers
/

quantized-retrieval

Running

App Files Community

Tom Aarsen committed 29 days ago

Commit

e005eea

1 Parent(s): cf19736

Rewrite the app frontend; fix accidental exact search bug

Browse files

Files changed (1) hide show

app.py +112 -83

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import time
 import gradio as gr
 from datasets import load_dataset, load_from_disk
 from huggingface_hub import hf_hub_download
@@ -10,12 +11,26 @@ import numpy as np
 # Load titles, texts, and int8 embeddings in a lazy Dataset, allowing us to efficiently access specific rows on demand
 # Note that we never actually use the int8 embeddings for search directly, they are only used for rescoring after the binary search
-title_text_int8_dataset = load_dataset("sentence-transformers/quantized-retrieval-data", split="train").select_columns(["url", "title", "text", "embedding"])
 # title_text_int8_dataset = load_from_disk("wikipedia-mxbai-embed-int8-index").select_columns(["url", "title", "text", "embedding"])
 # Load the binary indices
-binary_index_path = hf_hub_download(repo_id="sentence-transformers/quantized-retrieval-data", filename="wikipedia_ubinary_faiss_50m.index", local_dir=".", repo_type="dataset")
-binary_ivf_index_path = hf_hub_download(repo_id="sentence-transformers/quantized-retrieval-data", filename="wikipedia_ubinary_ivf_faiss_50m.index", local_dir=".", repo_type="dataset")
 binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(binary_index_path)
 binary_ivf_index: faiss.IndexBinaryIVF = faiss.read_index_binary(binary_ivf_index_path)
@@ -32,16 +47,14 @@ warmup_queries = [
     "How to bake a chocolate cake?",
     "What is the theory of relativity?",
 ]
-model.encode(warmup_queries)
 def search(
     query,
     top_k: int = 20,
     rescore_multiplier: int = 4,
-    use_approx: bool = False,
-    display_score: bool = True,
-    display_binary_rank: bool = False,
 ):
     # 1. Embed the query as float32
     start_time = time.time()
@@ -63,6 +76,7 @@ def search(
     )
     binary_ids = binary_ids[0]
     search_time = time.time() - start_time
     # 4. Load the corresponding int8 embeddings
     start_time = time.time()
@@ -85,43 +99,85 @@ def search(
     # 7. Load titles and texts for the top_k results
     start_time = time.time()
-    top_k_titles = title_text_int8_dataset[top_k_indices]["title"]
     top_k_urls = title_text_int8_dataset[top_k_indices]["url"]
     top_k_texts = title_text_int8_dataset[top_k_indices]["text"]
-    top_k_titles = [f"[{title}]({url})" for title, url in zip(top_k_titles, top_k_urls)]
     load_text_time = time.time() - start_time
-    rank = np.arange(1, top_k + 1)
-    data = {
-        "Score": [f"{score:.2f}" for score in top_k_scores],
-        "#": rank,
-        "Binary #": indices + 1,
-        "Title": top_k_titles,
-        "Text": top_k_texts,
-    }
-    if not display_score:
-        del data["Score"]
-    if not display_binary_rank:
-        del data["Binary #"]
-        del data["#"]
-    df = pd.DataFrame(data)
-    return df, {
-        "Embed Time": f"{embed_time:.4f} s",
-        "Quantize Time": f"{quantize_time:.4f} s",
-        "Search Time": f"{search_time:.4f} s",
-        "Load int8 Time": f"{load_int8_time:.4f} s",
-        "Rescore Time": f"{rescore_time:.4f} s",
-        "Sort Time": f"{sort_time:.4f} s",
-        "Load Text Time": f"{load_text_time:.4f} s",
-        "Total Retrieval Time": f"{quantize_time + search_time + load_int8_time + rescore_time + sort_time + load_text_time:.4f} s",
-    }
 with gr.Blocks(title="Quantized Retrieval") as demo:
-    gr.Markdown(
-        """
-## Quantized Retrieval - Binary Search with Scalar (int8) Rescoring
 This demo showcases retrieval using [quantized embeddings](https://huggingface.co/blog/embedding-quantization) on a CPU. The corpus consists of [41 million texts](https://huggingface.co/datasets/sentence-transformers/quantized-retrieval-data) from Wikipedia articles.
 <details><summary>Click to learn about the retrieval process</summary>
@@ -148,41 +204,24 @@ Notes:
 - The approximate search index (a binary Inverted File Index (IVF)) is in beta and has not been trained with a lot of data.
 </details>
 """
-    )
-    with gr.Row():
-        with gr.Column(scale=60):
             query = gr.Textbox(
                 label="Query for Wikipedia articles",
                 placeholder="Enter a query to search for relevant texts from Wikipedia.",
             )
-        with gr.Column(scale=25):
-            use_approx = gr.Radio(
-                choices=[("Exact Search", False), ("Approximate Search", True)],
-                value=True,
-                label="Search Settings",
-            )
-        with gr.Column(scale=15):
-            display_score = gr.Checkbox(
-                label="Display Score",
-                value=True,
-            )
-            display_binary_rank = gr.Checkbox(
-                label='Display Binary Rank',
-                value=False,
-            )
-    with gr.Row():
-        with gr.Column(scale=2):
             top_k = gr.Slider(
                 minimum=10,
                 maximum=1000,
                 step=1,
                 value=20,
                 label="Number of documents to retrieve",
-                info="Number of documents to retrieve from the binary search",
             )
-        with gr.Column(scale=2):
             rescore_multiplier = gr.Slider(
                 minimum=1,
                 maximum=10,
@@ -191,17 +230,17 @@ Notes:
                 label="Rescore multiplier",
                 info="Search for `rescore_multiplier` as many documents to rescore",
             )
-    search_button = gr.Button(value="Search")
     with gr.Row():
-        with gr.Column(scale=4):
-            output = gr.Dataframe(
-                headers=["Score", "#", "Binary #", "Title", "Text"],
-                datatype="markdown",
-            )
         with gr.Column(scale=1):
-            json = gr.JSON()
     examples = gr.Examples(
         examples=[
@@ -212,30 +251,20 @@ Notes:
         ],
         fn=search,
         inputs=[query],
-        outputs=[output, json],
         cache_examples=False,
         run_on_click=True,
     )
     query.submit(
         search,
-        inputs=[query, top_k, rescore_multiplier, use_approx, display_score, display_binary_rank],
-        outputs=[output, json],
     )
     search_button.click(
         search,
-        inputs=[query, top_k, rescore_multiplier, use_approx, display_score, display_binary_rank],
-        outputs=[output, json],
-    )
-    display_score.change(
-        search,
-        inputs=[query, top_k, rescore_multiplier, use_approx, display_score, display_binary_rank],
-        outputs=[output, json],
-    )
-    display_binary_rank.change(
-        search,
-        inputs=[query, top_k, rescore_multiplier, use_approx, display_score, display_binary_rank],
-        outputs=[output, json],
     )
 demo.queue()

 import time
+import html
 import gradio as gr
 from datasets import load_dataset, load_from_disk
 from huggingface_hub import hf_hub_download
 # Load titles, texts, and int8 embeddings in a lazy Dataset, allowing us to efficiently access specific rows on demand
 # Note that we never actually use the int8 embeddings for search directly, they are only used for rescoring after the binary search
+title_text_int8_dataset = load_dataset(
+    "sentence-transformers/quantized-retrieval-data", split="train"
+).select_columns(["url", "title", "text", "embedding"])
 # title_text_int8_dataset = load_from_disk("wikipedia-mxbai-embed-int8-index").select_columns(["url", "title", "text", "embedding"])
+TOTAL_NUM_DOCS = title_text_int8_dataset.num_rows
 # Load the binary indices
+binary_index_path = hf_hub_download(
+    repo_id="sentence-transformers/quantized-retrieval-data",
+    filename="wikipedia_ubinary_faiss_50m.index",
+    local_dir=".",
+    repo_type="dataset",
+)
+binary_ivf_index_path = hf_hub_download(
+    repo_id="sentence-transformers/quantized-retrieval-data",
+    filename="wikipedia_ubinary_ivf_faiss_50m.index",
+    local_dir=".",
+    repo_type="dataset",
+)
 binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(binary_index_path)
 binary_ivf_index: faiss.IndexBinaryIVF = faiss.read_index_binary(binary_ivf_index_path)
     "How to bake a chocolate cake?",
     "What is the theory of relativity?",
 ]
+model.encode_query(warmup_queries)
 def search(
     query,
     top_k: int = 20,
     rescore_multiplier: int = 4,
+    use_approx: bool = True,
 ):
     # 1. Embed the query as float32
     start_time = time.time()
     )
     binary_ids = binary_ids[0]
     search_time = time.time() - start_time
+    num_docs_searched = len(binary_ids)
     # 4. Load the corresponding int8 embeddings
     start_time = time.time()
     # 7. Load titles and texts for the top_k results
     start_time = time.time()
+    raw_top_k_titles = title_text_int8_dataset[top_k_indices]["title"]
     top_k_urls = title_text_int8_dataset[top_k_indices]["url"]
     top_k_texts = title_text_int8_dataset[top_k_indices]["text"]
     load_text_time = time.time() - start_time
+    # Build HTML cards for each result so the full row is visible at once
+    cards = []
+    for i in range(len(top_k_indices)):
+        title = html.escape(str(raw_top_k_titles[i]))
+        url = html.escape(str(top_k_urls[i]))
+        text = html.escape(str(top_k_texts[i]))
+        score_str = f"{top_k_scores[i]:.2f}"
+        rank_str = str(i + 1)
+        binary_rank_str = str(indices[i] + 1)
+        card_html = f"""
+<div style=\"border: 1px solid var(--border-color-primary, #e0e0e0); border-radius: 10px; padding: 10px 12px; margin-bottom: 10px; background-color: var(--block-background-fill, transparent); color: inherit;\">
+    <div style=\"display: flex; align-items: flex-start; justify-content: space-between; gap: 8px; margin-bottom: 4px;\">
+        <div style=\"font-size: 16px; font-weight: 600; min-width: 0;\">
+            <a href=\"{url}\" target=\"_blank\" style=\"text-decoration: none; color: var(--link-text-color, #1f6feb);\">{title}</a>
+        </div>
+        <div style=\"font-size: 12px; color: var(--body-text-color-subdued, #586069); text-align: right; white-space: nowrap;\">
+            Score: {score_str} • Rank: {rank_str} • Binary rank: {binary_rank_str}
+        </div>
+    </div>
+    <div style=\"font-size: 13px; line-height: 1.4; max-height: 8em; overflow: hidden;\">{text}</div>
+</div>
+"""
+        cards.append(card_html)
+    if cards:
+        cards_html = "\n".join(cards)
+    else:
+        cards_html = "<div>No results.</div>"
+    total_retrieval_time = (
+        quantize_time
+        + search_time
+        + load_int8_time
+        + rescore_time
+        + sort_time
+        + load_text_time
+    )
+    num_docs_retrieved = len(top_k_indices)
+    search_mode = "Approximate (IVF)" if use_approx else "Exact"
+    summary_md = f"""
+<div style=\"border: 1px solid var(--border-color-primary, #e0e0e0); border-radius: 10px; padding: 10px 12px; background-color: var(--block-background-fill, transparent);\">
+    <h3 style=\"margin-top: 0;\">Search Summary</h3>
+    <ul style=\"margin-top: 0; margin-bottom: 8px; padding-left: 18px;\">
+        <li>Total docs in corpus: {TOTAL_NUM_DOCS:,}</li>
+        <li>Docs searched: {num_docs_searched}</li>
+        <li>Docs retrieved: {num_docs_retrieved}</li>
+        <li>Search mode: {search_mode}</li>
+    </ul>
+    <h4>Timings (in seconds)</h4>
+    <ul style=\"margin-top: 0; margin-bottom: 0; padding-left: 18px;\">
+        <li>Embed on CPU: {embed_time:.4f}</li>
+        <li>Quantize: {quantize_time:.4f}</li>
+        <li>Search: {search_time:.4f}</li>
+        <li>Load int8: {load_int8_time:.4f}</li>
+        <li>Rescore: {rescore_time:.4f}</li>
+        <li>Sort: {sort_time:.4f}</li>
+        <li>Load text: {load_text_time:.4f}</li>
+    </ul>
+    <strong>Total retrieval time: {total_retrieval_time:.4f} seconds</strong>
+</div>"""
+    return cards_html, summary_md
 with gr.Blocks(title="Quantized Retrieval") as demo:
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown(
+                """
+<div style='border: 1px solid var(--border-color-primary, #e0e0e0); border-radius: 10px; padding: 12px 14px; background-color: var(--block-background-fill, transparent);'>
+<h1 style='margin-top: 0;'>Quantized Retrieval - Binary Search with Scalar (int8) Rescoring</h1>
 This demo showcases retrieval using [quantized embeddings](https://huggingface.co/blog/embedding-quantization) on a CPU. The corpus consists of [41 million texts](https://huggingface.co/datasets/sentence-transformers/quantized-retrieval-data) from Wikipedia articles.
 <details><summary>Click to learn about the retrieval process</summary>
 - The approximate search index (a binary Inverted File Index (IVF)) is in beta and has not been trained with a lot of data.
 </details>
+</div>
 """
+            )
             query = gr.Textbox(
                 label="Query for Wikipedia articles",
                 placeholder="Enter a query to search for relevant texts from Wikipedia.",
             )
+            search_button = gr.Button(value="Search", variant="secondary")
+        with gr.Column(scale=1, min_width=0):
             top_k = gr.Slider(
                 minimum=10,
                 maximum=1000,
                 step=1,
                 value=20,
                 label="Number of documents to retrieve",
+                info="Number of documents to retrieve using binary search",
             )
             rescore_multiplier = gr.Slider(
                 minimum=1,
                 maximum=10,
                 label="Rescore multiplier",
                 info="Search for `rescore_multiplier` as many documents to rescore",
             )
+            use_approx = gr.Radio(
+                choices=[("Approximate Search", True), ("Exact Search", False)],
+                value=True,
+                label="Search Settings",
+            )
     with gr.Row():
+        with gr.Column(scale=3):
+            cards = gr.HTML(label="Results")
         with gr.Column(scale=1):
+            summary = gr.Markdown(label="Search Summary")
     examples = gr.Examples(
         examples=[
         ],
         fn=search,
         inputs=[query],
+        outputs=[cards, summary],
         cache_examples=False,
         run_on_click=True,
     )
     query.submit(
         search,
+        inputs=[query, top_k, rescore_multiplier, use_approx],
+        outputs=[cards, summary],
     )
     search_button.click(
         search,
+        inputs=[query, top_k, rescore_multiplier, use_approx],
+        outputs=[cards, summary],
     )
 demo.queue()