Spaces:

dejanseo
/

grounding-snippet-generator

Running

App Files Files Community

dejanseo committed 13 days ago

Commit

848b652

verified ·

1 Parent(s): e8fb8fd

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +160 -34

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,166 @@
-import altair as alt
 import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+"""
+Snippet Generator - Recreates Google Vertex AI/Gemini grounding snippets
+Uses MS MARCO Cross-Encoder for search relevance ranking.
+"""
+import re
 import numpy as np
 import streamlit as st
+import torch
+from sentence_transformers import CrossEncoder
# --- Configuration ---
# Identifier of the cross-encoder checkpoint used to score query/sentence pairs.
MODEL_NAME = "cross-encoder/ms-marco-electra-base"
# Initial values for the "Max snippet characters" and "Max sentences" sliders.
MAX_SNIPPET_CHARS = 450
MAX_SENTENCES = 5

st.set_page_config(page_title="Snippet Generator", page_icon="✂️", layout="centered")
@st.cache_resource
def load_model():
    """Return the ``MODEL_NAME`` CrossEncoder, placed on CUDA when available.

    The ``st.cache_resource`` decorator caches the returned model object, so
    the checkpoint is not rebuilt on every script rerun.
    """
    use_gpu = torch.cuda.is_available()
    return CrossEncoder(MODEL_NAME, device="cuda" if use_gpu else "cpu")
# Split after sentence-ending punctuation followed by whitespace, or on any
# run of newlines. Compiled once so repeated calls reuse the same pattern.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+|\n+')
# Fragments starting with one of these are treated as bare links / URL lines.
_LINK_PREFIXES = ('http', 'URL:')


def segment_sentences(
    text: str,
    *,
    min_chars: int = 20,
    min_alpha_ratio: float = 0.5,
    skip_questions: bool = True,
) -> list[str]:
    """Split *text* into candidate sentences, filtering noise and duplicates.

    A fragment is dropped when any of the following holds:

    * it is empty or shorter than ``min_chars`` after stripping;
    * it starts with ``http`` or ``URL:``;
    * fewer than ``min_alpha_ratio`` of its characters are alphabetic
      (metadata, tables, prices);
    * it ends with ``?`` and ``skip_questions`` is true;
    * an earlier fragment is identical after lowercasing and collapsing
      whitespace.

    Args:
        text: Raw document text.
        min_chars: Minimum stripped length for a fragment to be kept.
        min_alpha_ratio: Minimum share of alphabetic characters, in [0, 1].
        skip_questions: Whether fragments ending in ``?`` are discarded.

    Returns:
        The surviving fragments, stripped, in document order.
    """
    seen = set()
    sentences = []
    for fragment in _SENTENCE_BOUNDARY.split(text):
        sentence = fragment.strip()
        # The emptiness check also protects the ratio division below when
        # a caller passes min_chars=0.
        if not sentence or len(sentence) < min_chars:
            continue
        if sentence.startswith(_LINK_PREFIXES):
            continue
        alpha_ratio = sum(map(str.isalpha, sentence)) / len(sentence)
        if alpha_ratio < min_alpha_ratio:
            continue
        if skip_questions and sentence.endswith('?'):
            continue
        # Case- and whitespace-insensitive key, so near-verbatim repeats
        # (e.g. boilerplate) are only kept once.
        normalized = ' '.join(sentence.lower().split())
        if normalized in seen:
            continue
        seen.add(normalized)
        sentences.append(sentence)
    return sentences
def generate_snippet(query: str, document: str, model, max_chars: int, max_sents: int) -> tuple[str, list]:
    """Build an extractive snippet of *document* for *query*.

    The document is segmented with ``segment_sentences``; each sentence is
    scored against the query via ``model.predict`` on ``[query, sentence]``
    pairs; the highest-scoring sentences that fit the budget are selected and
    stitched back together in document order, with ``...`` marking gaps.

    Args:
        query: Search query to score sentences against.
        document: Raw document text.
        model: Object exposing ``predict(pairs)`` that returns one score per
            pair (the app passes a sentence-transformers ``CrossEncoder``).
        max_chars: Budget for the summed length of the selected sentences.
            NOTE: separators and ``...`` markers are not counted, so the
            returned string can be somewhat longer than this value.
        max_sents: Maximum number of sentences to select.

    Returns:
        ``(snippet, debug_info)``. ``debug_info`` holds up to five
        ``(score, sentence)`` tuples, best first. If the document yields no
        sentences the result is ``("", [])``. If no sentence fits the budget,
        the snippet is the best sentence cut to ``max_chars`` plus ``...``.
    """
    sentences = segment_sentences(document)
    if not sentences:
        return "", []

    # Cross-encoder: score query-sentence pairs, then rank best-first.
    scores = model.predict([[query, sent] for sent in sentences])
    ranked_indices = np.argsort(scores)[::-1]

    # Computed up front so the fallback path can report it as well.
    debug_info = [(scores[i], sentences[i]) for i in ranked_indices[:5]]

    # Greedy selection in score order: take a sentence whenever it still fits
    # the character budget; stop once the sentence budget is used up.
    chosen = []
    total_length = 0
    for idx in ranked_indices:
        if len(chosen) >= max_sents:
            break
        sent_length = len(sentences[idx])
        if total_length + sent_length > max_chars:
            continue
        chosen.append(idx)
        total_length += sent_length

    if not chosen:
        # Nothing fits: fall back to a truncated copy of the top sentence.
        best_idx = ranked_indices[0]
        return sentences[best_idx][:max_chars] + "...", debug_info

    # Restore document order before stitching.
    chosen.sort()

    snippet_parts = []
    prev_idx = None
    for idx in chosen:
        # A jump of more than one position means sentences were skipped.
        if prev_idx is not None and idx > prev_idx + 1:
            snippet_parts.append("...")
        snippet_parts.append(sentences[idx])
        prev_idx = idx
    # Trailing ellipsis when the document continues past the last pick.
    if prev_idx < len(sentences) - 1:
        snippet_parts.append("...")

    return " ".join(snippet_parts), debug_info
# --- Streamlit UI ---
# Page header and a short explanation of the pipeline. The model name is
# taken from MODEL_NAME so the text cannot drift from the loaded checkpoint.
st.title("✂️ Snippet Generator")
st.caption("Recreates Google Vertex AI / Gemini grounding-style snippets")
st.markdown(f"""
This tool generates extractive snippets from documents using a Cross-Encoder model trained on MS MARCO search relevance data.
**How it works:**
1. Segments document into sentences
2. Scores each sentence against your query using `{MODEL_NAME}`
3. Selects top-scoring sentences within budget
4. Stitches them in document order with `...` for gaps
""")
st.markdown("---")

# Inputs.
query = st.text_input("🔍 Query", value="best prostate cancer treatment in the world")
document = st.text_area(
    "📄 Document",
    height=250,
    placeholder="Paste document content here...",
)
with st.expander("⚙️ Settings"):
    max_chars = st.slider("Max snippet characters", 200, 1500, MAX_SNIPPET_CHARS, 50)
    max_sents = st.slider("Max sentences", 2, 15, MAX_SENTENCES)
    show_debug = st.checkbox("Show debug info", value=True)

if st.button("Generate Snippet", type="primary"):
    # Whitespace-only input carries no content, so it counts as missing.
    if query.strip() and document.strip():
        with st.spinner("Loading model & scoring sentences..."):
            model = load_model()
            snippet, debug = generate_snippet(query, document, model, max_chars, max_sents)
        if snippet:
            st.subheader("Generated Snippet")
            st.code(snippet, language=None)
        else:
            # generate_snippet returns an empty snippet when segmentation
            # leaves no sentences; say so instead of showing an empty box.
            st.warning("No usable sentences were found in the document.")
        if show_debug and debug:
            st.markdown("---")
            st.write("**Top sentences by score:**")
            for score, sent in debug:
                st.text(f"{score:.4f}: {sent[:80]}...")
    else:
        st.warning("Please enter both a query and document.")

st.markdown("---")
st.caption(f"Model: `{MODEL_NAME}` | [GitHub](https://github.com/UKPLab/sentence-transformers)")