|
|
""" |
|
|
Snippet Generator - Recreates Google Vertex AI/Gemini grounding snippets |
|
|
Uses MS MARCO Cross-Encoder for search relevance ranking. |
|
|
""" |
|
|
|
|
|
import re |
|
|
import numpy as np |
|
|
import streamlit as st |
|
|
import torch |
|
|
from sentence_transformers import CrossEncoder |
|
|
|
|
|
|
|
|
# Hugging Face model id for the MS MARCO-trained Cross-Encoder reranker.
MODEL_NAME = "cross-encoder/ms-marco-electra-base"

# Default character budget for the assembled snippet.
MAX_SNIPPET_CHARS = 450

# Default cap on the number of sentences stitched into one snippet.
MAX_SENTENCES = 5
|
|
|
|
|
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(
    page_title="Snippet Generator",
    page_icon="βοΈ",  # NOTE(review): looks like a mis-encoded emoji glyph — confirm intended character
    layout="centered"
)
|
|
|
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load the MS MARCO CrossEncoder and cache it for the server session.

    ``st.cache_resource`` ensures the model is constructed only once per
    Streamlit process; subsequent reruns reuse the same instance.
    """
    # Prefer the GPU whenever torch can see one.
    return CrossEncoder(
        MODEL_NAME,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
|
|
|
|
|
|
|
|
def segment_sentences(text: str, *, min_len: int = 20,
                      min_alpha_ratio: float = 0.5) -> list[str]:
    """Split *text* into candidate snippet sentences.

    Splits on sentence-ending punctuation followed by whitespace, or on
    newline runs, then drops fragments that make poor snippet material:

    - shorter than *min_len* characters,
    - bare links (``http...``) and ``URL:`` source-attribution lines,
    - fragments whose alphabetic-character ratio is below
      *min_alpha_ratio* (tables, navigation junk, number runs),
    - questions (a question rarely contains the answer),
    - case-insensitive duplicates of an earlier sentence.

    Args:
        text: Raw document text.
        min_len: Minimum kept-sentence length in characters (keyword-only;
            default 20, matching the original hard-coded filter).
        min_alpha_ratio: Minimum fraction of alphabetic characters
            (keyword-only; default 0.5).

    Returns:
        Filtered, deduplicated sentences in document order.
    """
    # Split after '.', '!' or '?' + whitespace, or on any newline run.
    raw_sentences = re.split(r'(?<=[.!?])\s+|\n+', text)

    seen: set[str] = set()
    sentences: list[str] = []

    for s in raw_sentences:
        s = s.strip()

        if not s or len(s) < min_len:
            continue

        # Skip bare links and source-attribution lines.
        if s.startswith(('http', 'URL:')):
            continue

        # Mostly non-alphabetic fragments are layout noise, not prose.
        # len(s) >= 1 is guaranteed here by the emptiness check above.
        if sum(c.isalpha() for c in s) / len(s) < min_alpha_ratio:
            continue

        # Questions rarely carry the answer content a query wants.
        if s.endswith('?'):
            continue

        # Deduplicate on a whitespace-normalized, lower-cased key.
        normalized = ' '.join(s.lower().split())
        if normalized in seen:
            continue
        seen.add(normalized)

        sentences.append(s)

    return sentences
|
|
|
|
|
|
|
|
def generate_snippet(query: str, document: str, model, max_chars: int, max_sents: int) -> tuple[str, list]:
    """Build an extractive snippet for *query* from *document*.

    Sentences are scored against the query by the Cross-Encoder, the top
    scorers are picked greedily within the character and sentence budget,
    and the survivors are stitched back in document order with "..."
    marking gaps and a trailing "..." when the document continues.

    Returns:
        A ``(snippet, debug)`` tuple, where ``debug`` lists up to five
        ``(score, sentence)`` pairs for the highest-ranked sentences
        (empty when no sentence fit the budget).
    """
    sents = segment_sentences(document)
    if not sents:
        return "", []

    # Score every (query, sentence) pair; higher score = more relevant.
    scores = model.predict([[query, s] for s in sents])

    # Indices sorted best-first.
    order = np.argsort(scores)[::-1]

    # Greedily keep top-scoring sentences while they fit the budget.
    chosen = []
    used = 0
    for i in order:
        candidate = sents[i]
        if used + len(candidate) <= max_chars and len(chosen) < max_sents:
            chosen.append((i, candidate, scores[i]))
            used += len(candidate)

    # Nothing fit: hard-truncate the single best sentence instead.
    if not chosen:
        top = order[0]
        return sents[top][:max_chars] + "...", []

    # Restore original document order before stitching.
    chosen.sort(key=lambda item: item[0])

    parts = []
    last = -1
    for i, sentence, _ in chosen:
        if last >= 0 and i > last + 1:
            parts.append("...")  # mark skipped sentences
        parts.append(sentence)
        last = i

    if last < len(sents) - 1:
        parts.append("...")  # document continues past the snippet

    # Up to five best-ranked sentences with their scores, for debugging.
    debug_info = [(scores[j], sents[j]) for j in order[:min(5, len(order))]]

    return " ".join(parts), debug_info
|
|
|
|
|
|
|
|
|
|
|
# ---- Page header ----
# NOTE(review): the title/expander strings below contain what look like
# mis-encoded emoji ("βοΈ", "π") — confirm the intended glyphs before
# re-saving this file with a different encoding.
st.title("βοΈ Snippet Generator")
st.caption("Recreates Google Vertex AI / Gemini grounding-style snippets")

st.markdown("""
This tool generates extractive snippets from documents using a Cross-Encoder model trained on MS MARCO search relevance data.

**How it works:**
1. Segments document into sentences
2. Scores each sentence against your query using `cross-encoder/ms-marco-electra-base`
3. Selects top-scoring sentences within budget
4. Stitches them in document order with `...` for gaps
""")

st.markdown("---")

# ---- Inputs ----
query = st.text_input("π Query", value="best prostate cancer treatment in the world")

document = st.text_area(
    "π Document",
    height=250,
    placeholder="Paste document content here..."
)

# Tunables for snippet generation; defaults come from the module constants.
with st.expander("βοΈ Settings"):
    max_chars = st.slider("Max snippet characters", 200, 1500, MAX_SNIPPET_CHARS, 50)
    max_sents = st.slider("Max sentences", 2, 15, MAX_SENTENCES)
    show_debug = st.checkbox("Show debug info", value=True)

# ---- Generation ----
if st.button("Generate Snippet", type="primary"):
    if query and document:
        # Model loading is cached (st.cache_resource), so the spinner is
        # slow only on the first run of the server process.
        with st.spinner("Loading model & scoring sentences..."):
            model = load_model()
            snippet, debug = generate_snippet(query, document, model, max_chars, max_sents)

        st.subheader("Generated Snippet")
        st.code(snippet, language=None)

        # Optional: show the top-ranked sentences with their raw scores.
        if show_debug and debug:
            st.markdown("---")
            st.write("**Top sentences by score:**")
            for score, sent in debug:
                st.text(f"{score:.4f}: {sent[:80]}...")
    else:
        st.warning("Please enter both a query and document.")

st.markdown("---")
st.caption("Model: `cross-encoder/ms-marco-electra-base` | [GitHub](https://github.com/UKPLab/sentence-transformers)")