Spaces:
Running
Running
| # app.py | |
| import torch | |
| import streamlit as st | |
| from transformers import AutoModel | |
# Page-level configuration: browser tab title and wide (full-width) layout.
st.set_page_config(
    page_title="Semantic Highlight Bilingual Demo",
    layout="wide",
)
# Streamlit re-executes the whole script on every widget interaction, so
# without caching the model would be re-instantiated on each button click.
# `st.cache_resource` keeps one shared instance across reruns. The built-in
# cache spinner is disabled because the caller already wraps this call in
# its own `st.spinner`.
@st.cache_resource(show_spinner=False)
def load_model():
    """Load the semantic-highlight model once and reuse it across reruns.

    Returns:
        The object returned by ``AutoModel.from_pretrained`` for
        ``zilliz/semantic-highlight-bilingual-v1``.
    """
    # NOTE(review): trust_remote_code=True executes Python code shipped in the
    # model repository; consider pinning a `revision` to a reviewed commit.
    model = AutoModel.from_pretrained(
        "zilliz/semantic-highlight-bilingual-v1",
        trust_remote_code=True,
    )
    return model
def split_sentences(text: str):
    """Split *text* into sentences with a lightweight bilingual heuristic.

    A sentence ends at a run of Chinese terminators (``。！？``), at ``!`` or
    ``?``, or at a run of ``.`` that is not immediately followed by a digit
    (so decimals such as ``3.14`` stay intact). Each sentence keeps the
    terminator it actually had; a trailing fragment without a terminator is
    returned unchanged rather than having punctuation invented for it.

    This is still a heuristic: abbreviations such as ``e.g.`` are split.

    Args:
        text: Raw text, possibly mixing Chinese and English.

    Returns:
        A list of stripped, non-empty sentence strings in document order.
        Empty or whitespace-only input yields ``[]``.
    """
    import re  # local import keeps this helper self-contained

    text = text.strip()
    if not text:
        return []

    # Lazily consume characters up to the first sentence terminator, or to
    # the end of the text for a trailing fragment. DOTALL lets a sentence
    # span line breaks, matching how the text is treated elsewhere.
    sentence_pattern = r".+?(?:[。！？!?]+|\.+(?!\d)|$)"
    pieces = (piece.strip() for piece in re.findall(sentence_pattern, text, flags=re.S))

    # Drop pieces that contain nothing except whitespace and terminators.
    return [piece for piece in pieces if re.search(r"[^\s。！？.!?]", piece)]
def highlight_context(context: str, highlighted_sentences):
    """Return an HTML fragment showing *context* with selected sentences marked.

    The context is HTML-escaped before being embedded, because the caller
    renders the result with ``unsafe_allow_html=True``. Highlighting is done
    in a single pass over the original text, so a sentence is never wrapped
    twice (duplicates, or one sentence contained in another) and text that
    happens to look like the inserted markup (e.g. "class") cannot corrupt it.

    Args:
        context: The full document text.
        highlighted_sentences: Iterable of sentence strings to highlight;
            blank entries and duplicates are ignored. May be empty or None.

    Returns:
        ``context`` unchanged when it is empty/falsy; otherwise a string made
        of a ``<style>`` block followed by a ``<div class="context-box">``
        containing the escaped text with ``<span class="hl-sentence">``
        wrappers around every occurrence of a highlighted sentence.
    """
    import html
    import re

    if not context:
        return context

    # Unique, non-blank sentences; longest first so that when one sentence is
    # a prefix of another, the longer one wins at a given position.
    needles = sorted(
        {s.strip() for s in (highlighted_sentences or []) if s and s.strip()},
        key=len,
        reverse=True,
    )

    if needles:
        matcher = re.compile("|".join(re.escape(needle) for needle in needles))
        parts = []
        cursor = 0
        # Match against the raw text and escape each piece as it is emitted,
        # so sentences containing '&', '<' or '>' are still found.
        for match in matcher.finditer(context):
            parts.append(html.escape(context[cursor:match.start()], quote=False))
            parts.append(
                '<span class="hl-sentence">'
                + html.escape(match.group(), quote=False)
                + "</span>"
            )
            cursor = match.end()
        parts.append(html.escape(context[cursor:], quote=False))
        body = "".join(parts)
    else:
        body = html.escape(context, quote=False)

    # Basic styling. Lines are kept unindented so a Markdown renderer cannot
    # mistake them for an indented code block.
    style = (
        "\n<style>\n"
        ".hl-sentence {\n"
        "background-color: rgba(255, 215, 0, 0.35);\n"
        "padding: 2px 3px;\n"
        "border-radius: 3px;\n"
        "}\n"
        ".context-box {\n"
        "white-space: pre-wrap;\n"
        'font-family: ui-monospace, Menlo, Monaco, "Courier New", monospace;\n'
        "font-size: 0.9rem;\n"
        "line-height: 1.5;\n"
        "}\n"
        "</style>\n"
    )
    return style + f'<div class="context-box">{body}</div>'
def _render_sidebar():
    """Draw the sidebar settings; return (threshold, language, return_sentence_metrics)."""
    with st.sidebar:
        st.header("Settings")
        threshold = st.slider(
            "Relevance threshold",
            min_value=0.0,
            max_value=1.0,
            value=0.5,
            step=0.01,
            help="Lower values highlight more sentences; higher values highlight fewer.",
        )
        language = st.selectbox(
            "Language",
            options=["auto", "en", "zh"],
            index=0,
            help="Let the model auto-detect, or force English (en) / Chinese (zh).",
        )
        return_sentence_metrics = st.checkbox(
            "Return per-sentence probabilities",
            value=True,
        )
        st.markdown("---")
        st.info(
            "1. Enter a query.\n"
            "2. Paste a document as context.\n"
            "3. Click **Run Semantic Highlight**."
        )
    return threshold, language, return_sentence_metrics


def _render_inputs():
    """Draw the query/context inputs and the run button; return (question, context, run)."""
    default_question = "What are the symptoms of dehydration?"
    default_context = (
        "Dehydration occurs when your body loses more fluid than you take in.\n"
        "Common signs include feeling thirsty and having a dry mouth.\n"
        "The human body is composed of about 60% water.\n"
        "Dark yellow urine and infrequent urination are warning signs.\n"
        "Water is essential for many bodily functions.\n"
        "Dizziness, fatigue, and headaches can indicate severe dehydration.\n"
        "Drinking enough water daily is often recommended."
    )
    col_left, col_right = st.columns(2)
    with col_left:
        question = st.text_input(
            "Query / Question",
            value=default_question,
        )
        context = st.text_area(
            "Context / Document",
            value=default_context,
            height=260,
        )
    with col_right:
        st.subheader("Controls")
        run = st.button("Run Semantic Highlight", type="primary")
    return question, context, run


def _run_inference(question, context, threshold, language, return_sentence_metrics):
    """Load the model and call ``model.process`` under a spinner; return its result."""
    with st.spinner("Loading model and running inference..."):
        model = load_model()
        kwargs = {
            "question": question,
            "context": context,
            "threshold": threshold,
            "return_sentence_metrics": return_sentence_metrics,
        }
        # "auto" means: do not pass a language and let the model decide.
        if language != "auto":
            kwargs["language"] = language
        with torch.no_grad():
            return model.process(**kwargs)


def _render_results(result, context, threshold, return_sentence_metrics):
    """Render metrics, the highlighted sentences, the marked-up context and probabilities."""
    # `result` is read via `.get`, so it is expected to be mapping-like.
    highlighted_sentences = result.get("highlighted_sentences", [])
    compression_rate = result.get("compression_rate", None)
    sentence_probs = result.get("sentence_probabilities", None)

    st.subheader("Results")

    # Metrics row
    metric_cols = st.columns(3)
    with metric_cols[0]:
        st.metric(
            "Highlighted sentences",
            value=len(highlighted_sentences),
        )
    with metric_cols[1]:
        if compression_rate is not None:
            st.metric(
                "Compression rate",
                value=f"{compression_rate * 100:.1f}%",
                help="Approximate percentage of text removed.",
            )
    with metric_cols[2]:
        st.metric(
            "Threshold used",
            value=f"{threshold:.2f}",
        )

    # Highlighted sentence list
    st.markdown("### Highlighted Sentences")
    if highlighted_sentences:
        for i, sent in enumerate(highlighted_sentences, start=1):
            st.markdown(f"**{i}.** {sent}")
    else:
        st.write("No sentences passed the current threshold.")

    # Full context with inline highlights
    st.markdown("### Context with Highlights")
    highlighted_html = highlight_context(context, highlighted_sentences)
    st.markdown(highlighted_html, unsafe_allow_html=True)

    # Sentence probabilities table (if available)
    if return_sentence_metrics and sentence_probs is not None:
        st.markdown("### Sentence Probabilities")
        sentences = split_sentences(context)
        # The local splitter is only a heuristic and may disagree with the
        # model's own segmentation; show the table only when the counts match.
        if len(sentences) == len(sentence_probs):
            import pandas as pd

            data = {
                "Sentence #": list(range(1, len(sentences) + 1)),
                "Sentence": sentences,
                "Probability": sentence_probs,
            }
            df = pd.DataFrame(data)
            st.dataframe(
                df,
                use_container_width=True,
            )
        else:
            st.write(
                "Count of split sentences does not match model probabilities; "
                "showing raw probability list."
            )
            st.write(sentence_probs)


def main():
    """Streamlit entry point: collect settings and inputs, run the model, show results.

    Nothing is computed until the run button is pressed. Empty query or
    context is reported with ``st.error`` and the run is abandoned.
    """
    st.title("Semantic Highlight Bilingual Demo")
    st.caption("Model: zilliz/semantic-highlight-bilingual-v1")

    threshold, language, return_sentence_metrics = _render_sidebar()
    question, context, run = _render_inputs()

    if not run:
        return
    if not question.strip():
        st.error("Please enter a query/question.")
        return
    if not context.strip():
        st.error("Please enter some context text.")
        return

    result = _run_inference(
        question, context, threshold, language, return_sentence_metrics
    )
    # NOTE(review): results are rendered full-width below the two input
    # columns; confirm this matches the intended layout.
    _render_results(result, context, threshold, return_sentence_metrics)
# Script entry point: build the UI only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()