|
|
import streamlit as st |
|
|
import numpy as np |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
import requests |
|
|
import json |
|
|
import os |
|
|
|
|
|
|
|
|
# --- Page chrome -------------------------------------------------------------
# NOTE(review): several emoji literals in this file look mojibake-mangled
# (e.g. "π"); they are preserved byte-for-byte to avoid changing runtime output.
st.set_page_config(
    page_title="Semantic Similarity Explainer",
    page_icon="π",
    layout="wide",
)

# Intro copy shown directly under the title.
_APP_INTRO = """
This app calculates the **semantic similarity** between two sentences using transformer-based embeddings (all-MiniLM-L6-v2) and uses AI to explain why that specific score makes sense.
"""

st.title("π Semantic Similarity Explainer with AI")
st.markdown(_APP_INTRO)
|
|
|
|
|
|
|
|
# Persist the list of past comparisons for this session across Streamlit reruns.
st.session_state.setdefault('history', [])
|
|
|
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load the sentence-embedding model once per process (Streamlit caches the resource)."""
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return SentenceTransformer(model_id)
|
|
|
|
|
|
|
|
# Fetch the (cached) embedding model up front so the first button click
# is not blocked by a model download.
with st.spinner("Loading transformer model..."):
    model = load_model()

# Prefer a key provisioned via the environment (e.g. HF Spaces secrets);
# the sidebar below offers a manual fallback when this is unset.
api_key = os.getenv("OPENROUTER_API_KEY")
|
|
|
|
|
|
|
|
# --- Sidebar: API-key status plus usage notes --------------------------------
_HOW_IT_WORKS = """
### How it works:
1. Enter two sentences
2. Generate embeddings using transformer
3. Calculate cosine similarity
4. AI explains the similarity score
5. View the full prompt sent to AI
"""

_MODEL_NOTE = """
**Model:** all-MiniLM-L6-v2

This transformer model creates 384-dimensional embeddings that capture semantic meaning, not just word overlap.
"""

with st.sidebar:
    st.header("βοΈ Configuration")

    if api_key:
        st.success("π’ API Connected")
    else:
        # No env var: fall back to a masked manual entry.
        st.error("β API Key not found in environment")
        api_key = st.text_input(
            "OpenRouter API Key",
            type="password",
            help="Get your API key from https://openrouter.ai/keys",
        )
        st.info("π‘ Tip: Set OPENROUTER_API_KEY in Hugging Face Spaces secrets to avoid typing it every time")

    st.markdown("---")
    st.markdown(_HOW_IT_WORKS)

    st.info(_MODEL_NOTE)
|
|
|
|
|
|
|
|
# Side-by-side text inputs for the sentence pair to compare.
col1, col2 = st.columns(2)

with col1:
    sentence1 = st.text_input("Enter first sentence:", placeholder="e.g., you are hot")

with col2:
    sentence2 = st.text_input("Enter second sentence:", placeholder="e.g., you are cold")
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main action: embed the two sentences, score them with cosine similarity,
# then ask an LLM (via OpenRouter) to explain the resulting score.
# Helpers are defined first; the button handler below drives them.
# ---------------------------------------------------------------------------

OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
LLM_MODEL = "openai/gpt-3.5-turbo"
SYSTEM_MESSAGE = "You are an NLP expert who explains similarity scores in simple, short terms that anyone can understand."


def _build_analysis_prompt(s1, s2, score):
    """Build the detailed user prompt asking the LLM to explain `score` for the pair (s1, s2)."""
    return f"""You are an expert in Natural Language Processing and semantic similarity analysis using transformer-based embeddings.

I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which creates 384-dimensional vector embeddings that capture deep semantic meaning.

**ANALYSIS REQUEST:**
Sentence 1: "{s1}"
Sentence 2: "{s2}"
Cosine Similarity Score: {score:.2f}

Please provide a detailed explanation of WHY these two specific sentences resulted in a similarity score of {score:.2f}.

**Your analysis should cover:**

1. **Score Interpretation**: What does {score:.2f} mean on the 0.00-1.00 scale? Is this low, moderate, or high similarity?

2. **Semantic Analysis**:
- What are the key semantic elements in each sentence?
- What similarities did the transformer model detect?
- What differences contributed to the score not being higher/lower?

3. **Linguistic Features**:
- Sentence structure patterns
- Word relationships (synonyms, antonyms, related concepts)
- Grammatical similarities
- Contextual meaning

4. **Transformer Model Behavior**:
- How does all-MiniLM-L6-v2 process these sentences?
- What semantic features likely contributed most to this score?
- Why this score makes sense from a deep learning perspective

5. **Intuitive Validation**: Does this {score:.2f} score match what a human would expect when comparing these sentences?

Please be specific about these exact sentences and this exact score of {score:.2f}. Explain the reasoning behind this particular numerical result."""


def _request_explanation(key, prompt):
    """POST the prompt to OpenRouter's chat-completions endpoint.

    Returns (payload, response) so the caller can both check the HTTP result
    and display the exact request parameters that were actually sent.
    """
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/semantic-similarity-app",
        "X-Title": "Semantic Similarity Explainer"
    }
    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.10,
        "max_tokens": 400
    }
    # BUGFIX: a timeout keeps a stalled network call from hanging the app forever.
    response = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=60)
    return payload, response


def _describe_similarity(score):
    """Map a rounded similarity score to a human-readable bucket label."""
    if score < 0.3:
        return "Low similarity"
    if score < 0.7:
        return "Moderate similarity"
    return "High similarity"


if st.button("π― Calculate & Explain", type="primary"):
    if not sentence1 or not sentence2:
        st.error("Please enter both sentences!")
    elif not api_key:
        st.error("Please enter your OpenRouter API key in the sidebar!")
    else:
        try:
            # Normalize case/whitespace so trivial differences don't move the score.
            sentence1_normalized = sentence1.lower().strip()
            sentence2_normalized = sentence2.lower().strip()

            with st.spinner("Generating semantic embeddings..."):
                embeddings = model.encode([sentence1_normalized, sentence2_normalized])
                embedding1 = embeddings[0].reshape(1, -1)
                embedding2 = embeddings[1].reshape(1, -1)

            similarity = cosine_similarity(embedding1, embedding2)[0][0]
            similarity_float = float(similarity)
            similarity_rounded = round(similarity_float, 2)

            st.success("**Semantic similarity between:**")
            st.info(f'"{sentence1}" and "{sentence2}" β **{similarity_rounded:.2f}**')

            similarity_desc = _describe_similarity(similarity_rounded)
            # BUGFIX: cosine similarity can be slightly negative, but
            # st.progress only accepts values in [0.0, 1.0]; clamp first.
            st.progress(min(max(float(similarity_rounded), 0.0), 1.0), text=similarity_desc)

            detailed_prompt = _build_analysis_prompt(sentence1, sentence2, similarity_rounded)

            with st.spinner("π€ AI is analyzing why you got this specific similarity score..."):
                request_payload, response = _request_explanation(api_key, detailed_prompt)

            if response.status_code == 200:
                result = response.json()
                explanation = result['choices'][0]['message']['content']

                tab1, tab2, tab3 = st.tabs(["π€ AI Explanation", "π Prompt Sent to AI", "π§ Technical Details"])

                with tab1:
                    st.markdown("### π§ Why You Got This Similarity Score")
                    st.markdown("**AI Analysis:**")

                    with st.container():
                        # NOTE: explanation is LLM-generated text rendered as raw HTML.
                        st.markdown(f"""
<div style="background-color: #f0f2f6; padding: 20px; border-radius: 10px; border-left: 4px solid #1f77b4;">
{explanation}
</div>
""", unsafe_allow_html=True)

                with tab2:
                    st.markdown("### π€ Exact Prompt Sent to GPT-3.5-Turbo")
                    st.markdown("This is exactly what was sent to the AI to generate the explanation:")
                    st.code(detailed_prompt, language="text")

                    st.markdown("**API Details:**")
                    # BUGFIX: this panel previously hard-coded temperature=0.3,
                    # max_tokens=800 and a different system message, none of which
                    # matched the actual request (0.10 / 400). Show the values
                    # from the real payload instead so they can never drift.
                    st.json({
                        "model": request_payload["model"],
                        "temperature": request_payload["temperature"],
                        "max_tokens": request_payload["max_tokens"],
                        "system_message": request_payload["messages"][0]["content"]
                    })

                with tab3:
                    st.markdown("### π§ Technical Details")

                    # Renamed from col1/col2 to avoid shadowing the input columns.
                    detail_col1, detail_col2 = st.columns(2)

                    with detail_col1:
                        st.markdown("**Sentence 1 Analysis:**")
                        st.text(f"Original: {sentence1}")
                        st.text(f"Normalized: {sentence1_normalized}")
                        st.text(f"Embedding shape: {embedding1.shape}")
                        st.text(f"Embedding L2 norm: {np.linalg.norm(embedding1):.4f}")

                        st.markdown("**First 10 embedding dimensions:**")
                        for i, val in enumerate(embedding1[0][:10]):
                            st.text(f"Dim {i}: {val:.4f}")

                    with detail_col2:
                        st.markdown("**Sentence 2 Analysis:**")
                        st.text(f"Original: {sentence2}")
                        st.text(f"Normalized: {sentence2_normalized}")
                        st.text(f"Embedding shape: {embedding2.shape}")
                        st.text(f"Embedding L2 norm: {np.linalg.norm(embedding2):.4f}")

                        st.markdown("**First 10 embedding dimensions:**")
                        for i, val in enumerate(embedding2[0][:10]):
                            st.text(f"Dim {i}: {val:.4f}")

                    st.markdown("---")
                    st.markdown("**Similarity Computation Details:**")

                    metric_col1, metric_col2, metric_col3 = st.columns(3)

                    with metric_col1:
                        st.metric("Embedding Dimensions", "384")
                        st.metric("Exact Similarity", f"{similarity_float:.6f}")

                    with metric_col2:
                        st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
                        dot_product = np.dot(embedding1[0], embedding2[0])
                        st.metric("Dot Product", f"{dot_product:.4f}")

                    with metric_col3:
                        # Angle between the embedding vectors; clip guards the
                        # arccos domain against float rounding past +/-1.
                        angle = np.arccos(np.clip(similarity_float, -1.0, 1.0))
                        angle_degrees = np.degrees(angle)
                        st.metric("Vector Angle (degrees)", f"{angle_degrees:.2f}Β°")
                        st.metric("Model Used", "all-MiniLM-L6-v2")

                st.session_state.history.append({
                    "sentence1": sentence1,
                    "sentence2": sentence2,
                    "similarity": similarity_rounded,
                    "explanation": explanation
                })

                # NOTE(review): leading glyph was mojibake split across two source
                # lines; rejoined onto one line here.
                st.success("β Analysis complete! Check the tabs above for detailed explanations.")
            else:
                st.error(f"β API Error: {response.status_code}")
                st.error(f"Response: {response.text}")

        except Exception as e:
            # Broad catch is deliberate at this UI boundary: surface the error
            # to the user instead of crashing the Streamlit script run.
            st.error(f"β An error occurred: {str(e)}")
            st.error("Please check your API key and internet connection.")
|
|
|
|
|
|
|
|
# Show up to the five most recent comparisons, newest first.
if st.session_state.history:
    st.markdown("---")
    st.markdown("### π Previous Calculations")

    for item in reversed(st.session_state.history[-5:]):
        header = f"'{item['sentence1']}' vs '{item['sentence2']}' β Score: {item['similarity']:.2f}"
        with st.expander(header):
            st.markdown(item['explanation'])
|
|
|
|
|
|
|
|
# Static reference card explaining how to read a cosine-similarity score.
_SCORE_GUIDE = """
### How to Interpret Cosine Similarity Scores

**What the numbers mean:**
- **0.90 - 1.00**: Nearly identical meaning (e.g., "The car is fast" vs "The automobile is quick")
- **0.70 - 0.89**: High semantic similarity (e.g., "I love dogs" vs "I adore puppies")
- **0.50 - 0.69**: Moderate similarity (e.g., "You are hot" vs "You are cold" - same structure, opposite meaning)
- **0.30 - 0.49**: Low similarity (e.g., "I like pizza" vs "Mathematics is difficult")
- **0.00 - 0.29**: Very low similarity (e.g., "Hello world" vs "Quantum physics equations")

**Why transformer embeddings are powerful:**
- They understand **context** and **meaning**, not just word overlap
- They capture **relationships** between words (synonyms, antonyms, related concepts)
- They consider **sentence structure** and **grammatical patterns**
- They detect **semantic similarity** even with different words
"""

with st.expander("βΉοΈ Understanding Semantic Similarity Scores"):
    st.markdown(_SCORE_GUIDE)
|
|
|
|
|
|
|
|
# Footer (raw HTML, hence unsafe_allow_html).
st.markdown("---")
st.markdown(
    """
<div style='text-align: center'>
<p>π Made with β€οΈ using Streamlit | Powered by Sentence Transformers & OpenRouter API</p>
<p><small>Each calculation automatically sends your sentences and similarity score to GPT-3.5-turbo for detailed analysis</small></p>
</div>
""",
    unsafe_allow_html=True,
)