"""Semantic Similarity Explainer.

Streamlit app that embeds two sentences with the all-MiniLM-L6-v2
sentence-transformer, computes their cosine similarity, and asks an LLM
(via the OpenRouter chat-completions API) to explain why that particular
score occurred.
"""

import streamlit as st
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import requests
import json  # noqa: F401 -- kept for parity with the original file
import os

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
LLM_MODEL = "openai/gpt-3.5-turbo"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
SYSTEM_MESSAGE = (
    "You are an NLP expert who explains similarity scores in simple, "
    "short terms that anyone can understand."
)
REQUEST_TIMEOUT_S = 60  # fail fast instead of hanging the Streamlit session

# Page config
st.set_page_config(
    page_title="Semantic Similarity Explainer",
    page_icon="🔍",
    layout="wide",
)

# Title and description
st.title("🔍 Semantic Similarity Explainer with AI")
st.markdown("""
This app calculates the **semantic similarity** between two sentences using
transformer-based embeddings (all-MiniLM-L6-v2) and uses AI to explain why
that specific score makes sense.
""")

# Initialize session state (history of previous calculations)
if 'history' not in st.session_state:
    st.session_state.history = []


# Cache the model loading so it happens once per server process
@st.cache_resource
def load_model():
    """Load (and cache) the sentence-transformer embedding model."""
    return SentenceTransformer(EMBEDDING_MODEL)


def build_prompt(sentence1: str, sentence2: str, similarity_rounded: float) -> str:
    """Build the detailed analysis prompt sent to the LLM for a given score."""
    return f"""You are an expert in Natural Language Processing and semantic similarity analysis using transformer-based embeddings.

I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which creates 384-dimensional vector embeddings that capture deep semantic meaning.

**ANALYSIS REQUEST:**
Sentence 1: "{sentence1}"
Sentence 2: "{sentence2}"
Cosine Similarity Score: {similarity_rounded:.2f}

Please provide a detailed explanation of WHY these two specific sentences resulted in a similarity score of {similarity_rounded:.2f}.

**Your analysis should cover:**

1. **Score Interpretation**: What does {similarity_rounded:.2f} mean on the 0.00-1.00 scale? Is this low, moderate, or high similarity?

2. **Semantic Analysis**:
   - What are the key semantic elements in each sentence?
   - What similarities did the transformer model detect?
   - What differences contributed to the score not being higher/lower?

3. **Linguistic Features**:
   - Sentence structure patterns
   - Word relationships (synonyms, antonyms, related concepts)
   - Grammatical similarities
   - Contextual meaning

4. **Transformer Model Behavior**:
   - How does all-MiniLM-L6-v2 process these sentences?
   - What semantic features likely contributed most to this score?
   - Why this score makes sense from a deep learning perspective

5. **Intuitive Validation**: Does this {similarity_rounded:.2f} score match what a human would expect when comparing these sentences?

Please be specific about these exact sentences and this exact score of {similarity_rounded:.2f}.
Explain the reasoning behind this particular numerical result."""


def request_explanation(key: str, prompt: str):
    """POST the prompt to OpenRouter; return (response, payload_sent).

    The payload is returned so the UI can display exactly what was sent
    (keeps the "API Details" tab in sync with the real request).
    """
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/semantic-similarity-app",
        "X-Title": "Semantic Similarity Explainer",
    }
    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.10,  # low temperature for focused, deterministic answers
        "max_tokens": 400,    # keep explanations short
    }
    # timeout prevents a stalled API call from hanging the UI forever
    response = requests.post(
        OPENROUTER_URL, headers=headers, json=payload, timeout=REQUEST_TIMEOUT_S
    )
    return response, payload


# Load the model
with st.spinner("Loading transformer model..."):
    model = load_model()

# Get API key from environment variables (Hugging Face Spaces secrets)
api_key = os.getenv("OPENROUTER_API_KEY")

# Sidebar for configuration
with st.sidebar:
    st.header("⚙️ Configuration")
    if api_key:
        st.success("🟢 API Connected")
    else:
        st.error("❌ API Key not found in environment")
        api_key = st.text_input(
            "OpenRouter API Key",
            type="password",
            help="Get your API key from https://openrouter.ai/keys",
        )
        st.info(
            "💡 Tip: Set OPENROUTER_API_KEY in Hugging Face Spaces secrets "
            "to avoid typing it every time"
        )
    st.markdown("---")
    st.markdown("""
### How it works:
1. Enter two sentences
2. Generate embeddings using transformer
3. Calculate cosine similarity
4. AI explains the similarity score
5. View the full prompt sent to AI
""")
    st.info("""
**Model:** all-MiniLM-L6-v2

This transformer model creates 384-dimensional embeddings that capture
semantic meaning, not just word overlap.
""")

# Main content
col1, col2 = st.columns(2)
with col1:
    sentence1 = st.text_input("Enter first sentence:", placeholder="e.g., you are hot")
with col2:
    sentence2 = st.text_input("Enter second sentence:", placeholder="e.g., you are cold")

# Calculate button
if st.button("🎯 Calculate & Explain", type="primary"):
    if not sentence1 or not sentence2:
        st.error("Please enter both sentences!")
    elif not api_key:
        st.error("Please enter your OpenRouter API key in the sidebar!")
    else:
        try:
            # Normalize to lowercase for consistency
            sentence1_normalized = sentence1.lower().strip()
            sentence2_normalized = sentence2.lower().strip()

            # Generate embeddings
            with st.spinner("Generating semantic embeddings..."):
                embeddings = model.encode([sentence1_normalized, sentence2_normalized])
                embedding1 = embeddings[0].reshape(1, -1)
                embedding2 = embeddings[1].reshape(1, -1)

            # Calculate cosine similarity; convert numpy float32 to a plain
            # Python float (st.progress rejects numpy scalars)
            similarity = cosine_similarity(embedding1, embedding2)[0][0]
            similarity_float = float(similarity)
            similarity_rounded = round(similarity_float, 2)

            # Display similarity score
            st.success("**Semantic similarity between:**")
            st.info(f'"{sentence1}" and "{sentence2}" → **{similarity_rounded:.2f}**')

            # Show similarity meter
            if similarity_rounded < 0.3:
                similarity_desc = "Low similarity"
            elif similarity_rounded < 0.7:
                similarity_desc = "Moderate similarity"
            else:
                similarity_desc = "High similarity"
            st.progress(float(similarity_rounded), text=similarity_desc)

            # Ask the AI WHY this specific score occurred
            detailed_prompt = build_prompt(sentence1, sentence2, similarity_rounded)
            with st.spinner("🤖 AI is analyzing why you got this specific similarity score..."):
                response, request_payload = request_explanation(api_key, detailed_prompt)

            if response.status_code == 200:
                result = response.json()
                explanation = result['choices'][0]['message']['content']

                # Display results in tabs
                tab1, tab2, tab3 = st.tabs(
                    ["🤖 AI Explanation", "📝 Prompt Sent to AI", "🔧 Technical Details"]
                )

                with tab1:
                    st.markdown("### 🧠 Why You Got This Similarity Score")
                    st.markdown("**AI Analysis:**")
                    with st.container():
                        st.markdown(f"""
{explanation}
""", unsafe_allow_html=True)

                with tab2:
                    st.markdown("### 📤 Exact Prompt Sent to GPT-3.5-Turbo")
                    st.markdown("This is exactly what was sent to the AI to generate the explanation:")
                    st.code(detailed_prompt, language="text")
                    st.markdown("**API Details:**")
                    # Derived from the actual request payload so this panel
                    # can never drift out of sync with what was sent
                    st.json({
                        "model": request_payload["model"],
                        "temperature": request_payload["temperature"],
                        "max_tokens": request_payload["max_tokens"],
                        "system_message": SYSTEM_MESSAGE,
                    })

                with tab3:
                    st.markdown("### 🔧 Technical Details")
                    # Fresh names: do not shadow the outer input columns
                    tcol1, tcol2 = st.columns(2)
                    with tcol1:
                        st.markdown("**Sentence 1 Analysis:**")
                        st.text(f"Original: {sentence1}")
                        st.text(f"Normalized: {sentence1_normalized}")
                        st.text(f"Embedding shape: {embedding1.shape}")
                        st.text(f"Embedding L2 norm: {np.linalg.norm(embedding1):.4f}")
                        st.markdown("**First 10 embedding dimensions:**")
                        for i, val in enumerate(embedding1[0][:10]):
                            st.text(f"Dim {i}: {val:.4f}")
                    with tcol2:
                        st.markdown("**Sentence 2 Analysis:**")
                        st.text(f"Original: {sentence2}")
                        st.text(f"Normalized: {sentence2_normalized}")
                        st.text(f"Embedding shape: {embedding2.shape}")
                        st.text(f"Embedding L2 norm: {np.linalg.norm(embedding2):.4f}")
                        st.markdown("**First 10 embedding dimensions:**")
                        for i, val in enumerate(embedding2[0][:10]):
                            st.text(f"Dim {i}: {val:.4f}")

                    st.markdown("---")
                    st.markdown("**Similarity Computation Details:**")
                    mcol1, mcol2, mcol3 = st.columns(3)
                    with mcol1:
                        st.metric("Embedding Dimensions", "384")
                        st.metric("Exact Similarity", f"{similarity_float:.6f}")
                    with mcol2:
                        st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
                        dot_product = np.dot(embedding1[0], embedding2[0])
                        st.metric("Dot Product", f"{dot_product:.4f}")
                    with mcol3:
                        # Angle between vectors; clip guards arccos against
                        # floating-point values just outside [-1, 1]
                        angle = np.arccos(np.clip(similarity_float, -1.0, 1.0))
                        angle_degrees = np.degrees(angle)
                        st.metric("Vector Angle (degrees)", f"{angle_degrees:.2f}°")
                        st.metric("Model Used", "all-MiniLM-L6-v2")

                # Save to history
                st.session_state.history.append({
                    "sentence1": sentence1,
                    "sentence2": sentence2,
                    "similarity": similarity_rounded,
                    "explanation": explanation,
                })

                st.success("✅ Analysis complete! Check the tabs above for detailed explanations.")
            else:
                st.error(f"❌ API Error: {response.status_code}")
                st.error(f"Response: {response.text}")

        except Exception as e:
            # Broad catch is deliberate: surface any failure in the UI
            # rather than crashing the Streamlit script run
            st.error(f"❌ An error occurred: {str(e)}")
            st.error("Please check your API key and internet connection.")

# Display history (last 5, most recent first)
if st.session_state.history:
    st.markdown("---")
    st.markdown("### 📜 Previous Calculations")
    for i, item in enumerate(reversed(st.session_state.history[-5:])):
        with st.expander(
            f"'{item['sentence1']}' vs '{item['sentence2']}' → Score: {item['similarity']:.2f}"
        ):
            st.markdown(item['explanation'])

# Info box about semantic similarity
with st.expander("ℹ️ Understanding Semantic Similarity Scores"):
    st.markdown("""
### How to Interpret Cosine Similarity Scores

**What the numbers mean:**
- **0.90 - 1.00**: Nearly identical meaning (e.g., "The car is fast" vs "The automobile is quick")
- **0.70 - 0.89**: High semantic similarity (e.g., "I love dogs" vs "I adore puppies")
- **0.50 - 0.69**: Moderate similarity (e.g., "You are hot" vs "You are cold" - same structure, opposite meaning)
- **0.30 - 0.49**: Low similarity (e.g., "I like pizza" vs "Mathematics is difficult")
- **0.00 - 0.29**: Very low similarity (e.g., "Hello world" vs "Quantum physics equations")

**Why transformer embeddings are powerful:**
- They understand **context** and **meaning**, not just word overlap
- They capture **relationships** between words (synonyms, antonyms, related concepts)
- They consider **sentence structure** and **grammatical patterns**
- They detect **semantic similarity** even with different words
""")

# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center;">

🚀 Made with ❤️ using Streamlit | Powered by Sentence Transformers & OpenRouter API

Each calculation automatically sends your sentences and similarity score to GPT-3.5-turbo for detailed analysis

</div>
""", unsafe_allow_html=True)