Spaces:

uumerrr684
/

Cosine_Similarity_Explainer

Running

File size: 14,395 Bytes

import html
import json
import os

import numpy as np
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Page config: browser tab title/icon and wide layout
page_settings = {
    "page_title": "Semantic Similarity Explainer",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**page_settings)

# Title and description
app_description = """
This app calculates the **semantic similarity** between two sentences using transformer-based embeddings (all-MiniLM-L6-v2) and uses AI to explain why that specific score makes sense.
"""
st.title("🔍 Semantic Similarity Explainer with AI")
st.markdown(app_description)

# Initialize session state: `history` collects one dict per successful analysis
if "history" not in st.session_state:
    st.session_state["history"] = []

# Cache the model loading through Streamlit's resource cache
@st.cache_resource
def load_model():
    """Build and return the all-MiniLM-L6-v2 SentenceTransformer used for embeddings."""
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)

# Load the model; the spinner stays visible while load_model() runs
with st.spinner("Loading transformer model..."):
    model = load_model()

# Get API key from environment variables (Hugging Face Spaces secrets);
# the result is None when the variable is not set
api_key = os.environ.get("OPENROUTER_API_KEY")

# Sidebar for configuration
with st.sidebar:
    st.header("⚙️ Configuration")

    # Without a key from the environment, fall back to asking the user for one
    if not api_key:
        st.error("❌ API Key not found in environment")
        api_key = st.text_input(
            "OpenRouter API Key",
            type="password",
            help="Get your API key from https://openrouter.ai/keys",
        )
        st.info("💡 Tip: Set OPENROUTER_API_KEY in Hugging Face Spaces secrets to avoid typing it every time")
    else:
        st.success("🟢 API Connected")

    st.markdown("---")

    # Step-by-step summary of what the app does
    st.markdown("""
    ### How it works:
    1. Enter two sentences
    2. Generate embeddings using transformer
    3. Calculate cosine similarity
    4. AI explains the similarity score
    5. View the full prompt sent to AI
    """)

    # Short note about the embedding model
    st.info("""
    **Model:** all-MiniLM-L6-v2
    
    This transformer model creates 384-dimensional embeddings that capture semantic meaning, not just word overlap.
    """)

# Main content: one text input per column, side by side
col1, col2 = st.columns(2)
sentence1 = col1.text_input("Enter first sentence:", placeholder="e.g., you are hot")
sentence2 = col2.text_input("Enter second sentence:", placeholder="e.g., you are cold")

# Calculate button: embed both sentences, score them with cosine similarity,
# ask the LLM (via OpenRouter) to explain the score, then render the results
# and append them to the session history.
if st.button("🎯 Calculate & Explain", type="primary"):
    # Whitespace-only input counts as missing: it would normalize to an empty string
    if not sentence1.strip() or not sentence2.strip():
        st.error("Please enter both sentences!")
    elif not api_key:
        st.error("Please enter your OpenRouter API key in the sidebar!")
    else:
        try:
            # Normalize to lowercase for consistency
            sentence1_normalized = sentence1.lower().strip()
            sentence2_normalized = sentence2.lower().strip()

            # Generate embeddings
            with st.spinner("Generating semantic embeddings..."):
                embeddings = model.encode([sentence1_normalized, sentence2_normalized])
                # cosine_similarity expects 2-D inputs, hence the (1, n) reshape
                embedding1 = embeddings[0].reshape(1, -1)
                embedding2 = embeddings[1].reshape(1, -1)

                # Calculate cosine similarity
                similarity = cosine_similarity(embedding1, embedding2)[0][0]

                # Convert the NumPy scalar to a Python float before handing it to Streamlit
                similarity_float = float(similarity)
                similarity_rounded = round(similarity_float, 2)

            # Display similarity score
            st.success("**Semantic similarity between:**")
            st.info(f'"{sentence1}" and "{sentence2}" → **{similarity_rounded:.2f}**')

            # Show similarity meter
            if similarity_rounded < 0.3:
                similarity_desc = "Low similarity"
            elif similarity_rounded < 0.7:
                similarity_desc = "Moderate similarity"
            else:
                similarity_desc = "High similarity"

            # Cosine similarity can be negative, but st.progress only accepts
            # floats in [0.0, 1.0], so clamp the value used for the bar.
            progress_value = min(max(similarity_rounded, 0.0), 1.0)
            st.progress(progress_value, text=similarity_desc)

            # Create a comprehensive prompt for the AI to explain WHY this specific score occurred
            detailed_prompt = f"""You are an expert in Natural Language Processing and semantic similarity analysis using transformer-based embeddings.

I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which creates 384-dimensional vector embeddings that capture deep semantic meaning.

**ANALYSIS REQUEST:**
Sentence 1: "{sentence1}"
Sentence 2: "{sentence2}"
Cosine Similarity Score: {similarity_rounded:.2f}

Please provide a detailed explanation of WHY these two specific sentences resulted in a similarity score of {similarity_rounded:.2f}.

**Your analysis should cover:**

1. **Score Interpretation**: What does {similarity_rounded:.2f} mean on the 0.00-1.00 scale? Is this low, moderate, or high similarity?

2. **Semantic Analysis**: 
   - What are the key semantic elements in each sentence?
   - What similarities did the transformer model detect?
   - What differences contributed to the score not being higher/lower?

3. **Linguistic Features**:
   - Sentence structure patterns
   - Word relationships (synonyms, antonyms, related concepts)
   - Grammatical similarities
   - Contextual meaning

4. **Transformer Model Behavior**: 
   - How does all-MiniLM-L6-v2 process these sentences?
   - What semantic features likely contributed most to this score?
   - Why this score makes sense from a deep learning perspective

5. **Intuitive Validation**: Does this {similarity_rounded:.2f} score match what a human would expect when comparing these sentences?

Please be specific about these exact sentences and this exact score of {similarity_rounded:.2f}. Explain the reasoning behind this particular numerical result."""

            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
                "HTTP-Referer": "https://github.com/semantic-similarity-app",
                "X-Title": "Semantic Similarity Explainer"
            }

            data = {
                "model": "openai/gpt-3.5-turbo",
                "messages": [
                    {
                        "role": "system",
                        "content": "You are an NLP expert who explains similarity scores in simple, short terms that anyone can understand."
                    },
                    {
                        "role": "user",
                        "content": detailed_prompt
                    }
                ],
                "temperature": 0.10,  # Low temperature: keeps explanations focused and repeatable
                "max_tokens": 400  # Caps the length of the reply
            }

            # Without a timeout, requests waits indefinitely on a stalled connection
            request_timeout_seconds = 60

            # Call OpenRouter API with the detailed prompt
            with st.spinner("🤖 AI is analyzing why you got this specific similarity score..."):
                response = requests.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers=headers,
                    json=data,
                    timeout=request_timeout_seconds
                )

            # Pull the explanation out defensively: a 200 response is not guaranteed
            # to carry a usable `choices[0].message.content` string.
            explanation = None
            if response.status_code == 200:
                try:
                    explanation = response.json()['choices'][0]['message']['content']
                except (ValueError, LookupError, TypeError):
                    explanation = None

            if response.status_code != 200:
                st.error(f"❌ API Error: {response.status_code}")
                st.error(f"Response: {response.text}")
            elif not isinstance(explanation, str) or not explanation.strip():
                st.error("❌ API Error: the response did not contain an explanation")
                st.error(f"Response: {response.text}")
            else:
                # Display results in tabs
                tab1, tab2, tab3 = st.tabs(["🤖 AI Explanation", "📝 Prompt Sent to AI", "🔧 Technical Details"])

                with tab1:
                    st.markdown("### 🧠 Why You Got This Similarity Score")
                    st.markdown("**AI Analysis:**")

                    # The model's text goes into raw HTML, so escape it first; the
                    # template is built without leading indentation so a multi-line
                    # explanation cannot turn the wrapper into an indented code block.
                    explanation_html = (
                        '<div style="background-color: #f0f2f6; padding: 20px; '
                        'border-radius: 10px; border-left: 4px solid #1f77b4;">\n'
                        f'{html.escape(explanation, quote=False)}\n'
                        '</div>'
                    )
                    with st.container():
                        st.markdown(explanation_html, unsafe_allow_html=True)

                with tab2:
                    st.markdown("### 📤 Exact Prompt Sent to GPT-3.5-Turbo")
                    st.markdown("This is exactly what was sent to the AI to generate the explanation:")
                    st.code(detailed_prompt, language="text")

                    # Read the values back from the request payload so this panel
                    # cannot drift from what was actually sent.
                    st.markdown("**API Details:**")
                    st.json({
                        "model": data["model"],
                        "temperature": data["temperature"],
                        "max_tokens": data["max_tokens"],
                        "system_message": data["messages"][0]["content"]
                    })

                with tab3:
                    st.markdown("### 🔧 Technical Details")

                    # Local column names avoid rebinding the page-level col1/col2
                    left_col, right_col = st.columns(2)
                    sentence_panels = (
                        (left_col, "**Sentence 1 Analysis:**", sentence1, sentence1_normalized, embedding1),
                        (right_col, "**Sentence 2 Analysis:**", sentence2, sentence2_normalized, embedding2),
                    )
                    for panel, heading, original_text, normalized_text, embedding in sentence_panels:
                        with panel:
                            st.markdown(heading)
                            st.text(f"Original: {original_text}")
                            st.text(f"Normalized: {normalized_text}")
                            st.text(f"Embedding shape: {embedding.shape}")
                            st.text(f"Embedding L2 norm: {np.linalg.norm(embedding):.4f}")

                            st.markdown("**First 10 embedding dimensions:**")
                            for i, val in enumerate(embedding[0][:10]):
                                st.text(f"Dim {i}: {val:.4f}")

                    st.markdown("---")
                    st.markdown("**Similarity Computation Details:**")

                    dims_col, score_col, angle_col = st.columns(3)

                    with dims_col:
                        # Report the width of the actual embedding instead of a hard-coded value
                        st.metric("Embedding Dimensions", str(embedding1.shape[1]))
                        st.metric("Exact Similarity", f"{similarity_float:.6f}")

                    with score_col:
                        st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
                        dot_product = np.dot(embedding1[0], embedding2[0])
                        st.metric("Dot Product", f"{dot_product:.4f}")

                    with angle_col:
                        # Calculate angle between vectors; clip guards arccos against
                        # values just outside [-1, 1]
                        angle = np.arccos(np.clip(similarity_float, -1.0, 1.0))
                        angle_degrees = np.degrees(angle)
                        st.metric("Vector Angle (degrees)", f"{angle_degrees:.2f}°")
                        st.metric("Model Used", "all-MiniLM-L6-v2")

                # Save to history
                st.session_state.history.append({
                    "sentence1": sentence1,
                    "sentence2": sentence2,
                    "similarity": similarity_rounded,
                    "explanation": explanation
                })

                st.success("✅ Analysis complete! Check the tabs above for detailed explanations.")

        except requests.exceptions.RequestException as e:
            # Network-level failures: timeout, DNS, refused connection, etc.
            st.error(f"❌ An error occurred: {str(e)}")
            st.error("Please check your API key and internet connection.")
        except Exception as e:
            # Top-level boundary for this handler: surface anything else (e.g. an
            # embedding failure) without the misleading connectivity hint.
            st.error(f"❌ An error occurred: {str(e)}")

# Display history: the five most recent results, newest first
if st.session_state.history:
    st.markdown("---")
    st.markdown("### 📜 Previous Calculations")

    recent_items = st.session_state.history[-5:]  # Show last 5
    for item in reversed(recent_items):
        label = f"'{item['sentence1']}' vs '{item['sentence2']}' → Score: {item['similarity']:.2f}"
        with st.expander(label):
            st.markdown(item['explanation'])

# Info box about semantic similarity: a collapsible guide to reading the scores
score_guide = st.expander("ℹ️ Understanding Semantic Similarity Scores")
score_guide.markdown("""
    ### How to Interpret Cosine Similarity Scores
    
    **What the numbers mean:**
    - **0.90 - 1.00**: Nearly identical meaning (e.g., "The car is fast" vs "The automobile is quick")
    - **0.70 - 0.89**: High semantic similarity (e.g., "I love dogs" vs "I adore puppies")
    - **0.50 - 0.69**: Moderate similarity (e.g., "You are hot" vs "You are cold" - same structure, opposite meaning)
    - **0.30 - 0.49**: Low similarity (e.g., "I like pizza" vs "Mathematics is difficult")
    - **0.00 - 0.29**: Very low similarity (e.g., "Hello world" vs "Quantum physics equations")
    
    **Why transformer embeddings are powerful:**
    - They understand **context** and **meaning**, not just word overlap
    - They capture **relationships** between words (synonyms, antonyms, related concepts)
    - They consider **sentence structure** and **grammatical patterns**
    - They detect **semantic similarity** even with different words
    """)

# Footer: horizontal rule followed by a centered credits block (raw HTML)
footer_html = """
<div style='text-align: center'>
    <p>🚀 Made with ❤️ using Streamlit | Powered by Sentence Transformers & OpenRouter API</p>
    <p><small>Each calculation automatically sends your sentences and similarity score to GPT-3.5-turbo for detailed analysis</small></p>
</div>
"""
st.markdown("---")
st.markdown(footer_html, unsafe_allow_html=True)