Spaces:

uumerrr684
/

Cosine_Similarity_Explainer

Sleeping

App Files Files Community

uumerrr684 committed on Aug 17, 2025

Commit

79d1bc3

verified ·

1 Parent(s): 8828e89

Create app.py

Browse files

Files changed (1) hide show

app.py +269 -0

app.py ADDED Viewed

	@@ -0,0 +1,269 @@

+import streamlit as st
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import requests
+import json
+import os
+# Page config
+st.set_page_config(
+    page_title="Semantic Similarity Explainer",
+    page_icon="🔍",
+    layout="wide"
+)
+# Title and description
+st.title("🔍 Semantic Similarity Explainer with AI")
+st.markdown("""
+This app calculates the **semantic similarity** between two sentences using transformer-based embeddings (all-MiniLM-L6-v2) and uses AI to explain why that specific score makes sense.
+""")
+# Initialize session state
+if 'history' not in st.session_state:
+    st.session_state.history = []
+# Cache the model loading
+@st.cache_resource
+def load_model():
+    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+# Load the model
+with st.spinner("Loading transformer model..."):
+    model = load_model()
+# Sidebar for API key
+with st.sidebar:
+    st.header("⚙️ Configuration")
+    api_key = st.text_input("OpenRouter API Key", type="password", help="Get your API key from https://openrouter.ai/keys")
+    st.markdown("---")
+    st.markdown("""
+    ### How it works:
+    1. Enter two sentences
+    2. Generate embeddings using transformer
+    3. Calculate cosine similarity
+    4. AI explains the similarity score
+    5. View the full prompt sent to AI
+    """)
+    st.info("""
+    **Model:** all-MiniLM-L6-v2
+    This transformer model creates 384-dimensional embeddings that capture semantic meaning, not just word overlap.
+    """)
+# Main content
+col1, col2 = st.columns(2)
+with col1:
+    sentence1 = st.text_input("Enter first sentence:", placeholder="e.g., you are pretty")
+with col2:
+    sentence2 = st.text_input("Enter second sentence:", placeholder="e.g., you are ugly")
+# Calculate button
+if st.button("🎯 Calculate & Explain", type="primary"):
+    if not sentence1 or not sentence2:
+        st.error("Please enter both sentences!")
+    elif not api_key:
+        st.error("Please enter your OpenRouter API key in the sidebar!")
+    else:
+        try:
+            # Normalize to lowercase for consistency
+            sentence1_normalized = sentence1.lower().strip()
+            sentence2_normalized = sentence2.lower().strip()
+            # Generate embeddings
+            with st.spinner("Generating semantic embeddings..."):
+                embeddings = model.encode([sentence1_normalized, sentence2_normalized])
+                embedding1 = embeddings[0].reshape(1, -1)
+                embedding2 = embeddings[1].reshape(1, -1)
+                # Calculate cosine similarity
+                similarity = cosine_similarity(embedding1, embedding2)[0][0]
+                # Round to 2 decimal places
+                similarity_rounded = round(similarity, 2)
+            # Display similarity score
+            st.success(f"**Semantic similarity between:**")
+            st.info(f'"{sentence1}" and "{sentence2}" → **{similarity_rounded:.2f}**')
+            # Show similarity meter
+            progress_color = "normal"
+            if similarity_rounded < 0.3:
+                progress_color = "normal"
+                similarity_desc = "Low similarity"
+            elif similarity_rounded < 0.7:
+                similarity_desc = "Moderate similarity"
+            else:
+                similarity_desc = "High similarity"
+            st.progress(similarity_rounded, text=similarity_desc)
+            # Create the prompt for the AI
+            prompt = f"""You are an expert in Natural Language Processing and semantic similarity using transformer-based embeddings.
+I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which generates 384-dimensional dense vector embeddings that capture semantic meaning.
+Original Sentence 1: "{sentence1}"
+Original Sentence 2: "{sentence2}"
+Normalized (lowercase) for embedding:
+Sentence 1: "{sentence1_normalized}"
+Sentence 2: "{sentence2_normalized}"
+Calculated Semantic Similarity Score: {similarity_rounded:.2f}
+Please explain:
+1. What this similarity score means (0.00 = completely different meaning, 1.00 = identical meaning)
+2. Why these two specific sentences resulted in a score of {similarity_rounded:.2f}
+3. What semantic features (meaning, context, sentiment) contributed to this score
+4. How transformer embeddings capture deeper meaning beyond just word overlap
+5. Whether this score makes intuitive sense given the semantic relationship between the sentences
+Note: This uses semantic embeddings, not TF-IDF, so the score reflects actual meaning similarity, not just word overlap."""
+            # Call OpenRouter API
+            with st.spinner("Getting AI explanation..."):
+                headers = {
+                    "Authorization": f"Bearer {api_key}",
+                    "Content-Type": "application/json",
+                    "HTTP-Referer": "https://github.com/yourusername/semantic-similarity-app",
+                    "X-Title": "Semantic Similarity Explainer"
+                }
+                data = {
+                    "model": "openai/gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are an expert in NLP and transformer-based semantic similarity analysis. Provide clear, educational explanations about how embeddings capture meaning."},
+                        {"role": "user", "content": prompt}
+                    ],
+                    "temperature": 0.7,
+                    "max_tokens": 600
+                }
+                response = requests.post(
+                    "https://openrouter.ai/api/v1/chat/completions",
+                    headers=headers,
+                    json=data
+                )
+                if response.status_code == 200:
+                    result = response.json()
+                    explanation = result['choices'][0]['message']['content']
+                    # Display results in tabs
+                    tab1, tab2, tab3 = st.tabs(["📊 AI Explanation", "📝 Full Prompt Sent", "🔧 Technical Details"])
+                    with tab1:
+                        st.markdown("### AI Explanation")
+                        st.markdown(explanation)
+                    with tab2:
+                        st.markdown("### Full Prompt Sent to GPT-3.5-turbo")
+                        st.code(prompt, language="text")
+                    with tab3:
+                        st.markdown("### Technical Details")
+                        col1, col2 = st.columns(2)
+                        with col1:
+                            st.markdown("**Sentence 1 Details:**")
+                            st.text(f"Original: {sentence1}")
+                            st.text(f"Normalized: {sentence1_normalized}")
+                            st.text(f"Embedding shape: {embedding1.shape}")
+                            st.text(f"Embedding norm: {np.linalg.norm(embedding1):.4f}")
+                            # Show first 10 dimensions of embedding
+                            st.markdown("**First 10 embedding dimensions:**")
+                            embedding_preview = embedding1[0][:10]
+                            for i, val in enumerate(embedding_preview):
+                                st.text(f"Dim {i}: {val:.4f}")
+                        with col2:
+                            st.markdown("**Sentence 2 Details:**")
+                            st.text(f"Original: {sentence2}")
+                            st.text(f"Normalized: {sentence2_normalized}")
+                            st.text(f"Embedding shape: {embedding2.shape}")
+                            st.text(f"Embedding norm: {np.linalg.norm(embedding2):.4f}")
+                            # Show first 10 dimensions of embedding
+                            st.markdown("**First 10 embedding dimensions:**")
+                            embedding_preview = embedding2[0][:10]
+                            for i, val in enumerate(embedding_preview):
+                                st.text(f"Dim {i}: {val:.4f}")
+                        st.markdown("---")
+                        st.markdown("**Embedding Statistics:**")
+                        col1, col2, col3 = st.columns(3)
+                        with col1:
+                            st.metric("Embedding Dimensions", "384")
+                            st.metric("Exact Similarity", f"{similarity:.6f}")
+                        with col2:
+                            st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
+                            dot_product = np.dot(embedding1[0], embedding2[0])
+                            st.metric("Dot Product", f"{dot_product:.4f}")
+                        with col3:
+                            # Calculate angle between vectors
+                            angle = np.arccos(np.clip(similarity, -1.0, 1.0))
+                            angle_degrees = np.degrees(angle)
+                            st.metric("Angle (degrees)", f"{angle_degrees:.2f}°")
+                            st.metric("Model", "all-MiniLM-L6-v2")
+                    # Save to history
+                    st.session_state.history.append({
+                        "sentence1": sentence1,
+                        "sentence2": sentence2,
+                        "similarity": similarity_rounded,
+                        "explanation": explanation
+                    })
+                else:
+                    st.error(f"API Error: {response.status_code}")
+                    st.error(response.text)
+        except Exception as e:
+            st.error(f"An error occurred: {str(e)}")
+# Display history
+if st.session_state.history:
+    st.markdown("---")
+    st.markdown("### 📜 Previous Calculations")
+    for i, item in enumerate(reversed(st.session_state.history[-5:])):  # Show last 5
+        with st.expander(f"'{item['sentence1']}' vs '{item['sentence2']}' - Score: {item['similarity']:.2f}"):
+            st.markdown(item['explanation'])
+# Info box about semantic similarity
+with st.expander("ℹ️ Understanding Semantic Similarity"):
+    st.markdown("""
+    ### Semantic Similarity vs Word Overlap
+    **Transformer-based embeddings** (like all-MiniLM-L6-v2) capture the **actual meaning** of sentences, not just word overlap.
+    Examples:
+    - "The car is fast" vs "The automobile is quick" → High similarity (~0.90)
+    - "I love dogs" vs "I hate dogs" → Moderate similarity (~0.60) - similar topic, opposite sentiment
+    - "You are pretty" vs "You are ugly" → Moderate similarity (~0.40-0.50) - same structure, opposite meaning
+    - "The cat sat on the mat" vs "Python is a programming language" → Low similarity (~0.10)
+    The model understands:
+    - **Synonyms** (car/automobile, fast/quick)
+    - **Context** (word meanings in sentences)
+    - **Semantic relationships** (opposites, related concepts)
+    - **Sentence structure** and grammatical patterns
+    """)
+# Footer
+st.markdown("---")
+st.markdown("""
+<div style='text-align: center'>
+    <p>Made with ❤️ using Streamlit | Powered by Sentence Transformers & OpenRouter API</p>
+</div>
+""", unsafe_allow_html=True)