uumerrr684 committed on
Commit
7b83a38
·
verified ·
1 Parent(s): 90a486a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -68
app.py CHANGED
@@ -57,10 +57,10 @@ with st.sidebar:
57
  col1, col2 = st.columns(2)
58
 
59
  with col1:
60
- sentence1 = st.text_input("Enter first sentence:", placeholder="e.g., you are pretty")
61
 
62
  with col2:
63
- sentence2 = st.text_input("Enter second sentence:", placeholder="e.g., you are ugly")
64
 
65
  # Calculate button
66
  if st.button("🎯 Calculate & Explain", type="primary"):
@@ -83,65 +83,84 @@ if st.button("🎯 Calculate & Explain", type="primary"):
83
  # Calculate cosine similarity
84
  similarity = cosine_similarity(embedding1, embedding2)[0][0]
85
 
86
- # Round to 2 decimal places
87
- similarity_rounded = round(similarity, 2)
 
88
 
89
  # Display similarity score
90
  st.success(f"**Semantic similarity between:**")
91
  st.info(f'"{sentence1}" and "{sentence2}" → **{similarity_rounded:.2f}**')
92
 
93
- # Show similarity meter
94
- progress_color = "normal"
95
  if similarity_rounded < 0.3:
96
- progress_color = "normal"
97
  similarity_desc = "Low similarity"
98
  elif similarity_rounded < 0.7:
99
  similarity_desc = "Moderate similarity"
100
  else:
101
  similarity_desc = "High similarity"
102
 
103
- st.progress(similarity_rounded, text=similarity_desc)
 
104
 
105
- # Create the prompt for the AI
106
- prompt = f"""You are an expert in Natural Language Processing and semantic similarity using transformer-based embeddings.
107
 
108
- I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which generates 384-dimensional dense vector embeddings that capture semantic meaning.
109
 
110
- Original Sentence 1: "{sentence1}"
111
- Original Sentence 2: "{sentence2}"
 
 
112
 
113
- Normalized (lowercase) for embedding:
114
- Sentence 1: "{sentence1_normalized}"
115
- Sentence 2: "{sentence2_normalized}"
116
 
117
- Calculated Semantic Similarity Score: {similarity_rounded:.2f}
118
 
119
- Please explain:
120
- 1. What this similarity score means (0.00 = completely different meaning, 1.00 = identical meaning)
121
- 2. Why these two specific sentences resulted in a score of {similarity_rounded:.2f}
122
- 3. What semantic features (meaning, context, sentiment) contributed to this score
123
- 4. How transformer embeddings capture deeper meaning beyond just word overlap
124
- 5. Whether this score makes intuitive sense given the semantic relationship between the sentences
125
 
126
- Note: This uses semantic embeddings, not TF-IDF, so the score reflects actual meaning similarity, not just word overlap."""
 
 
 
127
 
128
- # Call OpenRouter API
129
- with st.spinner("Getting AI explanation..."):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  headers = {
131
  "Authorization": f"Bearer {api_key}",
132
  "Content-Type": "application/json",
133
- "HTTP-Referer": "https://github.com/yourusername/semantic-similarity-app",
134
  "X-Title": "Semantic Similarity Explainer"
135
  }
136
 
137
  data = {
138
  "model": "openai/gpt-3.5-turbo",
139
  "messages": [
140
- {"role": "system", "content": "You are an expert in NLP and transformer-based semantic similarity analysis. Provide clear, educational explanations about how embeddings capture meaning."},
141
- {"role": "user", "content": prompt}
 
 
 
 
 
 
142
  ],
143
- "temperature": 0.7,
144
- "max_tokens": 600
145
  }
146
 
147
  response = requests.post(
@@ -155,54 +174,70 @@ Note: This uses semantic embeddings, not TF-IDF, so the score reflects actual me
155
  explanation = result['choices'][0]['message']['content']
156
 
157
  # Display results in tabs
158
- tab1, tab2, tab3 = st.tabs(["📊 AI Explanation", "📝 Full Prompt Sent", "🔧 Technical Details"])
159
 
160
  with tab1:
161
- st.markdown("### AI Explanation")
162
- st.markdown(explanation)
 
 
 
 
 
 
 
 
163
 
164
  with tab2:
165
- st.markdown("### Full Prompt Sent to GPT-3.5-turbo")
166
- st.code(prompt, language="text")
 
 
 
 
 
 
 
 
 
167
 
168
  with tab3:
169
- st.markdown("### Technical Details")
170
 
171
  col1, col2 = st.columns(2)
172
 
173
  with col1:
174
- st.markdown("**Sentence 1 Details:**")
175
  st.text(f"Original: {sentence1}")
176
  st.text(f"Normalized: {sentence1_normalized}")
177
  st.text(f"Embedding shape: {embedding1.shape}")
178
- st.text(f"Embedding norm: {np.linalg.norm(embedding1):.4f}")
179
 
180
- # Show first 10 dimensions of embedding
181
  st.markdown("**First 10 embedding dimensions:**")
182
  embedding_preview = embedding1[0][:10]
183
  for i, val in enumerate(embedding_preview):
184
  st.text(f"Dim {i}: {val:.4f}")
185
 
186
  with col2:
187
- st.markdown("**Sentence 2 Details:**")
188
  st.text(f"Original: {sentence2}")
189
  st.text(f"Normalized: {sentence2_normalized}")
190
  st.text(f"Embedding shape: {embedding2.shape}")
191
- st.text(f"Embedding norm: {np.linalg.norm(embedding2):.4f}")
192
 
193
- # Show first 10 dimensions of embedding
194
  st.markdown("**First 10 embedding dimensions:**")
195
  embedding_preview = embedding2[0][:10]
196
  for i, val in enumerate(embedding_preview):
197
  st.text(f"Dim {i}: {val:.4f}")
198
 
199
  st.markdown("---")
200
- st.markdown("**Embedding Statistics:**")
 
201
  col1, col2, col3 = st.columns(3)
202
 
203
  with col1:
204
  st.metric("Embedding Dimensions", "384")
205
- st.metric("Exact Similarity", f"{similarity:.6f}")
206
 
207
  with col2:
208
  st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
@@ -211,10 +246,10 @@ Note: This uses semantic embeddings, not TF-IDF, so the score reflects actual me
211
 
212
  with col3:
213
  # Calculate angle between vectors
214
- angle = np.arccos(np.clip(similarity, -1.0, 1.0))
215
  angle_degrees = np.degrees(angle)
216
- st.metric("Angle (degrees)", f"{angle_degrees:.2f}°")
217
- st.metric("Model", "all-MiniLM-L6-v2")
218
 
219
  # Save to history
220
  st.session_state.history.append({
@@ -224,12 +259,15 @@ Note: This uses semantic embeddings, not TF-IDF, so the score reflects actual me
224
  "explanation": explanation
225
  })
226
 
 
 
227
  else:
228
- st.error(f"API Error: {response.status_code}")
229
- st.error(response.text)
230
 
231
  except Exception as e:
232
- st.error(f"An error occurred: {str(e)}")
 
233
 
234
  # Display history
235
  if st.session_state.history:
@@ -237,33 +275,33 @@ if st.session_state.history:
237
  st.markdown("### 📜 Previous Calculations")
238
 
239
  for i, item in enumerate(reversed(st.session_state.history[-5:])): # Show last 5
240
- with st.expander(f"'{item['sentence1']}' vs '{item['sentence2']}' - Score: {item['similarity']:.2f}"):
241
  st.markdown(item['explanation'])
242
 
243
  # Info box about semantic similarity
244
- with st.expander("ℹ️ Understanding Semantic Similarity"):
245
  st.markdown("""
246
- ### Semantic Similarity vs Word Overlap
247
-
248
- **Transformer-based embeddings** (like all-MiniLM-L6-v2) capture the **actual meaning** of sentences, not just word overlap.
249
 
250
- Examples:
251
- - "The car is fast" vs "The automobile is quick" → High similarity (~0.90)
252
- - "I love dogs" vs "I hate dogs" → Moderate similarity (~0.60) - similar topic, opposite sentiment
253
- - "You are pretty" vs "You are ugly" → Moderate similarity (~0.40-0.50) - same structure, opposite meaning
254
- - "The cat sat on the mat" vs "Python is a programming language" → Low similarity (~0.10)
 
255
 
256
- The model understands:
257
- - **Synonyms** (car/automobile, fast/quick)
258
- - **Context** (word meanings in sentences)
259
- - **Semantic relationships** (opposites, related concepts)
260
- - **Sentence structure** and grammatical patterns
261
  """)
262
 
263
  # Footer
264
  st.markdown("---")
265
  st.markdown("""
266
  <div style='text-align: center'>
267
- <p>Made with ❤️ using Streamlit | Powered by Sentence Transformers & OpenRouter API</p>
 
268
  </div>
269
- """, unsafe_allow_html=True)
 
57
  col1, col2 = st.columns(2)
58
 
59
  with col1:
60
+ sentence1 = st.text_input("Enter first sentence:", placeholder="e.g., you are hot")
61
 
62
  with col2:
63
+ sentence2 = st.text_input("Enter second sentence:", placeholder="e.g., you are cold")
64
 
65
  # Calculate button
66
  if st.button("🎯 Calculate & Explain", type="primary"):
 
83
  # Calculate cosine similarity
84
  similarity = cosine_similarity(embedding1, embedding2)[0][0]
85
 
86
+ # Convert to Python float to fix the progress bar error
87
+ similarity_float = float(similarity)
88
+ similarity_rounded = round(similarity_float, 2)
89
 
90
  # Display similarity score
91
  st.success(f"**Semantic similarity between:**")
92
  st.info(f'"{sentence1}" and "{sentence2}" → **{similarity_rounded:.2f}**')
93
 
94
+ # Show similarity meter (fixed the float32 error)
 
95
  if similarity_rounded < 0.3:
 
96
  similarity_desc = "Low similarity"
97
  elif similarity_rounded < 0.7:
98
  similarity_desc = "Moderate similarity"
99
  else:
100
  similarity_desc = "High similarity"
101
 
102
+ # Convert to regular Python float for progress bar
103
+ st.progress(float(similarity_rounded), text=similarity_desc)
104
 
105
+ # Create a comprehensive prompt for the AI to explain WHY this specific score occurred
106
+ detailed_prompt = f"""You are an expert in Natural Language Processing and semantic similarity analysis using transformer-based embeddings.
107
 
108
+ I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which creates 384-dimensional vector embeddings that capture deep semantic meaning.
109
 
110
+ **ANALYSIS REQUEST:**
111
+ Sentence 1: "{sentence1}"
112
+ Sentence 2: "{sentence2}"
113
+ Cosine Similarity Score: {similarity_rounded:.2f}
114
 
115
+ Please provide a detailed explanation of WHY these two specific sentences resulted in a similarity score of {similarity_rounded:.2f}.
 
 
116
 
117
+ **Your analysis should cover:**
118
 
119
+ 1. **Score Interpretation**: What does {similarity_rounded:.2f} mean on the 0.00-1.00 scale? Is this low, moderate, or high similarity?
 
 
 
 
 
120
 
121
+ 2. **Semantic Analysis**:
122
+ - What are the key semantic elements in each sentence?
123
+ - What similarities did the transformer model detect?
124
+ - What differences contributed to the score not being higher/lower?
125
 
126
+ 3. **Linguistic Features**:
127
+ - Sentence structure patterns
128
+ - Word relationships (synonyms, antonyms, related concepts)
129
+ - Grammatical similarities
130
+ - Contextual meaning
131
+
132
+ 4. **Transformer Model Behavior**:
133
+ - How does all-MiniLM-L6-v2 process these sentences?
134
+ - What semantic features likely contributed most to this score?
135
+ - Why this score makes sense from a deep learning perspective
136
+
137
+ 5. **Intuitive Validation**: Does this {similarity_rounded:.2f} score match what a human would expect when comparing these sentences?
138
+
139
+ Please be specific about these exact sentences and this exact score of {similarity_rounded:.2f}. Explain the reasoning behind this particular numerical result."""
140
+
141
+ # Call OpenRouter API with the detailed prompt
142
+ with st.spinner("🤖 AI is analyzing why you got this specific similarity score..."):
143
  headers = {
144
  "Authorization": f"Bearer {api_key}",
145
  "Content-Type": "application/json",
146
+ "HTTP-Referer": "https://github.com/semantic-similarity-app",
147
  "X-Title": "Semantic Similarity Explainer"
148
  }
149
 
150
  data = {
151
  "model": "openai/gpt-3.5-turbo",
152
  "messages": [
153
+ {
154
+ "role": "system",
155
+ "content": "You are an expert NLP researcher specializing in transformer-based semantic similarity analysis. Provide detailed, educational explanations about how specific cosine similarity scores are generated by embedding models."
156
+ },
157
+ {
158
+ "role": "user",
159
+ "content": detailed_prompt
160
+ }
161
  ],
162
+ "temperature": 0.3, # Lower temperature for more focused explanations
163
+ "max_tokens": 800
164
  }
165
 
166
  response = requests.post(
 
174
  explanation = result['choices'][0]['message']['content']
175
 
176
  # Display results in tabs
177
+ tab1, tab2, tab3 = st.tabs(["🤖 AI Explanation", "📝 Prompt Sent to AI", "🔧 Technical Details"])
178
 
179
  with tab1:
180
+ st.markdown("### 🧠 Why You Got This Similarity Score")
181
+ st.markdown("**AI Analysis:**")
182
+
183
+ # Create a nice container for the AI explanation
184
+ with st.container():
185
+ st.markdown(f"""
186
+ <div style="background-color: #f0f2f6; padding: 20px; border-radius: 10px; border-left: 4px solid #1f77b4;">
187
+ {explanation}
188
+ </div>
189
+ """, unsafe_allow_html=True)
190
 
191
  with tab2:
192
+ st.markdown("### 📤 Exact Prompt Sent to GPT-3.5-Turbo")
193
+ st.markdown("This is exactly what was sent to the AI to generate the explanation:")
194
+ st.code(detailed_prompt, language="text")
195
+
196
+ st.markdown("**API Details:**")
197
+ st.json({
198
+ "model": "openai/gpt-3.5-turbo",
199
+ "temperature": 0.3,
200
+ "max_tokens": 800,
201
+ "system_message": "You are an expert NLP researcher..."
202
+ })
203
 
204
  with tab3:
205
+ st.markdown("### πŸ”§ Technical Details")
206
 
207
  col1, col2 = st.columns(2)
208
 
209
  with col1:
210
+ st.markdown("**Sentence 1 Analysis:**")
211
  st.text(f"Original: {sentence1}")
212
  st.text(f"Normalized: {sentence1_normalized}")
213
  st.text(f"Embedding shape: {embedding1.shape}")
214
+ st.text(f"Embedding L2 norm: {np.linalg.norm(embedding1):.4f}")
215
 
 
216
  st.markdown("**First 10 embedding dimensions:**")
217
  embedding_preview = embedding1[0][:10]
218
  for i, val in enumerate(embedding_preview):
219
  st.text(f"Dim {i}: {val:.4f}")
220
 
221
  with col2:
222
+ st.markdown("**Sentence 2 Analysis:**")
223
  st.text(f"Original: {sentence2}")
224
  st.text(f"Normalized: {sentence2_normalized}")
225
  st.text(f"Embedding shape: {embedding2.shape}")
226
+ st.text(f"Embedding L2 norm: {np.linalg.norm(embedding2):.4f}")
227
 
 
228
  st.markdown("**First 10 embedding dimensions:**")
229
  embedding_preview = embedding2[0][:10]
230
  for i, val in enumerate(embedding_preview):
231
  st.text(f"Dim {i}: {val:.4f}")
232
 
233
  st.markdown("---")
234
+ st.markdown("**Similarity Computation Details:**")
235
+
236
  col1, col2, col3 = st.columns(3)
237
 
238
  with col1:
239
  st.metric("Embedding Dimensions", "384")
240
+ st.metric("Exact Similarity", f"{similarity_float:.6f}")
241
 
242
  with col2:
243
  st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
 
246
 
247
  with col3:
248
  # Calculate angle between vectors
249
+ angle = np.arccos(np.clip(similarity_float, -1.0, 1.0))
250
  angle_degrees = np.degrees(angle)
251
+ st.metric("Vector Angle (degrees)", f"{angle_degrees:.2f}°")
252
+ st.metric("Model Used", "all-MiniLM-L6-v2")
253
 
254
  # Save to history
255
  st.session_state.history.append({
 
259
  "explanation": explanation
260
  })
261
 
262
+ st.success("✅ Analysis complete! Check the tabs above for detailed explanations.")
263
+
264
  else:
265
+ st.error(f"❌ API Error: {response.status_code}")
266
+ st.error(f"Response: {response.text}")
267
 
268
  except Exception as e:
269
+ st.error(f"❌ An error occurred: {str(e)}")
270
+ st.error("Please check your API key and internet connection.")
271
 
272
  # Display history
273
  if st.session_state.history:
 
275
  st.markdown("### 📜 Previous Calculations")
276
 
277
  for i, item in enumerate(reversed(st.session_state.history[-5:])): # Show last 5
278
+ with st.expander(f"'{item['sentence1']}' vs '{item['sentence2']}' → Score: {item['similarity']:.2f}"):
279
  st.markdown(item['explanation'])
280
 
281
  # Info box about semantic similarity
282
+ with st.expander("ℹ️ Understanding Semantic Similarity Scores"):
283
  st.markdown("""
284
+ ### How to Interpret Cosine Similarity Scores
 
 
285
 
286
+ **What the numbers mean:**
287
+ - **0.90 - 1.00**: Nearly identical meaning (e.g., "The car is fast" vs "The automobile is quick")
288
+ - **0.70 - 0.89**: High semantic similarity (e.g., "I love dogs" vs "I adore puppies")
289
+ - **0.50 - 0.69**: Moderate similarity (e.g., "You are hot" vs "You are cold" - same structure, opposite meaning)
290
+ - **0.30 - 0.49**: Low similarity (e.g., "I like pizza" vs "Mathematics is difficult")
291
+ - **0.00 - 0.29**: Very low similarity (e.g., "Hello world" vs "Quantum physics equations")
292
 
293
+ **Why transformer embeddings are powerful:**
294
+ - They understand **context** and **meaning**, not just word overlap
295
+ - They capture **relationships** between words (synonyms, antonyms, related concepts)
296
+ - They consider **sentence structure** and **grammatical patterns**
297
+ - They detect **semantic similarity** even with different words
298
  """)
299
 
300
  # Footer
301
  st.markdown("---")
302
  st.markdown("""
303
  <div style='text-align: center'>
304
+ <p>🚀 Made with ❤️ using Streamlit | Powered by Sentence Transformers & OpenRouter API</p>
305
+ <p><small>Each calculation automatically sends your sentences and similarity score to GPT-3.5-turbo for detailed analysis</small></p>
306
  </div>
307
+ """, unsafe_allow_html=True)