uumerrr684 committed on
Commit
79d1bc3
·
verified ·
1 Parent(s): 8828e89

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +269 -0
app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import requests
6
+ import json
7
+ import os
8
+
9
# Browser-tab metadata and overall page layout.
st.set_page_config(
    page_title="Semantic Similarity Explainer",
    page_icon="🔍",
    layout="wide",
)

# Page heading followed by a one-paragraph summary of what the app does.
st.title("🔍 Semantic Similarity Explainer with AI")
st.markdown(
    "\n"
    "This app calculates the **semantic similarity** between two sentences using transformer-based embeddings (all-MiniLM-L6-v2) and uses AI to explain why that specific score makes sense.\n"
)
21
+
22
# Seed the per-session list of past comparisons if it does not exist yet.
if "history" not in st.session_state:
    st.session_state["history"] = []
25
+
26
@st.cache_resource
def load_model():
    """Construct the all-MiniLM-L6-v2 sentence encoder.

    Wrapped in ``st.cache_resource`` so the model loading is cached rather
    than repeated.

    Returns:
        The ``SentenceTransformer`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    encoder = SentenceTransformer(model_id)
    return encoder
30
+
31
# Obtain the embedding model, showing a spinner while it is being loaded.
loading_spinner = st.spinner("Loading transformer model...")
with loading_spinner:
    model = load_model()
34
+
35
# Sidebar: collects the OpenRouter API key and describes the pipeline.
with st.sidebar:
    st.header("⚙️ Configuration")
    # type="password" masks the key while it is being typed.
    api_key = st.text_input(
        "OpenRouter API Key",
        type="password",
        help="Get your API key from https://openrouter.ai/keys",
    )

    st.markdown("---")
    st.markdown(
        "\n"
        "### How it works:\n"
        "1. Enter two sentences\n"
        "2. Generate embeddings using transformer\n"
        "3. Calculate cosine similarity\n"
        "4. AI explains the similarity score\n"
        "5. View the full prompt sent to AI\n"
    )

    st.info(
        "\n"
        "**Model:** all-MiniLM-L6-v2\n"
        "\n"
        "This transformer model creates 384-dimensional embeddings that capture semantic meaning, not just word overlap.\n"
    )
55
+
56
# Main content: two side-by-side text boxes, one per sentence to compare.
left_col, right_col = st.columns(2)
sentence1 = left_col.text_input("Enter first sentence:", placeholder="e.g., you are pretty")
sentence2 = right_col.text_input("Enter second sentence:", placeholder="e.g., you are ugly")
64
+
65
# Endpoint and request settings for the explanation call.
_OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Upper bound (seconds) on the HTTP call so a stalled connection cannot hang
# the script run indefinitely.
_REQUEST_TIMEOUT_SECONDS = 60


def _similarity_label(score):
    """Return the coarse label shown next to the similarity meter."""
    if score < 0.3:
        return "Low similarity"
    if score < 0.7:
        return "Moderate similarity"
    return "High similarity"


def _build_prompt(sentence1, sentence2, sentence1_normalized, sentence2_normalized, similarity_rounded):
    """Assemble the user prompt asking the LLM to explain the score.

    Args:
        sentence1: First sentence exactly as the user typed it.
        sentence2: Second sentence exactly as the user typed it.
        sentence1_normalized: Lower-cased, stripped form that was embedded.
        sentence2_normalized: Lower-cased, stripped form that was embedded.
        similarity_rounded: Cosine similarity rounded to two decimals.

    Returns:
        The full prompt text.
    """
    return (
        "You are an expert in Natural Language Processing and semantic similarity using transformer-based embeddings.\n"
        "\n"
        "I have calculated the semantic similarity between two sentences using the 'all-MiniLM-L6-v2' transformer model, which generates 384-dimensional dense vector embeddings that capture semantic meaning.\n"
        "\n"
        f'Original Sentence 1: "{sentence1}"\n'
        f'Original Sentence 2: "{sentence2}"\n'
        "\n"
        "Normalized (lowercase) for embedding:\n"
        f'Sentence 1: "{sentence1_normalized}"\n'
        f'Sentence 2: "{sentence2_normalized}"\n'
        "\n"
        f"Calculated Semantic Similarity Score: {similarity_rounded:.2f}\n"
        "\n"
        "Please explain:\n"
        "1. What this similarity score means (0.00 = completely different meaning, 1.00 = identical meaning)\n"
        f"2. Why these two specific sentences resulted in a score of {similarity_rounded:.2f}\n"
        "3. What semantic features (meaning, context, sentiment) contributed to this score\n"
        "4. How transformer embeddings capture deeper meaning beyond just word overlap\n"
        "5. Whether this score makes intuitive sense given the semantic relationship between the sentences\n"
        "\n"
        "Note: This uses semantic embeddings, not TF-IDF, so the score reflects actual meaning similarity, not just word overlap."
    )


def _request_explanation(api_key, prompt):
    """POST the prompt to OpenRouter and return the HTTP response object."""
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/yourusername/semantic-similarity-app",
        "X-Title": "Semantic Similarity Explainer",
    }
    data = {
        "model": "openai/gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are an expert in NLP and transformer-based semantic similarity analysis. Provide clear, educational explanations about how embeddings capture meaning."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.7,
        "max_tokens": 600,
    }
    return requests.post(
        _OPENROUTER_URL,
        headers=headers,
        json=data,
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )


def _render_sentence_details(heading, original, normalized, embedding):
    """Show one sentence's text, embedding shape/norm and first 10 values.

    ``embedding`` is the (1, n) array produced by ``reshape(1, -1)`` in the
    button handler.
    """
    st.markdown(heading)
    st.text(f"Original: {original}")
    st.text(f"Normalized: {normalized}")
    st.text(f"Embedding shape: {embedding.shape}")
    st.text(f"Embedding norm: {np.linalg.norm(embedding):.4f}")

    st.markdown("**First 10 embedding dimensions:**")
    for i, val in enumerate(embedding[0][:10]):
        st.text(f"Dim {i}: {val:.4f}")


def _render_embedding_statistics(embedding1, embedding2, similarity, similarity_rounded):
    """Show summary metrics (dimensions, similarity, dot product, angle)."""
    stats_left, stats_mid, stats_right = st.columns(3)

    with stats_left:
        # Read the width from the array instead of hard-coding the model's size.
        st.metric("Embedding Dimensions", str(embedding1.shape[1]))
        st.metric("Exact Similarity", f"{similarity:.6f}")

    with stats_mid:
        st.metric("Rounded Similarity", f"{similarity_rounded:.2f}")
        dot_product = np.dot(embedding1[0], embedding2[0])
        st.metric("Dot Product", f"{dot_product:.4f}")

    with stats_right:
        # Clip first: floating-point error can push the cosine just outside
        # [-1, 1], where arccos is undefined.
        angle_degrees = np.degrees(np.arccos(np.clip(similarity, -1.0, 1.0)))
        st.metric("Angle (degrees)", f"{angle_degrees:.2f}°")
        st.metric("Model", "all-MiniLM-L6-v2")


# Calculate button: embed both sentences, score them, then ask the LLM why.
if st.button("🎯 Calculate & Explain", type="primary"):
    if not sentence1 or not sentence2:
        st.error("Please enter both sentences!")
    elif not api_key:
        st.error("Please enter your OpenRouter API key in the sidebar!")
    else:
        # Broad catch on purpose: this is the UI boundary, and any failure
        # (model, network, malformed API payload) is reported to the user
        # instead of crashing the script run.
        try:
            # Normalize to lowercase for consistency
            sentence1_normalized = sentence1.lower().strip()
            sentence2_normalized = sentence2.lower().strip()

            with st.spinner("Generating semantic embeddings..."):
                embeddings = model.encode([sentence1_normalized, sentence2_normalized])
                embedding1 = embeddings[0].reshape(1, -1)
                embedding2 = embeddings[1].reshape(1, -1)

                # Indexing the result matrix yields a NumPy scalar; convert to
                # a builtin float so downstream widgets get a plain number.
                similarity = float(cosine_similarity(embedding1, embedding2)[0][0])

            similarity_rounded = round(similarity, 2)

            st.success("**Semantic similarity between:**")
            st.info(f'"{sentence1}" and "{sentence2}" → **{similarity_rounded:.2f}**')

            # Cosine similarity lies in [-1, 1] but the progress bar only
            # accepts a float in [0.0, 1.0], so clamp the value for the meter.
            st.progress(
                min(max(similarity_rounded, 0.0), 1.0),
                text=_similarity_label(similarity_rounded),
            )

            prompt = _build_prompt(
                sentence1,
                sentence2,
                sentence1_normalized,
                sentence2_normalized,
                similarity_rounded,
            )

            with st.spinner("Getting AI explanation..."):
                response = _request_explanation(api_key, prompt)

            if response.status_code == 200:
                result = response.json()
                explanation = result['choices'][0]['message']['content']

                tab1, tab2, tab3 = st.tabs(["📊 AI Explanation", "📝 Full Prompt Sent", "🔧 Technical Details"])

                with tab1:
                    st.markdown("### AI Explanation")
                    st.markdown(explanation)

                with tab2:
                    st.markdown("### Full Prompt Sent to GPT-3.5-turbo")
                    st.code(prompt, language="text")

                with tab3:
                    st.markdown("### Technical Details")

                    col1, col2 = st.columns(2)
                    with col1:
                        _render_sentence_details(
                            "**Sentence 1 Details:**",
                            sentence1,
                            sentence1_normalized,
                            embedding1,
                        )
                    with col2:
                        _render_sentence_details(
                            "**Sentence 2 Details:**",
                            sentence2,
                            sentence2_normalized,
                            embedding2,
                        )

                    st.markdown("---")
                    st.markdown("**Embedding Statistics:**")
                    _render_embedding_statistics(
                        embedding1, embedding2, similarity, similarity_rounded
                    )

                # Save to history
                st.session_state.history.append({
                    "sentence1": sentence1,
                    "sentence2": sentence2,
                    "similarity": similarity_rounded,
                    "explanation": explanation,
                })
            else:
                st.error(f"API Error: {response.status_code}")
                st.error(response.text)

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
233
+
234
# Past comparisons from this session, newest first; only the five most
# recent entries are shown.
if st.session_state.history:
    st.markdown("---")
    st.markdown("### 📜 Previous Calculations")

    for item in reversed(st.session_state.history[-5:]):
        label = f"'{item['sentence1']}' vs '{item['sentence2']}' - Score: {item['similarity']:.2f}"
        with st.expander(label):
            st.markdown(item['explanation'])
242
+
243
# Collapsible background note contrasting embedding similarity with word overlap.
with st.expander("ℹ️ Understanding Semantic Similarity"):
    st.markdown(
        "\n"
        "### Semantic Similarity vs Word Overlap\n"
        "\n"
        "**Transformer-based embeddings** (like all-MiniLM-L6-v2) capture the **actual meaning** of sentences, not just word overlap.\n"
        "\n"
        "Examples:\n"
        '- "The car is fast" vs "The automobile is quick" → High similarity (~0.90)\n'
        '- "I love dogs" vs "I hate dogs" → Moderate similarity (~0.60) - similar topic, opposite sentiment\n'
        '- "You are pretty" vs "You are ugly" → Moderate similarity (~0.40-0.50) - same structure, opposite meaning\n'
        '- "The cat sat on the mat" vs "Python is a programming language" → Low similarity (~0.10)\n'
        "\n"
        "The model understands:\n"
        "- **Synonyms** (car/automobile, fast/quick)\n"
        "- **Context** (word meanings in sentences)\n"
        "- **Semantic relationships** (opposites, related concepts)\n"
        "- **Sentence structure** and grammatical patterns\n"
    )
262
+
263
# Footer: centered credits line. The text is raw HTML, hence
# unsafe_allow_html=True; it is a fixed literal with no user input in it.
st.markdown("---")
st.markdown(
    "\n"
    "<div style='text-align: center'>\n"
    "<p>Made with ❤️ using Streamlit | Powered by Sentence Transformers & OpenRouter API</p>\n"
    "</div>\n",
    unsafe_allow_html=True,
)