Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,38 +1,72 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import google.generativeai as genai
|
|
|
|
| 3 |
|
| 4 |
# Configure Gemini API
|
| 5 |
genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
|
| 6 |
|
| 7 |
-
st.title("Embedding Test")
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
placeholder="Type your text here...")
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
else:
|
| 19 |
-
with st.spinner("
|
| 20 |
try:
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Display results
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
st.subheader("
|
| 32 |
-
st.write(
|
| 33 |
-
st.code(str(embedding))
|
| 34 |
|
| 35 |
-
st.
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
except Exception as e:
|
| 38 |
-
st.error(f"Error
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import google.generativeai as genai
|
| 3 |
+
import numpy as np
|
| 4 |
|
| 5 |
# Authenticate the Gemini client using the key kept in Streamlit's secrets store
gemini_api_key = st.secrets["GEMINI_API_KEY"]
genai.configure(api_key=gemini_api_key)

# Page heading
st.title("Text Embedding Similarity Test")
|
| 9 |
|
| 10 |
+
def split_into_chunks(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` characters.

    The split is purely positional: chunks are cut every ``chunk_size``
    characters, so the last chunk may be shorter than the others. An empty
    ``text`` yields an empty list.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of substrings that, concatenated in order, reproduce ``text``.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop the whole
    # text; a zero step raises an opaque error. Reject both up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
|
|
|
|
| 13 |
|
| 14 |
+
def get_embedding(text):
    """Return the embedding for a single piece of text.

    Passes ``text`` to ``genai.embed_content`` with the
    ``models/text-embedding-004`` model and returns whatever the response
    stores under its ``'embedding'`` key.
    """
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=text,
    )
    return response['embedding']
|
| 20 |
+
|
| 21 |
+
def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: First vector (any 1-D sequence of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine of the angle between the vectors as a float in [-1, 1].
        If either vector has zero length (norm of 0) the similarity is
        undefined, and 0.0 is returned instead of ``nan``.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case: without it numpy emits a RuntimeWarning and returns
    # nan, which then poisons any max()/sorting done on the scores.
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
|
| 24 |
+
|
| 25 |
+
# Two side-by-side text boxes: the text to be chunked on the left and the
# text it is compared against on the right.
left_column, right_column = st.columns(2)
input_text1 = left_column.text_area(
    "Enter your first text:",
    height=200,
    placeholder="Type or paste your first text here...",
)
input_text2 = right_column.text_area(
    "Enter text to compare:",
    height=200,
    placeholder="Type or paste text to compare...",
)

run_requested = st.button("Run Similarity Test")
if run_requested:
    both_texts_given = bool(input_text1.strip()) and bool(input_text2.strip())
    if not both_texts_given:
        st.warning("Please enter text in both input fields.")
    else:
        with st.spinner("Analyzing texts..."):
            try:
                # Break the first text into fixed-size pieces
                chunks = split_into_chunks(input_text1)
                if len(chunks) > 1:
                    st.info(f"Split first text into {len(chunks)} chunks")

                # Embed every chunk first, then the comparison text
                embeddings = list(map(get_embedding, chunks))
                compare_embedding = get_embedding(input_text2)

                # Score each chunk against the comparison text and pick the
                # first chunk that reaches the highest score
                similarities = [
                    cosine_similarity(chunk_embedding, compare_embedding)
                    for chunk_embedding in embeddings
                ]
                max_score = max(similarities)
                max_index = similarities.index(max_score)

                # Display results
                st.subheader("π Similarity Results")
                st.write(f"**Highest similarity score:** {max_score:.4f}")

                st.subheader("π§© Most Similar Chunk")
                st.write(chunks[max_index])

                # Per-chunk breakdown, each with a collapsible view of its text
                st.subheader("π All Chunk Similarities")
                for i, (chunk, score) in enumerate(zip(chunks, similarities)):
                    st.write(f"Chunk {i+1} ({len(chunk)} chars): {score:.4f}")
                    st.expander(f"View chunk {i+1}").write(chunk)

            except Exception as e:
                # Top-level UI boundary: surface any failure to the user
                st.error(f"Error processing texts: {str(e)}")
|