Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import warnings
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
from scipy.spatial.distance import cosine
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
# Suppress specific FutureWarning from transformers
warnings.filterwarnings("ignore", category=FutureWarning, message=".*clean_up_tokenization_spaces.*")


@st.cache_resource
def _load_model():
    """Build the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the instance as a shared resource avoids
    constructing the model again on each rerun.
    """
    return SentenceTransformer('all-mpnet-base-v2')


# Initialize model
model = _load_model()
|
| 12 |
+
|
| 13 |
+
# Function to chunk text into smaller parts
|
| 14 |
+
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The slices in their original order. The last slice may be shorter
        than *chunk_size*, and an empty *text* yields an empty list.

    Raises:
        ValueError: If *chunk_size* is zero or negative.
    """
    if chunk_size <= 0:
        # A negative step makes range() empty, which would silently drop the
        # whole text; a zero step fails with an unrelated range() message.
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
|
| 16 |
+
|
| 17 |
+
# Function to create embeddings
|
| 18 |
+
def create_embeddings(chunks):
    """Encode *chunks* into embedding vectors using the module-level model.

    Returns the result of ``model.encode`` for *chunks*. If encoding raises,
    the error is reported in the Streamlit UI and an empty NumPy array is
    returned instead.
    """
    try:
        return model.encode(chunks, show_progress_bar=False)
    except Exception as exc:
        st.error(f"Error creating embeddings: {exc}")
    # Only reached after a failure; callers detect it through ``.size == 0``.
    return np.array([])
|
| 25 |
+
|
| 26 |
+
# Function to calculate similarity ratio and find matches
|
| 27 |
+
def calculate_similarity_ratio_and_find_matches(embeddings1, embeddings2):
    """Return the mean best-match cosine similarity from embeddings1 to embeddings2.

    Every row of *embeddings1* is compared with every row of *embeddings2*;
    the highest cosine similarity found for each row is kept and those
    maxima are averaged.

    Args:
        embeddings1: Array-like of vectors, one per row. A single 1-D vector
            is treated as one row.
        embeddings2: Array-like of vectors with the same dimensionality.

    Returns:
        The averaged similarity as a float. Returns 0 when either input is
        empty, or when the computation fails (the failure is reported through
        ``st.error``).
    """
    try:
        first = np.atleast_2d(np.asarray(embeddings1, dtype=float))
        second = np.atleast_2d(np.asarray(embeddings2, dtype=float))
        if first.size == 0 or second.size == 0:
            return 0.0

        # Scale each row to unit length so the dot product below is a true
        # cosine similarity even when the vectors are not pre-normalized.
        first_norms = np.linalg.norm(first, axis=1, keepdims=True)
        second_norms = np.linalg.norm(second, axis=1, keepdims=True)
        # A zero vector has no direction; dividing by 1 keeps it at zero so
        # it scores 0 instead of producing NaN.
        first = first / np.where(first_norms == 0, 1.0, first_norms)
        second = second / np.where(second_norms == 0, 1.0, second_norms)

        similarities = first @ second.T  # pairwise cosine similarities
        max_similarities = similarities.max(axis=1)  # best match per row of embeddings1
        return float(max_similarities.mean())
    except Exception as e:
        st.error(f"Error calculating similarity ratio: {e}")
        return 0
|
| 37 |
+
|
| 38 |
+
# Function to calculate word similarity ratio
|
| 39 |
+
def calculate_word_similarity_ratio(text1, text2):
    """Return the mean best-match cosine similarity between the words of two texts.

    Both texts are split on whitespace and every word is embedded with the
    module-level model. For each word of *text1* the highest cosine
    similarity to any word of *text2* is kept, and those maxima are averaged.

    Args:
        text1: Text whose words are matched against *text2*.
        text2: Text supplying the candidate words.

    Returns:
        The averaged similarity as a float. Returns 0 when either text
        contains no words, or when the computation fails (the failure is
        reported through ``st.error``).
    """
    try:
        words1 = text1.split()
        words2 = text2.split()

        # Handle cases where there are no words in the texts
        if not words1 or not words2:
            return 0

        word_embeddings1 = np.asarray(model.encode(words1, show_progress_bar=False), dtype=float)
        word_embeddings2 = np.asarray(model.encode(words2, show_progress_bar=False), dtype=float)

        # Normalize rows once so a single matrix product yields every pairwise
        # cosine similarity, instead of one distance call per word pair.
        norms1 = np.linalg.norm(word_embeddings1, axis=1, keepdims=True)
        norms2 = np.linalg.norm(word_embeddings2, axis=1, keepdims=True)
        # Zero vectors are left at zero (similarity 0) rather than dividing by 0.
        unit1 = word_embeddings1 / np.where(norms1 == 0, 1.0, norms1)
        unit2 = word_embeddings2 / np.where(norms2 == 0, 1.0, norms2)

        similarities = unit1 @ unit2.T
        # Best match in text2 for each word of text1, then the average.
        return float(similarities.max(axis=1).mean())
    except Exception as e:
        st.error(f"Error calculating word similarity ratio: {e}")
        return 0
|
| 63 |
+
|
| 64 |
+
# Streamlit UI
st.title("Text-Based Similarity Comparison")

# Two side-by-side text areas, one per input text
col1, col2 = st.columns(2)
text_input_1 = col1.text_area("Enter the first text", height=300)
text_input_2 = col2.text_area("Enter the second text", height=300)

# NOTE(review): the final info message is treated as the fallback for missing
# input (paired with the outer condition) — confirm against the original layout.
if not (text_input_1 and text_input_2):
    st.info("Please enter text in both fields and click Submit to check similarity.")
elif st.button("Submit"):
    # Chunk both texts and embed every chunk
    chunks_1 = chunk_text(text_input_1)
    chunks_2 = chunk_text(text_input_2)
    embeddings_1 = create_embeddings(chunks_1)
    embeddings_2 = create_embeddings(chunks_2)

    if embeddings_1.size == 0 or embeddings_2.size == 0:
        st.error("Error in processing the texts.")
    else:
        # Chunk-level ("context") similarity across the whole texts
        similarity_ratio = calculate_similarity_ratio_and_find_matches(embeddings_1, embeddings_2)

        # Word-level similarity for chunks at the same position; zip stops at
        # the shorter chunk list, so unpaired trailing chunks are not scored.
        word_similarities = [
            calculate_word_similarity_ratio(chunk_a, chunk_b) * 100
            for chunk_a, chunk_b in zip(chunks_1, chunks_2)
        ]

        # One result line per compared chunk pair, numbered from 1
        word_rows = "".join(
            '\n'
            f'<p style="font-size: 18px; color: #4682b4; margin: 5px 0;">Chunk {position}: {word_similarity:.2f}%</p>\n'
            for position, word_similarity in enumerate(word_similarities, start=1)
        )

        # Card layout: context similarity on the left, per-chunk word
        # similarity on the right. The HTML lines carry no leading whitespace.
        card_template = (
            '\n'
            '<div style="border: 1px solid #ddd; border-radius: 10px; padding: 20px; margin: 10px; width: 700px; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.1); display: flex; flex-direction: row; justify-content: space-between;">\n'
            '<div style="flex: 1; margin-right: 10px;">\n'
            '<h3 style="font-size: 18px; margin: 0;">Context Similarity</h3>\n'
            '<p style="font-size: 24px; color: #2e8b57; margin: 10px 0;">{:.2f}%</p>\n'
            '</div>\n'
            '<div style="flex: 1; margin-left: 10px;">\n'
            '<h3 style="font-size: 18px; margin: 0;">Word Similarity</h3>\n'
            '{}\n'
            '</div>\n'
            '</div>\n'
        )
        similarity_card = card_template.format(similarity_ratio * 100, word_rows)

        # Display card, centered on the page
        st.markdown(
            '\n'
            '<div style="display: flex; justify-content: center; flex-wrap: wrap;">\n'
            f'{similarity_card}\n'
            '</div>\n',
            unsafe_allow_html=True,
        )
|