vIVANsy committed on
Commit
145bb32
·
verified ·
1 Parent(s): d6627eb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -0
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import warnings
3
+ from sentence_transformers import SentenceTransformer
4
+ from scipy.spatial.distance import cosine
5
+ import numpy as np
6
+
7
# Suppress specific FutureWarning from transformers
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=".*clean_up_tokenization_spaces.*",
)


@st.cache_resource
def _load_model():
    """Build the sentence-embedding model used by the whole app.

    Streamlit re-executes this script on user interaction, so the model is
    wrapped in ``st.cache_resource`` to be constructed once and reused
    instead of being rebuilt on each rerun.
    """
    return SentenceTransformer('all-mpnet-base-v2')


# Initialize model (shared, cached instance)
model = _load_model()
12
+
13
# Function to chunk text into smaller parts
def chunk_text(text: str, chunk_size: int = 500) -> list:
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The split is purely character-based: pieces are taken left to right and
    the last piece may be shorter. An empty *text* yields an empty list.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece; must be >= 1.

    Returns:
        A list of substrings that concatenate back to *text*.

    Raises:
        ValueError: If *chunk_size* is smaller than 1. A negative size would
            otherwise produce an empty list and silently drop the whole text.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
16
+
17
# Function to create embeddings
def create_embeddings(chunks):
    """Encode text chunks with the module-level ``model``.

    Args:
        chunks: Sequence of strings to embed.

    Returns:
        Whatever ``model.encode`` returns for *chunks*, or an empty NumPy
        array when *chunks* is empty or encoding fails. The caller treats an
        array with ``size == 0`` as "no embeddings available".
    """
    # Nothing to encode: return the empty sentinel without invoking the model.
    if len(chunks) == 0:
        return np.array([])

    try:
        return model.encode(chunks, show_progress_bar=False)
    except Exception as e:
        # UI boundary: report the failure in the page and hand back the
        # empty sentinel instead of crashing the script run.
        st.error(f"Error creating embeddings: {e}")
        return np.array([])
25
+
26
# Function to calculate similarity ratio and find matches
def calculate_similarity_ratio_and_find_matches(embeddings1, embeddings2):
    """Return the mean best-match cosine similarity of *embeddings1* against *embeddings2*.

    Each row of *embeddings1* is compared with every row of *embeddings2*;
    the highest cosine similarity per row is kept and those maxima are
    averaged.

    Rows are L2-normalised before the matrix product, so the score is a true
    cosine similarity whatever the vector lengths are. For inputs that are
    already unit-length this equals the plain dot product.

    Args:
        embeddings1: Array-like of shape (n, d), or a single vector of shape (d,).
        embeddings2: Array-like of shape (m, d), or a single vector of shape (d,).

    Returns:
        The average similarity as a float. Returns 0 when either input is
        empty or when the computation fails (the failure is reported through
        ``st.error``).
    """
    try:
        left = np.atleast_2d(np.asarray(embeddings1, dtype=float))
        right = np.atleast_2d(np.asarray(embeddings2, dtype=float))

        # No vectors on one side means there is nothing to match against.
        if left.size == 0 or right.size == 0:
            return 0.0

        left_norms = np.linalg.norm(left, axis=1, keepdims=True)
        right_norms = np.linalg.norm(right, axis=1, keepdims=True)
        # A zero vector has no direction; dividing by 1 keeps it all-zero so
        # its similarity to everything is 0 instead of NaN.
        left_norms[left_norms == 0] = 1.0
        right_norms[right_norms == 0] = 1.0

        similarities = (left / left_norms) @ (right / right_norms).T
        best_per_row = np.max(similarities, axis=1)  # best match for each row of embeddings1
        return float(np.mean(best_per_row))
    except Exception as e:
        # UI boundary: surface the problem in the page and fall back to 0.
        st.error(f"Error calculating similarity ratio: {e}")
        return 0
37
+
38
# Function to calculate word similarity ratio
def calculate_word_similarity_ratio(text1, text2):
    """Return the mean best-match cosine similarity between the words of two texts.

    Both texts are split on whitespace and every word is embedded with the
    module-level ``model``. For each word of *text1* the highest cosine
    similarity to any word of *text2* is taken, and those maxima are averaged.

    The pairwise similarities are computed with a single normalised matrix
    product instead of one call per word pair.

    Args:
        text1: First text; its words are the ones being matched.
        text2: Second text; its words are the candidates to match against.

    Returns:
        The average similarity as a float. Returns 0 when either text has no
        words or when the computation fails (the failure is reported through
        ``st.error``).
    """
    try:
        words1 = text1.split()
        words2 = text2.split()

        # Handle cases where there are no words in the texts
        if not words1 or not words2:
            return 0

        # Progress bar disabled to match create_embeddings.
        vectors1 = np.asarray(model.encode(words1, show_progress_bar=False), dtype=float)
        vectors2 = np.asarray(model.encode(words2, show_progress_bar=False), dtype=float)

        norms1 = np.linalg.norm(vectors1, axis=1, keepdims=True)
        norms2 = np.linalg.norm(vectors2, axis=1, keepdims=True)
        # Zero vectors stay all-zero after division, giving similarity 0
        # rather than NaN.
        norms1[norms1 == 0] = 1.0
        norms2[norms2 == 0] = 1.0

        # Row i, column j holds the cosine similarity of words1[i] and words2[j].
        cosine_matrix = (vectors1 / norms1) @ (vectors2 / norms2).T
        return float(np.mean(np.max(cosine_matrix, axis=1)))
    except Exception as e:
        # UI boundary: surface the problem in the page and fall back to 0.
        st.error(f"Error calculating word similarity ratio: {e}")
        return 0
63
+
64
# Streamlit UI: page title followed by two side-by-side input boxes
st.title("Text-Based Similarity Comparison")

# One column per text; each text area is created directly on its column
col1, col2 = st.columns(2)
text_input_1 = col1.text_area("Enter the first text", height=300)
text_input_2 = col2.text_area("Enter the second text", height=300)
75
+
76
def _render_similarity_card(context_percent, word_percents):
    """Return the HTML for the centred result card.

    The markup is emitted as one compact string with no leading indentation
    and no blank lines. It is rendered through ``st.markdown``, and Markdown
    can treat indented lines as code or end an HTML block at a blank line.

    Args:
        context_percent: Overall context similarity, already scaled to percent.
        word_percents: Per-chunk word similarities, already scaled to percent.

    Returns:
        The HTML string for the card, wrapped in its centring container.
    """
    word_rows = "".join(
        '<p style="font-size: 18px; color: #4682b4; margin: 5px 0;">'
        f'Chunk {position}: {percent:.2f}%</p>'
        for position, percent in enumerate(word_percents, start=1)
    )
    return (
        '<div style="display: flex; justify-content: center; flex-wrap: wrap;">'
        '<div style="border: 1px solid #ddd; border-radius: 10px; padding: 20px; margin: 10px; '
        'width: 700px; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.1); '
        'display: flex; flex-direction: row; justify-content: space-between;">'
        '<div style="flex: 1; margin-right: 10px;">'
        '<h3 style="font-size: 18px; margin: 0;">Context Similarity</h3>'
        f'<p style="font-size: 24px; color: #2e8b57; margin: 10px 0;">{context_percent:.2f}%</p>'
        '</div>'
        '<div style="flex: 1; margin-left: 10px;">'
        '<h3 style="font-size: 18px; margin: 0;">Word Similarity</h3>'
        f'{word_rows}'
        '</div>'
        '</div>'
        '</div>'
    )


# Whitespace-only input counts as empty: there is nothing meaningful to compare.
if text_input_1.strip() and text_input_2.strip():
    if st.button("Submit"):
        # Process texts
        chunks_1 = chunk_text(text_input_1)
        chunks_2 = chunk_text(text_input_2)
        embeddings_1 = create_embeddings(chunks_1)
        embeddings_2 = create_embeddings(chunks_2)

        # Calculate and display similarity ratio
        if embeddings_1.size > 0 and embeddings_2.size > 0:
            similarity_ratio = calculate_similarity_ratio_and_find_matches(embeddings_1, embeddings_2)

            # Word similarity is computed for chunks at the same position;
            # zip stops at the shorter text, so surplus chunks are not scored.
            word_similarities = [
                calculate_word_similarity_ratio(chunk_1, chunk_2) * 100
                for chunk_1, chunk_2 in zip(chunks_1, chunks_2)
            ]

            # Display card
            st.markdown(
                _render_similarity_card(similarity_ratio * 100, word_similarities),
                unsafe_allow_html=True,
            )
        else:
            st.error("Error in processing the texts.")
else:
    st.info("Please enter text in both fields and click Submit to check similarity.")