vIVANsy committed on
Commit
4c09931
·
verified ·
1 Parent(s): 80e0e29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -11
app.py CHANGED
@@ -2,7 +2,14 @@ import streamlit as st
2
  import warnings
3
  from sentence_transformers import SentenceTransformer
4
  from scipy.spatial.distance import cosine
 
5
  import numpy as np
 
 
 
 
 
 
6
 
7
  # Suppress specific FutureWarning from transformers
8
  warnings.filterwarnings("ignore", category=FutureWarning, message=".*clean_up_tokenization_spaces.*")
@@ -10,6 +17,10 @@ warnings.filterwarnings("ignore", category=FutureWarning, message=".*clean_up_to
10
  # Initialize model
11
  model = SentenceTransformer('all-mpnet-base-v2')
12
 
 
 
 
 
13
  # Function to chunk text into smaller parts
14
  def chunk_text(text, chunk_size=500):
15
  return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
@@ -26,14 +37,13 @@ def create_embeddings(chunks):
26
  # Function to calculate similarity ratio and find matches
27
  def calculate_similarity_ratio_and_find_matches(embeddings1, embeddings2):
28
  try:
29
- # Efficiently compute pairwise similarities
30
  similarities = np.dot(embeddings1, embeddings2.T) # Dot product
31
  max_similarities = np.max(similarities, axis=1) # Max similarity for each chunk in embeddings1
32
  average_similarity = np.mean(max_similarities)
33
- return average_similarity
34
  except Exception as e:
35
  st.error(f"Error calculating similarity ratio: {e}")
36
- return 0
37
 
38
  # Function to calculate word similarity ratio
39
  def calculate_word_similarity_ratio(text1, text2):
@@ -41,39 +51,75 @@ def calculate_word_similarity_ratio(text1, text2):
41
  words1 = text1.split()
42
  words2 = text2.split()
43
 
44
- # Handle cases where there are no words in the texts
45
  if not words1 or not words2:
46
  return 0
47
 
48
  word_embeddings1 = model.encode(words1)
49
  word_embeddings2 = model.encode(words2)
50
 
51
- # Calculate pairwise similarities
52
  similarities = np.array([
53
  max([1 - cosine(emb1, emb2) for emb2 in word_embeddings2], default=0)
54
  for emb1 in word_embeddings1
55
  ])
56
 
57
- # Return the average similarity if similarities array is not empty
58
  average_word_similarity = np.mean(similarities) if similarities.size > 0 else 0
59
  return average_word_similarity
60
  except Exception as e:
61
  st.error(f"Error calculating word similarity ratio: {e}")
62
  return 0
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # Streamlit UI
 
 
 
 
65
  st.title("Text-Based Similarity Comparison")
66
 
67
  # Create two columns for text input
68
  col1, col2 = st.columns(2)
69
 
70
  with col1:
71
- text_input_1 = st.text_area("Enter the first text", height=300)
 
 
 
 
 
72
 
73
  with col2:
74
- text_input_2 = st.text_area("Enter the second text", height=300)
 
 
 
 
 
75
 
76
- if text_input_1 and text_input_2:
77
  if st.button("Submit"):
78
  # Process texts
79
  chunks_1 = chunk_text(text_input_1)
@@ -83,7 +129,7 @@ if text_input_1 and text_input_2:
83
 
84
  # Calculate and display similarity ratio
85
  if embeddings_1.size > 0 and embeddings_2.size > 0:
86
- similarity_ratio = calculate_similarity_ratio_and_find_matches(embeddings_1, embeddings_2)
87
  word_similarities = []
88
 
89
  # Calculate and display word similarity ratio
@@ -92,12 +138,62 @@ if text_input_1 and text_input_2:
92
  word_similarity_ratio = calculate_word_similarity_ratio(chunks_1[i], chunks_2[i])
93
  word_similarities.append(word_similarity_ratio * 100)
94
 
 
 
 
 
95
  # Display similarity results
96
  st.write(f"**Context Similarity:** {similarity_ratio * 100:.2f}%")
 
 
97
  st.write("### Word Similarity Ratios:")
98
  for i, word_similarity in enumerate(word_similarities):
99
  st.write(f"**Chunk {i+1}:** {word_similarity:.2f}%")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  else:
101
  st.error("Error in processing the texts.")
102
  else:
103
- st.info("Please enter text in both fields and click Submit to check similarity.")
 
 
 
 
 
 
 
 
2
  import warnings
3
  from sentence_transformers import SentenceTransformer
4
  from scipy.spatial.distance import cosine
5
+ from sklearn.feature_extraction.text import CountVectorizer
6
  import numpy as np
7
+ import PyPDF2
8
+ import seaborn as sns
9
+ import matplotlib.pyplot as plt
10
+ import pandas as pd
11
+ from difflib import SequenceMatcher
12
+ import streamlit_shadcn_ui as ui
13
 
14
  # Suppress specific FutureWarning from transformers
15
  warnings.filterwarnings("ignore", category=FutureWarning, message=".*clean_up_tokenization_spaces.*")
 
17
  # Initialize model
18
@st.cache_resource
def _load_model():
    """Load the sentence-embedding model once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loader avoids re-instantiating the model on each rerun.
    """
    return SentenceTransformer('all-mpnet-base-v2')


# Module-level handle used by the similarity helpers below.
model = _load_model()
19
 
20
+ # Initialize session state for results table if not already present
21
if 'results_df' not in st.session_state:
    # Start with an empty table that has one column per reported metric.
    _result_columns = [
        "LLM1",
        "LLM2",
        "Context Similarity (%)",
        "Levenshtein Similarity (%)",
        "Jaccard Similarity (%)",
    ]
    st.session_state.results_df = pd.DataFrame(columns=_result_columns)
23
+
24
  # Function to chunk text into smaller parts
25
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings, each ``chunk_size`` characters long except
        possibly the last one. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative value
            would otherwise silently produce an empty list.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
 
37
  # Function to calculate similarity ratio and find matches
38
def calculate_similarity_ratio_and_find_matches(embeddings1, embeddings2):
    """Compute pairwise dot-product similarities and their best-match average.

    Args:
        embeddings1: Embedding matrix with one row per chunk; a single 1-D
            vector is accepted and treated as a one-row matrix.
        embeddings2: Embedding matrix (or single vector) to compare against.

    Returns:
        A ``(similarities, average_similarity)`` tuple. ``similarities`` is
        the matrix of dot products between every row of ``embeddings1`` and
        every row of ``embeddings2``; ``average_similarity`` is the mean,
        over the rows of ``embeddings1``, of each row's highest similarity.
        On failure the error is shown in the Streamlit UI and
        ``(np.array([]), 0)`` is returned.
    """
    try:
        # Promote single vectors to one-row matrices so axis=1 below is valid.
        matrix1 = np.atleast_2d(np.asarray(embeddings1))
        matrix2 = np.atleast_2d(np.asarray(embeddings2))
        if matrix1.size == 0 or matrix2.size == 0:
            raise ValueError("embeddings must not be empty")

        # NOTE(review): a dot product equals cosine similarity only for
        # unit-length vectors; presumably the embeddings are normalized
        # upstream -- confirm.
        similarities = np.dot(matrix1, matrix2.T)
        max_similarities = np.max(similarities, axis=1)  # best match per row of embeddings1
        average_similarity = np.mean(max_similarities)
        return similarities, average_similarity
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app.
        st.error(f"Error calculating similarity ratio: {e}")
        return np.array([]), 0
47
 
48
  # Function to calculate word similarity ratio
49
  def calculate_word_similarity_ratio(text1, text2):
 
51
  words1 = text1.split()
52
  words2 = text2.split()
53
 
 
54
  if not words1 or not words2:
55
  return 0
56
 
57
  word_embeddings1 = model.encode(words1)
58
  word_embeddings2 = model.encode(words2)
59
 
 
60
  similarities = np.array([
61
  max([1 - cosine(emb1, emb2) for emb2 in word_embeddings2], default=0)
62
  for emb1 in word_embeddings1
63
  ])
64
 
 
65
  average_word_similarity = np.mean(similarities) if similarities.size > 0 else 0
66
  return average_word_similarity
67
  except Exception as e:
68
  st.error(f"Error calculating word similarity ratio: {e}")
69
  return 0
70
 
71
+ # Function to extract text from PDF
72
def extract_pdf_text(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (here, a Streamlit uploaded file).

    Returns:
        The page texts concatenated in page order with no separator added.
        If reading or extraction fails, the error is shown in the Streamlit
        UI and an empty string is returned.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        # Join once instead of growing a string page by page. The
        # ``or ""`` keeps a page that yields no text from raising a
        # TypeError that would discard the whole document.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app.
        st.error(f"Error extracting text from PDF: {e}")
        return ""
82
+
83
+ # Function to calculate a Levenshtein-style similarity ratio (via difflib.SequenceMatcher)
84
def calculate_levenshtein_ratio(text1, text2):
    """Return a character-level similarity ratio in ``[0.0, 1.0]``.

    Despite the name, this is ``difflib.SequenceMatcher.ratio()``
    (``2 * matches / total length``), not a true Levenshtein edit distance.

    ``autojunk`` is disabled on purpose: with the default heuristic, once
    the second sequence has 200 or more items, any item making up more than
    about 1% of it is treated as junk. For character sequences of natural
    text that covers most letters, so the ratio of long texts collapses.
    Disabling it is more accurate but slower on very long inputs.

    Args:
        text1: First string.
        text2: Second string.

    Returns:
        The similarity ratio as a float; ``1.0`` means identical strings.
    """
    return SequenceMatcher(None, text1, text2, autojunk=False).ratio()
86
+
87
+ # Function to calculate Jaccard similarity
88
def calculate_jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the word sets of two texts.

    Tokenization mirrors the defaults of scikit-learn's ``CountVectorizer``:
    the text is lower-cased and tokens are runs of two or more word
    characters. Working on sets directly avoids the "empty vocabulary"
    ``ValueError`` the vectorizer raises when neither text contains such a
    token (for example two empty strings).

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``|A & B| / |A | B|`` for the two token sets, or ``0`` when both
        sets are empty.
    """
    import re  # local import: only this helper needs it

    token_pattern = re.compile(r"(?u)\b\w\w+\b")
    tokens1 = set(token_pattern.findall(text1.lower()))
    tokens2 = set(token_pattern.findall(text2.lower()))

    union = tokens1 | tokens2
    if not union:
        return 0
    return len(tokens1 & tokens2) / len(union)
95
+
96
  # Streamlit UI
97
st.sidebar.title("LLM Details")
llm1_name = st.sidebar.text_input("What is LLM1?", "LLM1")
llm2_name = st.sidebar.text_input("What is LLM2?", "LLM2")

st.title("Text-Based Similarity Comparison")

# Create two columns for text input
col1, col2 = st.columns(2)


def _collect_response(column, llm_name, pdf_key, text_key):
    """Render one response column and return ``(uploaded_pdf, text)``.

    When a PDF is uploaded its extracted text is used; otherwise a text
    area is shown and its current content is returned.
    """
    with column:
        st.write(f"**{llm_name} response**")
        uploaded = st.file_uploader(f"Upload PDF for {llm_name} response", type="pdf", key=pdf_key)
        if uploaded:
            return uploaded, extract_pdf_text(uploaded)
        return uploaded, st.text_area(f" Text for {llm_name}", height=150, key=text_key)


upload_pdf_1, text_input_1 = _collect_response(col1, llm1_name, "pdf1", "text1")
upload_pdf_2, text_input_2 = _collect_response(col2, llm2_name, "pdf2", "text2")
121
 
122
+ if (text_input_1 and text_input_2) or (upload_pdf_1 and upload_pdf_2):
123
  if st.button("Submit"):
124
  # Process texts
125
  chunks_1 = chunk_text(text_input_1)
 
129
 
130
  # Calculate and display similarity ratio
131
  if embeddings_1.size > 0 and embeddings_2.size > 0:
132
+ similarities, similarity_ratio = calculate_similarity_ratio_and_find_matches(embeddings_1, embeddings_2)
133
  word_similarities = []
134
 
135
  # Calculate and display word similarity ratio
 
138
  word_similarity_ratio = calculate_word_similarity_ratio(chunks_1[i], chunks_2[i])
139
  word_similarities.append(word_similarity_ratio * 100)
140
 
141
+ # Calculate Levenshtein and Jaccard ratios
142
+ levenshtein_ratio = calculate_levenshtein_ratio(text_input_1, text_input_2) * 100
143
+ jaccard_similarity = calculate_jaccard_similarity(text_input_1, text_input_2) * 100
144
+
145
  # Display similarity results
146
  st.write(f"**Context Similarity:** {similarity_ratio * 100:.2f}%")
147
+ st.write(f"**Levenshtein Similarity:** {levenshtein_ratio:.2f}%")
148
+ st.write(f"**Jaccard Similarity:** {jaccard_similarity:.2f}%")
149
  st.write("### Word Similarity Ratios:")
150
  for i, word_similarity in enumerate(word_similarities):
151
  st.write(f"**Chunk {i+1}:** {word_similarity:.2f}%")
152
+
153
+ # Update session state DataFrame
154
+ new_result = {
155
+ "LLM1": llm1_name,
156
+ "LLM2": llm2_name,
157
+ "Context Similarity (%)": similarity_ratio * 100,
158
+ "Levenshtein Similarity (%)": levenshtein_ratio,
159
+ "Jaccard Similarity (%)": jaccard_similarity
160
+ }
161
+ st.session_state.results_df = pd.concat([st.session_state.results_df, pd.DataFrame([new_result])], ignore_index=True)
162
+
163
+ # Display updated table
164
+ st.write("### Similarity Results")
165
+ ui.table(data=st.session_state.results_df, maxHeight=300)
166
+
167
+ # Plot similarity results
168
+ st.write("### Similarity Metrics Visualization")
169
+
170
+ # Plot using matplotlib
171
+ fig, ax = plt.subplots(figsize=(12, 6))
172
+ metrics_df = st.session_state.results_df.copy()
173
+ metrics_df['Index'] = metrics_df.index
174
+
175
+ # Plot each metric
176
+ ax.plot(metrics_df['Index'], metrics_df['Context Similarity (%)'], label='Context Similarity', marker='o')
177
+ ax.plot(metrics_df['Index'], metrics_df['Levenshtein Similarity (%)'], label='Levenshtein Similarity', marker='o')
178
+ ax.plot(metrics_df['Index'], metrics_df['Jaccard Similarity (%)'], label='Jaccard Similarity', marker='o')
179
+
180
+ # Labels and title
181
+ ax.set_xlabel('Comparison Index')
182
+ ax.set_ylabel('Percentage')
183
+ ax.set_title('Similarity Metrics Over Comparisons')
184
+ ax.legend()
185
+ ax.grid(True)
186
+
187
+ st.pyplot(fig)
188
+
189
  else:
190
  st.error("Error in processing the texts.")
191
  else:
192
+ st.info("Please enter text in both fields or upload PDFs, and click Submit to check similarity.")
193
+
194
+ # Add Clear button
195
if st.button("Clear All"):
    # Reset the accumulated results to an empty table with the same columns.
    st.session_state.results_df = pd.DataFrame(columns=["LLM1", "LLM2", "Context Similarity (%)", "Levenshtein Similarity (%)", "Jaccard Similarity (%)"])
    # NOTE(review): the rerun below restarts the script immediately, so this
    # message is unlikely to stay visible -- confirm the intended UX.
    st.success("All results have been cleared.")
    # Prefer the stable st.rerun(); fall back to the older experimental
    # name on Streamlit versions that predate it.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun
    _rerun()