Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics import pairwise_distances | |
| from rouge_score import rouge_scorer | |
| import gensim.downloader as api | |
| from sentence_transformers import SentenceTransformer | |
| from scipy.spatial.distance import cosine | |
| import PyPDF2 | |
| import spacy | |
from difflib import SequenceMatcher

# Load the spaCy English pipeline exactly once. If loading raises OSError,
# download the model and retry. The original loaded the model a second time
# right after this fallback, which only repeated the work.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download

    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Stop-word set taken from the loaded pipeline's defaults.
stop_words = set(nlp.Defaults.stop_words)
| # Initialize models | |
@st.cache_resource
def _load_heavy_models():
    """Load the sentence-embedding model and the word2vec vectors once.

    Streamlit re-executes this script on every interaction; caching keeps
    the two loaded models alive across reruns instead of reloading them.
    """
    sentence_model = SentenceTransformer('all-mpnet-base-v2')
    word_vectors = api.load("word2vec-google-news-300")  # Load Word2Vec model
    return sentence_model, word_vectors


def load_models():
    """Return ``(sentence_model, tfidf_vectorizer, word2vec_model)``.

    The TF-IDF vectorizer is created fresh on each call because
    ``fit_transform`` mutates it, so it is deliberately not part of the
    shared cache.
    """
    sentence_model, word_vectors = _load_heavy_models()
    return sentence_model, TfidfVectorizer(), word_vectors


model, tfidf_vectorizer, word2vec_model = load_models()
# Seed the per-session results table the first time the script runs.
if 'results_df' not in st.session_state:
    st.session_state['results_df'] = pd.DataFrame(
        columns=[
            "LLM1",
            "LLM2",
            "Paraphrasing Similarity (%)",
            "Direct Text Comparison (%)",
            "Summarization Similarity (%)",
            "Combined Similarity (%)",
        ]
    )

# Seed the list of radar-chart entries (one dict per submitted comparison).
if 'radar_chart_data' not in st.session_state:
    st.session_state['radar_chart_data'] = []
| # Functions (same as before) | |
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` characters.

    The last slice may be shorter; an empty ``text`` yields an empty list.
    """
    starts = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in starts]
def create_embeddings(chunks):
    """Encode ``chunks`` with the module-level sentence model.

    Returns whatever ``model.encode`` returns. On any exception the error is
    shown in the Streamlit UI and an empty NumPy array is returned instead.
    """
    try:
        return model.encode(chunks, show_progress_bar=False)
    except Exception as exc:
        st.error(f"Error creating embeddings: {exc}")
    return np.array([])
def _normalize_rows(matrix):
    """Return ``matrix`` as a 2-D float array with every row scaled to unit length.

    All-zero rows are left as zeros (their norm is replaced by 1 before dividing).
    """
    rows = np.atleast_2d(np.asarray(matrix, dtype=float))
    norms = np.linalg.norm(rows, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return rows / norms


def calculate_similarity_ratio_and_find_matches(embeddings1, embeddings2):
    """Compare two sets of embeddings by cosine similarity.

    Args:
        embeddings1: Array-like of row vectors (one per chunk of the first text).
        embeddings2: Array-like of row vectors (one per chunk of the second text).

    Returns:
        ``(similarities, average_similarity)`` where ``similarities[i, j]`` is the
        cosine similarity between row ``i`` of ``embeddings1`` and row ``j`` of
        ``embeddings2``, and ``average_similarity`` is the mean, over rows of
        ``embeddings1``, of each row's best match. If either input is empty, or
        an error occurs, ``(np.array([]), 0)`` is returned.
    """
    try:
        if np.size(embeddings1) == 0 or np.size(embeddings2) == 0:
            return np.array([]), 0
        # A plain dot product only equals cosine similarity for unit-length
        # rows, so normalise explicitly rather than relying on the encoder.
        unit1 = _normalize_rows(embeddings1)
        unit2 = _normalize_rows(embeddings2)
        similarities = np.dot(unit1, unit2.T)
        max_similarities = np.max(similarities, axis=1)  # Best match for each row of embeddings1
        average_similarity = np.mean(max_similarities)
        return similarities, average_similarity
    except Exception as e:
        st.error(f"Error calculating similarity ratio: {e}")
        return np.array([]), 0
def calculate_word_similarity_ratio(text1, text2):
    """Average, over the content words of ``text1``, of the best cosine match in ``text2``.

    Content words are spaCy tokens that are neither stop words nor
    punctuation. Returns 0 when either text has no content words, and also
    (after reporting the error in the UI) when anything raises.
    """
    def content_words(doc):
        return [tok.text for tok in doc if not tok.is_stop and not tok.is_punct]

    try:
        parsed1 = nlp(text1)
        parsed2 = nlp(text2)
        words1 = content_words(parsed1)
        words2 = content_words(parsed2)
        if not (words1 and words2):
            return 0

        vectors1 = model.encode(words1)
        vectors2 = model.encode(words2)

        def best_match(vector):
            # Highest cosine similarity between this word and any word of text2.
            return max([1 - cosine(vector, other) for other in vectors2], default=0)

        best_scores = np.array([best_match(vector) for vector in vectors1])
        if best_scores.size > 0:
            return np.mean(best_scores)
        return 0
    except Exception as exc:
        st.error(f"Error calculating word similarity ratio: {exc}")
        return 0
def calculate_bleu_score(reference, candidate):
    """Return NLTK's sentence-level BLEU of ``candidate`` against ``reference``.

    Both strings are tokenised by whitespace; ``reference`` is the single
    reference translation.
    """
    from nltk.translate.bleu_score import sentence_bleu

    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu([reference_tokens], candidate_tokens)
def calculate_rouge_l_score(reference, candidate):
    """Return the ROUGE-L F-measure of ``candidate`` vs ``reference``, scaled by 100.

    A new stemming ``RougeScorer`` is built on every call.
    """
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l = rouge.score(reference, candidate)['rougeL']
    return rouge_l.fmeasure * 100
def calculate_bertscore(reference, candidate):
    """Return the mean BERTScore F1 of ``candidate`` vs ``reference``, scaled by 100.

    Scoring uses the ``bert-base-uncased`` model; precision and recall are
    computed by the library but discarded here.
    """
    import bert_score

    _precision, _recall, f1 = bert_score.score(
        [candidate], [reference], model_type='bert-base-uncased'
    )
    return f1.mean().item() * 100
def calculate_wmd(reference, candidate):
    """Return the Word Mover's Distance between two texts via the word2vec model.

    Each text is lower-cased and parsed with spaCy; stop words and punctuation
    are dropped before the distance is computed.
    """
    def content_tokens(text):
        parsed = nlp(text.lower())
        return [tok.text for tok in parsed if not tok.is_stop and not tok.is_punct]

    reference_tokens = content_tokens(reference)
    candidate_tokens = content_tokens(candidate)
    return word2vec_model.wmdistance(reference_tokens, candidate_tokens)
def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of ``pdf_file``.

    Pages are joined with no separator. On any exception the error is shown
    in the Streamlit UI and an empty string is returned.
    """
    try:
        pages = PyPDF2.PdfReader(pdf_file).pages
        return "".join(page.extract_text() for page in pages)
    except Exception as exc:
        st.error(f"Error extracting text from PDF: {exc}")
        return ""
def calculate_levenshtein_ratio(text1, text2):
    """Return ``difflib.SequenceMatcher``'s similarity ratio (0.0-1.0) for two strings.

    Despite the name, this is the matching-blocks ratio computed by
    ``SequenceMatcher``; no junk filter is supplied.
    """
    matcher = SequenceMatcher(isjunk=None, a=text1, b=text2)
    return matcher.ratio()
def calculate_jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the two texts' vocabularies.

    A binary ``CountVectorizer`` marks which terms occur in each text; the
    result is shared terms divided by all terms, or 0 when there are none.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    presence = CountVectorizer(binary=True).fit_transform([text1, text2]).toarray()
    first, second = presence[0], presence[1]
    shared = np.sum(np.minimum(first, second))
    combined = np.sum(np.maximum(first, second))
    if combined == 0:
        return 0
    return shared / combined
def calculate_tfidf_cosine_similarity(text1, text2):
    """Return the cosine similarity of the two texts' TF-IDF vectors.

    The module-level vectorizer is re-fitted on just these two texts.
    """
    matrix = tfidf_vectorizer.fit_transform([text1, text2])
    distances = pairwise_distances(matrix, metric='cosine')
    return 1 - distances[0, 1]
def calculate_paraphrasing_similarity(text1, text2):
    """Return the chunk-embedding similarity of two texts, scaled by 100.

    Both texts are chunked and embedded, and the average best-match
    similarity is multiplied by 100. Returns 0 when either embedding array is
    empty, and also (after reporting the error in the UI) on any exception.
    """
    try:
        first_chunks = chunk_text(text1)
        second_chunks = chunk_text(text2)
        first_vectors = create_embeddings(first_chunks)
        second_vectors = create_embeddings(second_chunks)
        if first_vectors.size <= 0 or second_vectors.size <= 0:
            return 0
        _matrix, mean_best_match = calculate_similarity_ratio_and_find_matches(
            first_vectors, second_vectors
        )
        return mean_best_match * 100
    except Exception as exc:
        st.error(f"Error calculating paraphrasing similarity: {exc}")
        return 0
def calculate_direct_text_comparison_similarity(text1, text2):
    """Return a weighted blend of six surface/semantic metrics on a 0-100 scale.

    The weights (0.1 for the sequence ratio, 0.2 for each of the other five)
    add up to 1.1, hence the final division. Returns 0 after reporting the
    error in the UI if any metric raises.
    """
    try:
        # (score on a 0-100 scale, weight), evaluated in this order.
        weighted_scores = (
            (calculate_levenshtein_ratio(text1, text2) * 100, 0.1),
            (calculate_jaccard_similarity(text1, text2) * 100, 0.2),
            (calculate_tfidf_cosine_similarity(text1, text2) * 100, 0.2),
            (calculate_bleu_score(text1, text2) * 100, 0.2),
            (calculate_rouge_l_score(text1, text2), 0.2),
            (calculate_bertscore(text1, text2), 0.2),
        )
        blended = sum(score * weight for score, weight in weighted_scores)
        return blended / 1.1
    except Exception as exc:
        st.error(f"Error calculating direct text comparison similarity: {exc}")
        return 0
def calculate_summarization_similarity(text1, text2):
    """Turn the Word Mover's Distance between two texts into a percentage-style score.

    Computes ``(1 - distance) * 100``; the result is negative whenever the
    distance exceeds 1. Returns 0 after reporting the error in the UI if the
    distance computation raises.
    """
    try:
        distance = calculate_wmd(text1, text2)
    except Exception as exc:
        st.error(f"Error calculating summarization similarity: {exc}")
        return 0
    return (1 - distance) * 100
# Streamlit UI
# NOTE(review): the source this was recovered from had lost its indentation;
# the nesting below is reconstructed - confirm against the deployed app.
st.title("Text-Based Similarity Comparison")
st.markdown("*Use in wide mode*")

# Page layout: inputs and verdict on the left, charts on the right.
col1, col2 = st.columns([2, 1])

with col1:
    st.sidebar.title("LLM Details")
    llm1_name = st.sidebar.text_input("What is LLM1?", "LLM1")
    llm2_name = st.sidebar.text_input("What is LLM2?", "LLM2")

    st.write("## Input")

    # One input column per response; an uploaded PDF replaces the text area.
    input_col1, input_col2 = st.columns(2)
    with input_col1:
        st.write(f"{llm1_name} response")
        upload_pdf_1 = st.file_uploader(f"Upload PDF for {llm1_name} response", type="pdf", key="pdf1")
        if upload_pdf_1:
            text_input_1 = extract_pdf_text(upload_pdf_1)
        else:
            text_input_1 = st.text_area(f"Text for {llm1_name}", height=150, key="text1")
    with input_col2:
        st.write(f"{llm2_name} response")
        upload_pdf_2 = st.file_uploader(f"Upload PDF for {llm2_name} response", type="pdf", key="pdf2")
        if upload_pdf_2:
            text_input_2 = extract_pdf_text(upload_pdf_2)
        else:
            text_input_2 = st.text_area(f"Text for {llm2_name}", height=150, key="text2")

    # Only offer "Submit" when both sides have real text. Previously two
    # uploaded PDFs were enough even if extraction produced nothing, which
    # sent empty strings into every metric.
    has_text_1 = bool(text_input_1 and text_input_1.strip())
    has_text_2 = bool(text_input_2 and text_input_2.strip())
    if (upload_pdf_1 and not has_text_1) or (upload_pdf_2 and not has_text_2):
        st.warning("No text could be extracted from an uploaded PDF. Remove it and paste the text instead.")

    if has_text_1 and has_text_2 and st.button("Submit"):
        # Calculate similarity metrics
        paraphrasing_similarity = calculate_paraphrasing_similarity(text_input_1, text_input_2)
        direct_text_comparison_similarity = calculate_direct_text_comparison_similarity(text_input_1, text_input_2)
        summarization_similarity = calculate_summarization_similarity(text_input_1, text_input_2)

        # Floor every metric at 0 so the table, radar chart and sliders all
        # see non-negative values (paraphrasing used to be left unclamped).
        paraphrasing_similarity = max(paraphrasing_similarity, 0)
        direct_text_comparison_similarity = max(direct_text_comparison_similarity, 0)
        summarization_similarity = max(summarization_similarity, 0)

        # Combine all metrics into a single similarity score
        total_similarity = (paraphrasing_similarity * 0.6 +          # High weight
                            direct_text_comparison_similarity * 0.3 +  # Moderate weight
                            summarization_similarity * 0.1)            # Low weight

        # Update results table in session state
        new_row = pd.Series({
            "LLM1": llm1_name,
            "LLM2": llm2_name,
            "Paraphrasing Similarity (%)": paraphrasing_similarity,
            "Direct Text Comparison (%)": direct_text_comparison_similarity,
            "Summarization Similarity (%)": summarization_similarity,
            "Combined Similarity (%)": total_similarity,
        })
        st.session_state.results_df = pd.concat(
            [st.session_state.results_df, new_row.to_frame().T], ignore_index=True
        )

        # Add new data for radar chart
        st.session_state.radar_chart_data.append({
            "name": f"{llm1_name} vs {llm2_name}",
            "paraphrasing_similarity": paraphrasing_similarity,
            "direct_text_comparison_similarity": direct_text_comparison_similarity,
            "summarization_similarity": summarization_similarity,
        })

        # The three banner classes share every rule except the text colour,
        # so the style blocks are generated from one rule set.
        shared_box_rules = (
            "font-size: 48px; font-weight: bold; background-color: #f0f0f5; "
            "padding: 20px; border-radius: 15px; text-align: center; "
            "margin-top: 30px; box-shadow: 2px 2px 12px rgba(0, 0, 0, 0.1);"
        )
        for css_class, text_color in (
            ("combined-score", "#4CAF50"),  # green
            ("good", "#4CAF50"),            # green
            ("bad", "#FF0000"),             # red
        ):
            st.markdown(
                f"<style>.{css_class} {{ color: {text_color}; {shared_box_rules} }}</style>",
                unsafe_allow_html=True,
            )

        # Display the combined similarity score
        st.markdown(
            f'<div class="combined-score">Combined Similarity Score: {total_similarity:.2f}%</div>',
            unsafe_allow_html=True,
        )

        # Gap between the context (embedding) score and the word-level score.
        context_words_diff = int(paraphrasing_similarity) - int(direct_text_comparison_similarity)

        # Same outcome as the original branch tree: "Response 2 is better"
        # needs a combined score in [55, 100) and a gap above 35 that is not
        # inside the [42, 57.08) band; everything else is "Similar Responses".
        response_2_is_better = (
            55 <= total_similarity < 100
            and context_words_diff > 35
            and not (42 <= context_words_diff < 57.08)
        )
        if response_2_is_better:
            st.markdown('<div class="good">Response 2 is better.</div>', unsafe_allow_html=True)
        else:
            st.markdown('<div class="bad">Similar Responses</div>', unsafe_allow_html=True)
with col2:
    # NOTE(review): nesting reconstructed from a source that had lost its
    # indentation - confirm against the deployed app.
    if st.session_state.radar_chart_data:
        # Radar chart: one polygon per submitted comparison.
        st.subheader("Metrics Comparison")
        st.markdown("*Larger area = More similarity of responses.*")
        labels = ["Context similarity", "Words Similarity", "Summarization Similarity"]
        num_vars = len(labels)
        angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
        angles += angles[:1]  # repeat the first angle to close the polygon

        fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

        # Plot each response with a different color
        color_palette = sns.color_palette("husl", len(st.session_state.radar_chart_data))
        for idx, data in enumerate(st.session_state.radar_chart_data):
            values = [
                data["paraphrasing_similarity"],
                data["direct_text_comparison_similarity"],
                data["summarization_similarity"],
            ]
            values += values[:1]
            ax.fill(angles, values, color=color_palette[idx], alpha=0.25, label=data["name"])
            ax.plot(angles, values, color=color_palette[idx], linewidth=2, linestyle='solid')

        ax.set_yticklabels([])
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels)
        # Use the axes object rather than pyplot's implicit "current" figure.
        ax.set_title("Radar Chart of Similarity Metrics")
        ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
        st.pyplot(fig)
        # The script reruns on every interaction; close the figure so pyplot
        # does not keep accumulating open figures.
        plt.close(fig)

        # Read-only sliders for the most recent comparison.
        st.subheader("Similarity Factors")
        st.markdown("*100 being the best case*")
        slider_labels = {
            "paraphrasing_similarity": "Context",
            "direct_text_comparison_similarity": "Words",
            "summarization_similarity": "Summary",
        }
        metrics = st.session_state.radar_chart_data[-1]
        for metric_name, slider_label in slider_labels.items():
            # Keep the value inside the slider's 0-100 bounds and finite, so
            # an out-of-range metric cannot break the widget.
            slider_value = int(np.clip(np.nan_to_num(float(metrics[metric_name])), 0, 100))
            st.slider(
                slider_label,
                0, 100,
                slider_value,
                key=metric_name,
                disabled=True,  # Make the slider non-editable
                format="%.0f",  # Display the value without decimals
            )
def _reset_results():
    """Clear the stored results table and radar-chart data.

    Used as the button's ``on_click`` callback so the state is already empty
    when the script reruns; resetting inline left the just-rendered table and
    chart on screen until the next interaction.
    """
    st.session_state.results_df = pd.DataFrame(columns=[
        "LLM1", "LLM2",
        "Paraphrasing Similarity (%)",
        "Direct Text Comparison (%)",
        "Summarization Similarity (%)",
        "Combined Similarity (%)",
    ])
    st.session_state.radar_chart_data = []
    st.session_state["results_were_reset"] = True


# Two-column layout: results table on the left, action buttons on the right.
# NOTE(review): nesting reconstructed from a source that had lost its
# indentation - confirm against the deployed app.
results_col, actions_col = st.columns([2, 1])

with results_col:
    st.write("## Detailed Results Table")
    if not st.session_state.results_df.empty:
        st.write(st.session_state.results_df)
        # Download the results as a CSV file
        csv_data = st.session_state.results_df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download Results as CSV",
            data=csv_data,
            file_name='similarity_results.csv',
            mime='text/csv',
        )

with actions_col:
    st.button("Reset Table", on_click=_reset_results)
    # Show the confirmation once, on the rerun that follows the reset.
    if st.session_state.get("results_were_reset", False):
        st.session_state["results_were_reset"] = False
        st.write("Results table has been reset.")
# Add an "About" button in the sidebar.
# The metric descriptions are worded to match what the code computes: the
# "Levenshtein" ratio comes from difflib.SequenceMatcher, the paraphrasing
# score averages best matches, and no summarisation step is performed.
if st.sidebar.button("About"):
    st.sidebar.markdown("""
    ### About This App
    This app compares text similarity between different responses from Language Models (LLMs).
    It calculates various similarity metrics and provides a comprehensive comparison using a radar chart.

    **Features:**
    - Upload or input text for comparison.
    - Calculate and display multiple similarity metrics.
    - Visualize the results using a radar chart.
    - Download the results as a CSV file.

    **Similarity Metrics:**
    1. **Paraphrasing Similarity**:
       - Compares chunks of text from both LLM responses using embeddings generated by a pre-trained model.
       - For each chunk of the first response, takes its best-matching chunk in the second response and averages those similarities.
    2. **Direct Text Comparison**:
       - Uses a combination of metrics:
         - **Levenshtein Ratio**: Character-level sequence-matching ratio (computed with Python's `difflib.SequenceMatcher`).
         - **Jaccard Similarity**: Compares the overlap of unique words.
         - **TF-IDF Cosine Similarity**: Compares the text using TF-IDF vectorization.
         - **BLEU Score**: Evaluates the overlap of n-grams.
         - **ROUGE-L Score**: Measures the longest matching sequence of words.
         - **BERTScore**: Uses BERT embeddings to compare sentence similarity.
    3. **Summarization Similarity**:
       - Uses the Word Mover's Distance (WMD) between the two texts after removing stop words and punctuation.
    4. **Combined Similarity**:
       - A weighted average of the above metrics to provide an overall similarity score.

    **Developed with:**
    - Streamlit
    - Sentence Transformers
    - SpaCy
    - Scikit-learn
    - NLTK
    - Gensim
    """)