import streamlit as st
import PyPDF2
import faiss
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np

# Sentence-transformer model producing 768-dim embeddings; one FAISS L2
# index per input document (the indexes are populated but never searched
# in the current flow — kept for parity with the original design).
model = SentenceTransformer('all-mpnet-base-v2')
vector_store_1 = faiss.IndexFlatL2(768)
vector_store_2 = faiss.IndexFlatL2(768)


def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of *pdf_file*.

    ``page.extract_text()`` may return ``None`` for pages with no
    extractable text (e.g. scanned images); treat those as empty so the
    concatenation never raises ``TypeError``.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # BUGFIX: extract_text() can return None for image-only pages.
        text += page.extract_text() or ""
    return text


def chunk_text(text, chunk_size=500):
    """Split *text* into fixed-size character chunks of length *chunk_size*."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def create_embeddings(chunks, vector_store):
    """Encode *chunks* with the sentence model, add them to *vector_store*,
    and return the embedding matrix."""
    embeddings = model.encode(chunks)
    vector_store.add(embeddings)
    return embeddings


def calculate_similarity_ratio_and_find_matches(chunks1, chunks2, embeddings1, embeddings2):
    """Average, over each chunk of document 1, of its best cosine similarity
    to any chunk of document 2.

    *chunks1*/*chunks2* are accepted for interface compatibility but only
    the embeddings are used. Returns 0.0 when either embedding set is
    empty (the original raised ``ValueError`` from ``min()`` on an empty
    sequence or ``ZeroDivisionError`` from the average).
    """
    if len(embeddings1) == 0 or len(embeddings2) == 0:
        return 0.0
    similarities = []
    for emb1 in embeddings1:
        # Best match for this chunk anywhere in the second document.
        best_similarity = 1 - min(cosine(emb1, emb2) for emb2 in embeddings2)
        similarities.append(best_similarity)
    return sum(similarities) / len(similarities)


def calculate_word_similarity_ratio(text1, text2):
    """Average, over words of *text1*, of the best cosine similarity to any
    word of *text2*.

    Returns 0.0 when either text has no words (the original produced
    ``nan`` via ``np.mean`` of an empty list, or 0-similarities via the
    ``max(..., default=0)`` fallback).
    """
    words1 = text1.split()
    words2 = text2.split()
    if not words1 or not words2:
        return 0.0
    word_embeddings1 = model.encode(words1)
    word_embeddings2 = model.encode(words2)
    similarities = []
    for emb1 in word_embeddings1:
        similarities.append(
            max((1 - cosine(emb1, emb2) for emb2 in word_embeddings2), default=0)
        )
    return np.mean(similarities)


# --- Streamlit UI -----------------------------------------------------------
st.title("RAGBot: PDF-Based Context Similarity Comparison")
st.header("Analytics")

# Input 1
st.subheader("Input 1")
text_input_1 = st.text_area("Enter text for Input 1 (optional):")
uploaded_file_1 = st.file_uploader("Upload a PDF for Input 1", type="pdf", key="file1")

# Input 2
st.subheader("Input 2")
text_input_2 = st.text_area("Enter text for Input 2 (optional):")
uploaded_file_2 = st.file_uploader("Upload a PDF for Input 2", type="pdf", key="file2")


def _prepare_input(text_input, uploaded_file, vector_store):
    """Combine typed text with any uploaded PDF's text, chunk it, and embed
    the chunks into *vector_store*.

    Returns ``(chunks, embeddings)``. Factored out so both inputs share a
    single code path instead of two duplicated blocks.
    """
    pdf_text = extract_pdf_text(uploaded_file) if uploaded_file else ""
    combined_text = text_input + " " + pdf_text
    chunks = chunk_text(combined_text)
    embeddings = create_embeddings(chunks, vector_store)
    return chunks, embeddings


# Submit button
if st.button("Submit"):
    if (text_input_1 or uploaded_file_1) and (text_input_2 or uploaded_file_2):
        chunks_1, embeddings_1 = _prepare_input(text_input_1, uploaded_file_1, vector_store_1)
        chunks_2, embeddings_2 = _prepare_input(text_input_2, uploaded_file_2, vector_store_2)

        # Document-level (context) similarity.
        similarity_ratio = calculate_similarity_ratio_and_find_matches(
            chunks_1, chunks_2, embeddings_1, embeddings_2
        )
        st.write(f"### **Context Comparison:** {similarity_ratio * 100:.2f}%")

        # Word-level similarity, reported per aligned chunk pair.
        # NOTE(review): zip() truncates to the shorter document's chunk
        # count, so trailing chunks of the longer document are unscored.
        st.write("### **Word to Word Similarity:**")
        for i, (text1, text2) in enumerate(zip(chunks_1, chunks_2)):
            word_similarity_ratio = calculate_word_similarity_ratio(text1, text2)
            st.write(f"**Chunk {i+1}:** Word Similarity Ratio: {word_similarity_ratio * 100:.2f}%")
    else:
        st.warning("Please provide at least one input for each document (text or PDF).")