# Spaces: "Runtime error" — Hugging Face Spaces status banner captured along
# with the source during extraction; not part of the application code.
import faiss
import numpy as np
import PyPDF2
import streamlit as st
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
| # Initialize model and FAISS index | |
| model = SentenceTransformer('all-mpnet-base-v2') | |
| vector_store_1 = faiss.IndexFlatL2(768) | |
| vector_store_2 = faiss.IndexFlatL2(768) | |
| # Function to extract text from PDF | |
| def extract_pdf_text(pdf_file): | |
| reader = PyPDF2.PdfReader(pdf_file) | |
| text = "" | |
| for page in reader.pages: | |
| text += page.extract_text() | |
| return text | |
| # Function to chunk text into smaller parts | |
| def chunk_text(text, chunk_size=500): | |
| return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] | |
| # Function to create embeddings and add to FAISS index | |
| def create_embeddings(chunks, vector_store): | |
| embeddings = model.encode(chunks) | |
| vector_store.add(embeddings) | |
| return embeddings | |
| # Function to calculate similarity ratio and find matches | |
| def calculate_similarity_ratio_and_find_matches(chunks1, chunks2, embeddings1, embeddings2): | |
| similarities = [] | |
| for i, emb1 in enumerate(embeddings1): | |
| # Find the most similar chunk in the second document | |
| best_similarity = 1 - min([cosine(emb1, emb2) for emb2 in embeddings2]) | |
| similarities.append(best_similarity) | |
| average_similarity = sum(similarities) / len(similarities) | |
| return average_similarity | |
| # Function to calculate word similarity ratio | |
| def calculate_word_similarity_ratio(text1, text2): | |
| words1 = text1.split() | |
| words2 = text2.split() | |
| # Generate embeddings for words | |
| word_embeddings1 = model.encode(words1) | |
| word_embeddings2 = model.encode(words2) | |
| # Calculate word similarities | |
| similarities = [] | |
| for emb1 in word_embeddings1: | |
| similarities.append(max([1 - cosine(emb1, emb2) for emb2 in word_embeddings2], default=0)) | |
| average_word_similarity = np.mean(similarities) | |
| return average_word_similarity | |
| # Streamlit UI | |
| st.title("RAGBot: PDF-Based Context Similarity Comparison") | |
| st.header("Analytics") | |
| # Input 1 | |
| st.subheader("Input 1") | |
| text_input_1 = st.text_area("Enter text for Input 1 (optional):") | |
| uploaded_file_1 = st.file_uploader("Upload a PDF for Input 1", type="pdf", key="file1") | |
| # Input 2 | |
| st.subheader("Input 2") | |
| text_input_2 = st.text_area("Enter text for Input 2 (optional):") | |
| uploaded_file_2 = st.file_uploader("Upload a PDF for Input 2", type="pdf", key="file2") | |
| # Submit button | |
| if st.button("Submit"): | |
| if (text_input_1 or uploaded_file_1) and (text_input_2 or uploaded_file_2): | |
| # Process Input 1 | |
| if uploaded_file_1: | |
| pdf_text_1 = extract_pdf_text(uploaded_file_1) | |
| else: | |
| pdf_text_1 = "" | |
| combined_text_1 = text_input_1 + " " + pdf_text_1 | |
| chunks_1 = chunk_text(combined_text_1) | |
| embeddings_1 = create_embeddings(chunks_1, vector_store_1) | |
| # Process Input 2 | |
| if uploaded_file_2: | |
| pdf_text_2 = extract_pdf_text(uploaded_file_2) | |
| else: | |
| pdf_text_2 = "" | |
| combined_text_2 = text_input_2 + " " + pdf_text_2 | |
| chunks_2 = chunk_text(combined_text_2) | |
| embeddings_2 = create_embeddings(chunks_2, vector_store_2) | |
| # Calculate and display similarity ratio | |
| similarity_ratio = calculate_similarity_ratio_and_find_matches(chunks_1, chunks_2, embeddings_1, embeddings_2) | |
| st.write(f"### **Context Comparison:** {similarity_ratio * 100:.2f}%") | |
| # Calculate and display word similarity ratio | |
| st.write("### **Word to Word Similarity:**") | |
| for i, (text1, text2) in enumerate(zip(chunks_1, chunks_2)): | |
| word_similarity_ratio = calculate_word_similarity_ratio(text1, text2) | |
| st.write(f"**Chunk {i+1}:** Word Similarity Ratio: {word_similarity_ratio * 100:.2f}%") | |
| else: | |
| st.warning("Please provide at least one input for each document (text or PDF).") | |