# app.py — RAGBot Streamlit app
# Uploaded by shreejan4603 ("Create app.py", commit a823313, verified)
import streamlit as st
import PyPDF2
import faiss
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np
# Initialize model and FAISS index
# Shared sentence-transformer used for every chunk/word encoding below.
# 'all-mpnet-base-v2' emits 768-dimensional vectors, which must match the
# dimensionality given to the FAISS indexes.
model = SentenceTransformer('all-mpnet-base-v2')
# One flat (exact, L2-distance) index per input document.
# NOTE(review): the indexes are filled on submit but never searched or
# reset, so repeated submits keep appending vectors — confirm intent.
vector_store_1 = faiss.IndexFlatL2(768)
vector_store_2 = faiss.IndexFlatL2(768)
# Function to extract text from PDF
def extract_pdf_text(pdf_file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_file: A path or binary file-like object accepted by
            PyPDF2.PdfReader (e.g. a Streamlit UploadedFile).

    Returns:
        str: Text of all pages joined together. Pages with no
        extractable text (e.g. scanned images) contribute "".
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() may return None for image-only pages; the original
    # `text += page.extract_text()` raised TypeError in that case.
    # "".join also avoids quadratic string concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to chunk text into smaller parts
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The final piece may be shorter; an empty string yields an empty list.
    """
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
# Function to create embeddings and add to FAISS index
def create_embeddings(chunks, vector_store):
    """Encode *chunks* with the shared sentence-transformer model and
    register the resulting vectors in *vector_store*.

    Returns the embeddings so callers can reuse them without re-encoding.
    """
    vectors = model.encode(chunks)
    vector_store.add(vectors)
    return vectors
# Function to calculate similarity ratio and find matches
def calculate_similarity_ratio_and_find_matches(chunks1, chunks2, embeddings1, embeddings2):
    """Average, over each embedding of document 1, its best cosine
    similarity against any embedding of document 2.

    Args:
        chunks1, chunks2: Text chunks (kept for interface compatibility;
            not used in the computation).
        embeddings1, embeddings2: 2-D arrays (or lists of vectors) of
            chunk embeddings for the two documents.

    Returns:
        float: Mean best-match cosine similarity; 0.0 when either
        document has no embeddings (the original raised ValueError /
        ZeroDivisionError on empty input).
    """
    a = np.asarray(embeddings1, dtype=float)
    b = np.asarray(embeddings2, dtype=float)
    # Guard the empty cases: min() over an empty sequence and a /0 mean.
    if a.size == 0 or b.size == 0:
        return 0.0
    # Vectorized cosine similarity (1 - cosine distance == normalized dot
    # product): normalize rows once, then one matmul replaces the O(n*m)
    # Python loop of per-pair scipy cosine() calls.
    a_unit = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_unit = b / np.linalg.norm(b, axis=1, keepdims=True)
    best_per_chunk = (a_unit @ b_unit.T).max(axis=1)
    return float(best_per_chunk.mean())
# Function to calculate word similarity ratio
def calculate_word_similarity_ratio(text1, text2):
    """Average, over every word of *text1*, its best embedding cosine
    similarity against any word of *text2*.

    Words are whitespace-delimited tokens, each encoded individually with
    the shared sentence-transformer model.

    Returns:
        float: Mean best-match similarity, or 0.0 when either text has no
        words (the original produced nan via np.mean([]) and could call
        model.encode([]) on empty input).
    """
    words1 = text1.split()
    words2 = text2.split()
    # Short-circuit empty inputs instead of propagating nan.
    if not words1 or not words2:
        return 0.0
    word_embeddings1 = model.encode(words1)
    word_embeddings2 = model.encode(words2)
    # For each word of text1, keep only its best match in text2.
    similarities = [
        max(1 - cosine(emb1, emb2) for emb2 in word_embeddings2)
        for emb1 in word_embeddings1
    ]
    return float(np.mean(similarities))
# Streamlit UI
st.title("RAGBot: PDF-Based Context Similarity Comparison")
st.header("Analytics")
# Input 1: free text and/or a PDF upload (both optional individually).
st.subheader("Input 1")
text_input_1 = st.text_area("Enter text for Input 1 (optional):")  # "" when left blank
uploaded_file_1 = st.file_uploader("Upload a PDF for Input 1", type="pdf", key="file1")  # None when absent
# Input 2: same widget pair, distinguished by the widget key.
st.subheader("Input 2")
text_input_2 = st.text_area("Enter text for Input 2 (optional):")
uploaded_file_2 = st.file_uploader("Upload a PDF for Input 2", type="pdf", key="file2")
# Submit button
if st.button("Submit"):
    # Require at least one source (typed text or PDF) per document.
    if (text_input_1 or uploaded_file_1) and (text_input_2 or uploaded_file_2):
        # Document 1: typed text plus any PDF text, space-separated,
        # then chunked and embedded into its FAISS index.
        pdf_text_1 = extract_pdf_text(uploaded_file_1) if uploaded_file_1 else ""
        combined_text_1 = text_input_1 + " " + pdf_text_1
        chunks_1 = chunk_text(combined_text_1)
        embeddings_1 = create_embeddings(chunks_1, vector_store_1)

        # Document 2: same pipeline against the second index.
        pdf_text_2 = extract_pdf_text(uploaded_file_2) if uploaded_file_2 else ""
        combined_text_2 = text_input_2 + " " + pdf_text_2
        chunks_2 = chunk_text(combined_text_2)
        embeddings_2 = create_embeddings(chunks_2, vector_store_2)

        # Document-level similarity score.
        similarity_ratio = calculate_similarity_ratio_and_find_matches(chunks_1, chunks_2, embeddings_1, embeddings_2)
        st.write(f"### **Context Comparison:** {similarity_ratio * 100:.2f}%")

        # Chunk-by-chunk word similarity (zip stops at the shorter document).
        st.write("### **Word to Word Similarity:**")
        for i, (text1, text2) in enumerate(zip(chunks_1, chunks_2)):
            word_similarity_ratio = calculate_word_similarity_ratio(text1, text2)
            st.write(f"**Chunk {i+1}:** Word Similarity Ratio: {word_similarity_ratio * 100:.2f}%")
    else:
        st.warning("Please provide at least one input for each document (text or PDF).")