shreejan4603 committed on
Commit
a823313
·
verified ·
1 Parent(s): f35c776

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ import faiss
4
+ from sentence_transformers import SentenceTransformer
5
+ from scipy.spatial.distance import cosine
6
+ import numpy as np
7
+
8
+ # Initialize model and FAISS index
9
@st.cache_resource
def _load_model():
    """Load the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes the script on each widget interaction; without
    caching, the model would be constructed again on every rerun.
    """
    return SentenceTransformer('all-mpnet-base-v2')


model = _load_model()

# Size the indexes from the model itself rather than hard-coding 768, so the
# index dimension cannot drift out of sync if the model name is changed.
_EMBEDDING_DIM = model.get_sentence_embedding_dimension()
vector_store_1 = faiss.IndexFlatL2(_EMBEDDING_DIM)
vector_store_2 = faiss.IndexFlatL2(_EMBEDDING_DIM)
12
+
13
+ # Function to extract text from PDF
14
def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: The PDF source handed to ``PyPDF2.PdfReader`` (in this
            script, the object returned by ``st.file_uploader``).

    Returns:
        The text of all pages joined in page order with no separator.
        Pages that yield no text contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() may yield None/"" for pages without a text layer
    # (e.g. scanned images); ``or ""`` keeps the join from raising TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)
20
+
21
+ # Function to chunk text into smaller parts
22
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive fixed-width character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (default 500).

    Returns:
        A list of substrings covering *text* in order. Every chunk has
        exactly ``chunk_size`` characters except possibly the last one.
        An empty *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. (Previously a
            negative size silently returned ``[]``, dropping all text.)
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
24
+
25
+ # Function to create embeddings and add to FAISS index
26
def create_embeddings(chunks, vector_store):
    """Embed *chunks* with the module-level model and add them to an index.

    Args:
        chunks: List of strings to embed.
        vector_store: Index object exposing ``add(vectors)`` and a ``d``
            (dimension) attribute, as the FAISS indexes in this file do.

    Returns:
        A 2-D float32 array with one row per chunk. When *chunks* is empty
        the result has shape ``(0, vector_store.d)`` and the index is left
        untouched.
    """
    if not chunks:
        # Nothing to encode: skip the model call and the index update.
        return np.empty((0, vector_store.d), dtype=np.float32)

    # FAISS expects a C-contiguous float32 matrix; enforce it explicitly
    # instead of relying on the encoder's default output layout.
    embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)
    vector_store.add(embeddings)
    return embeddings
30
+
31
+ # Function to calculate similarity ratio and find matches
32
def calculate_similarity_ratio_and_find_matches(chunks1, chunks2, embeddings1, embeddings2):
    """Return the mean best-match cosine similarity of set 1 against set 2.

    For every row of *embeddings1*, the highest cosine similarity to any
    row of *embeddings2* is taken; the result is the mean of those maxima.

    Args:
        chunks1: Unused; kept so existing callers keep working.
        chunks2: Unused; kept so existing callers keep working.
        embeddings1: 2-D array-like, one embedding per row.
        embeddings2: 2-D array-like, one embedding per row.

    Returns:
        The average best-match similarity as a ``float``. Returns ``0.0``
        when either embedding set is empty (the original raised
        ZeroDivisionError / ValueError in those cases).
    """
    left = np.asarray(embeddings1, dtype=np.float64)
    right = np.asarray(embeddings2, dtype=np.float64)
    if left.size == 0 or right.size == 0:
        return 0.0

    # Scale rows to unit length so a plain dot product is cosine similarity.
    # Zero-length rows are left as zeros, giving similarity 0 instead of NaN.
    left_norms = np.linalg.norm(left, axis=1, keepdims=True)
    right_norms = np.linalg.norm(right, axis=1, keepdims=True)
    left_unit = np.divide(left, left_norms, out=np.zeros_like(left), where=left_norms > 0)
    right_unit = np.divide(right, right_norms, out=np.zeros_like(right), where=right_norms > 0)

    # One matrix product replaces the nested Python loop of pairwise calls.
    best_per_row = (left_unit @ right_unit.T).max(axis=1)
    return float(best_per_row.mean())
42
+
43
+ # Function to calculate word similarity ratio
44
def calculate_word_similarity_ratio(text1, text2):
    """Return the mean best-match word similarity of *text1* against *text2*.

    Both texts are split on whitespace and each word is embedded with the
    module-level model. For every word of *text1*, the highest cosine
    similarity to any word of *text2* is taken; the result is their mean.

    Args:
        text1: Text whose words are matched.
        text2: Text whose words are the match candidates.

    Returns:
        The average best-match similarity as a ``float``. Returns ``0.0``
        when either text contains no words (the original produced NaN for
        an empty *text1* and passed empty lists to the encoder).
    """
    words1 = text1.split()
    words2 = text2.split()
    if not words1 or not words2:
        return 0.0

    vectors1 = np.asarray(model.encode(words1), dtype=np.float64)
    vectors2 = np.asarray(model.encode(words2), dtype=np.float64)

    # Unit-normalise rows so the dot product equals cosine similarity; the
    # tiny floor avoids dividing by zero for an all-zero embedding.
    floor = np.finfo(np.float64).tiny
    unit1 = vectors1 / np.maximum(np.linalg.norm(vectors1, axis=1, keepdims=True), floor)
    unit2 = vectors2 / np.maximum(np.linalg.norm(vectors2, axis=1, keepdims=True), floor)

    # Single matrix product instead of len(words1) * len(words2) scipy calls.
    return float((unit1 @ unit2.T).max(axis=1).mean())
59
+
60
+ # Streamlit UI
61
st.title("RAGBot: PDF-Based Context Similarity Comparison")

st.header("Analytics")

# Input 1
st.subheader("Input 1")
text_input_1 = st.text_area("Enter text for Input 1 (optional):")
uploaded_file_1 = st.file_uploader("Upload a PDF for Input 1", type="pdf", key="file1")

# Input 2
st.subheader("Input 2")
text_input_2 = st.text_area("Enter text for Input 2 (optional):")
uploaded_file_2 = st.file_uploader("Upload a PDF for Input 2", type="pdf", key="file2")


def _combined_input_text(typed_text, uploaded_pdf):
    """Join the typed text with the optional PDF's text, trimmed of whitespace."""
    pdf_text = extract_pdf_text(uploaded_pdf) if uploaded_pdf else ""
    return f"{typed_text} {pdf_text}".strip()


# Submit button
if st.button("Submit"):
    # Whitespace-only text does not count as a provided input.
    has_input_1 = bool(text_input_1.strip() or uploaded_file_1)
    has_input_2 = bool(text_input_2.strip() or uploaded_file_2)

    if not (has_input_1 and has_input_2):
        st.warning("Please provide at least one input for each document (text or PDF).")
    else:
        combined_text_1 = _combined_input_text(text_input_1, uploaded_file_1)
        combined_text_2 = _combined_input_text(text_input_2, uploaded_file_2)

        if not (combined_text_1 and combined_text_2):
            # A PDF was uploaded but produced no text (e.g. a scanned
            # document); comparing empty content would be meaningless.
            st.warning("No text could be extracted from one of the inputs.")
        else:
            # Chunk and embed each document into its own index.
            chunks_1 = chunk_text(combined_text_1)
            embeddings_1 = create_embeddings(chunks_1, vector_store_1)

            chunks_2 = chunk_text(combined_text_2)
            embeddings_2 = create_embeddings(chunks_2, vector_store_2)

            # Calculate and display similarity ratio
            similarity_ratio = calculate_similarity_ratio_and_find_matches(
                chunks_1, chunks_2, embeddings_1, embeddings_2
            )
            st.write(f"### **Context Comparison:** {similarity_ratio * 100:.2f}%")

            # Calculate and display word similarity ratio. zip() pairs
            # chunks by position and stops at the shorter document.
            st.write("### **Word to Word Similarity:**")
            for i, (text1, text2) in enumerate(zip(chunks_1, chunks_2)):
                word_similarity_ratio = calculate_word_similarity_ratio(text1, text2)
                st.write(f"**Chunk {i+1}:** Word Similarity Ratio: {word_similarity_ratio * 100:.2f}%")