import json
import os

import faiss
import fitz  # PyMuPDF
import google.generativeai as genai
import nltk
import numpy as np
import streamlit as st
from nltk.tokenize import sent_tokenize, word_tokenize
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, OperationFailure

# Download the tokenizer models used by word_tokenize/sent_tokenize
# ('punkt_tab' on newer NLTK releases, 'punkt' on older ones).
nltk.download('punkt_tab')
nltk.download('punkt')


# Gemini setup: AI_API_KEY must be set in the environment,
# otherwise this line raises KeyError at startup.
genai.configure(api_key=os.environ["AI_API_KEY"])
gemini_model = genai.GenerativeModel('gemini-1.5-flash')

# Function to extract text from the uploaded PDF using PyMuPDF (fitz)
def extract_text_from_pdf(pdf_file):
    try:
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return None

# Function to split text into overlapping chunks using NLTK tokenization
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    try:
        words = word_tokenize(text)
        chunks = []
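        # Advance by (chunk_size - overlap) words per step so consecutive
        # chunks share `overlap` words of context (a 400-word stride by default).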
        for i in range(0, len(words), chunk_size - overlap):
            chunk = " ".join(words[i:i + chunk_size])
            chunks.append(chunk)
        return chunks
    except Exception as e:
        st.error(f"Error splitting text into chunks: {e}")
        return []

# Function to generate embeddings for a list of text chunks
def generate_embeddings(chunks, title="PDF Document"):
    embeddings = []
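    # task_type="retrieval_document" marks these chunks as corpus documents;
    # the query side uses task_type="retrieval_query" so the two embedding
    # spaces are matched for retrieval.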
    for chunk in chunks:
        try:
            embedding = genai.embed_content(
                model="models/embedding-001",
                content=chunk,
                task_type="retrieval_document",
                title=title
            )
            embeddings.append(embedding["embedding"])
        except Exception as e:
            st.error(f"Error generating embedding for chunk: {e}")
            # Abort instead of skipping: dropping one chunk would misalign the
            # FAISS index positions with the `chunks` list used at retrieval.
            return []
    return embeddings

# Function to store embeddings in FAISS
def store_embeddings_in_faiss(embeddings):
    try:
        embeddings_array = np.array(embeddings).astype('float32')
        dimension = embeddings_array.shape[1]
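        # IndexFlatL2 performs exact brute-force L2 search; no training step
        # is needed, which is fine at the scale of a single PDF's chunks.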
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings_array)
        return index
    except Exception as e:
        st.error(f"Error storing embeddings in FAISS: {e}")
        return None

# Function to retrieve relevant chunks using FAISS
def retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3):
    try:
        query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
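        # search() returns (distances, indices) arrays of shape (1, top_k);
        # FAISS pads indices with -1 when fewer than top_k vectors are stored.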
        distances, indices = index.search(query_embedding, top_k)
        relevant_chunks = [chunks[i] for i in indices[0] if i != -1]
        return relevant_chunks
    except Exception as e:
        st.error(f"Error retrieving relevant chunks: {e}")
        return []

# Function to generate an answer using Gemini API
def generate_answer(query, context_chunks):
    try:
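        # Join the retrieved chunks into one context block and prompt Gemini
        # to answer based on that context only (the core RAG step).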
        context = "\n".join(context_chunks)
        prompt = f"""
        Context:
        {context}
        Question:
        {query}
        Answer the question based on the context provided above.
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        st.error(f"Error generating answer: {e}")
        return "Unable to generate an answer due to an error."

# Streamlit UI
with st.sidebar:
    st.title("Navigation")
    # CSS hack to hide Streamlit's default chrome (menu, footer, header).
    # The main menu's element id is #MainMenu; without the '#' the selector
    # matches nothing and the menu stays visible.
    hide_st_style = '''
        <style>
        #MainMenu {visibility: hidden;}
        footer {visibility: hidden;}
        header {visibility: hidden;}
        </style>
    '''
    st.markdown(hide_st_style, unsafe_allow_html=True)
    # Note: "Privacy Policy" has no matching page branch below, so selecting
    # it currently shows a blank page.
    page = st.radio("Options", ["Home", "MongoDB", "Privacy Policy"], label_visibility="collapsed")

if page == "Home":
    st.title("Gemini RAG Application")
    st.markdown("Upload a PDF document and ask questions to get answers using Google's Gemini API.")

    pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

    if pdf_file is not None:
        with st.spinner("Extracting text..."):
            extracted_text = extract_text_from_pdf(pdf_file)
        
        if extracted_text:
            with st.spinner("Splitting text into overlapping chunks..."):
                chunks = split_text_into_chunks(extracted_text, chunk_size=500, overlap=100)
            
            if chunks:
                with st.status(f"Total chunks: {len(chunks)}"):
                    for i, chunk in enumerate(chunks):
                        st.subheader(f"Chunk {i + 1}")
                        st.text_area(f"Chunk {i + 1} Text", chunk, height=200, key=f"chunk_{i}")
                
                with st.spinner("Generating embeddings..."):
                    embeddings = generate_embeddings(chunks)
                
                if embeddings:
                    with st.spinner("Storing embeddings in FAISS..."):
                        index = store_embeddings_in_faiss(embeddings)
                    
                    if index:
                        st.success("Embeddings have been successfully stored in the FAISS vector database.")
                        
                        query = st.text_input("Enter your question:")
                        if query:
                            with st.spinner("Generating query embedding..."):
                                query_embedding = genai.embed_content(
                                    model="models/embedding-001",
                                    content=query,
                                    task_type="retrieval_query"
                                )["embedding"]
                            
                            with st.spinner("Retrieving relevant chunks..."):
                                relevant_chunks = retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3)
                            
                            if relevant_chunks:
                                with st.status("Relevant context chunks"):
                                    for i, chunk in enumerate(relevant_chunks):
                                        st.subheader(f"Chunk {i + 1}")
                                        st.text_area(f"Relevant Chunk {i + 1} Text", chunk, height=200, key=f"relevant_chunk_{i}")
                                
                                with st.spinner("Generating answer..."):
                                    answer = generate_answer(query, relevant_chunks)
                                    st.write("### Answer:")
                                    st.write(answer)
                            else:
                                st.warning("No relevant chunks found.")
                    else:
                        st.error("Failed to store embeddings in FAISS.")
                else:
                    st.error("Failed to generate embeddings.")
            else:
                st.error("No chunks generated from the text.")
        else:
            st.error("No text extracted. The document might be image-based or corrupted.")

if page == "MongoDB":
    try:
        client = MongoClient(os.environ["MONGO_API_KEY"])
        # MongoClient connects lazily; ping forces a round-trip so that an
        # unreachable cluster actually raises ConnectionFailure here.
        client.admin.command('ping')
        db = client['resume_database']
        collection = db['resumes']
        st.success("Connected to MongoDB Atlas!")
    except ConnectionFailure:
        st.error("Failed to connect to MongoDB. Check your connection string.")
        st.stop()

    def extract_text_from_pdf(pdf_bytes):
        """Extract text from a PDF given its raw bytes (shadows the Home-page
        helper, which takes a file object instead; only one page branch runs
        per rerun, so the redefinition is harmless)."""
        try:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf") 
            text = ""
            for page in doc:
                text += page.get_text() 
            return text
        except Exception as e:
            st.error(f"Error extracting text: {e}")
            return None

    # Split resume text into sections
    def split_resume_into_sections(resume_text):
        """Split the resume text into sections like Education, Experience, etc."""
        sections = {
            'education': [],
            'experience': [],
            'technical_skills': [],
            'projects': [],
            'certifications': []
        }
        
        current_section = None
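        # Keyword-based routing: a sentence mentioning a heading keyword
        # switches the current section; sentences appearing before the first
        # recognized heading are discarded.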
        for sentence in sent_tokenize(resume_text):  # Split text into sentences
            sentence_upper = sentence.upper()  # Convert to uppercase for easier matching
            if "EDUCATION" in sentence_upper:
                current_section = 'education'
            elif "EXPERIENCE" in sentence_upper:
                current_section = 'experience'
            elif "TECHNICAL SKILLS" in sentence_upper:
                current_section = 'technical_skills'
            elif "PROJECTS" in sentence_upper:
                current_section = 'projects'
            elif "CERTIFICATIONS" in sentence_upper:
                current_section = 'certifications'
            
            if current_section:  # Add the sentence to the appropriate section
                sections[current_section].append(sentence.strip())
        
        return sections

    # Save resume data to MongoDB
    def save_resume_to_mongodb(pdf_bytes, user_id):
        """Save the resume text and sections to MongoDB."""
        try:
            resume_text = extract_text_from_pdf(pdf_bytes)
            if not resume_text:
                return None
            resume_sections = split_resume_into_sections(resume_text)
            
            # Prepare data to save
            resume_data = {
                'user_id': user_id,
                'resume': resume_sections
            }
            
            # Insert data into MongoDB
            result = collection.insert_one(resume_data)
            return result.inserted_id 
        except OperationFailure as e:
            st.error(f"Error saving data: {e}")
            return None

    # Fetch resume data from MongoDB
    def fetch_resume_from_mongodb(user_id):
        """Fetch resume data from MongoDB using the user ID."""
        try:
            resume_data = collection.find_one({"user_id": user_id})
            return resume_data
        except OperationFailure as e:
            st.error(f"Error fetching data: {e}")
            return None

    st.title("Resume Extractor and MongoDB Storage")
    st.write("Upload a PDF resume, extract text, and store it in MongoDB.")
    st.header("Step 1: Upload and Store Resume")
    pdf_file = st.file_uploader("Upload a PDF Resume", type="pdf")

    if pdf_file:
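        # Read the upload once and reuse the bytes; UploadedFile is a stream,
        # so a second .read() without seek(0) would return nothing.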
        pdf_bytes = pdf_file.read()
        resume_text = extract_text_from_pdf(pdf_bytes)
        
        if resume_text:
            st.subheader("Extracted Text")
            st.write(resume_text)
            
            user_id = st.text_input("Enter User ID", "12345")
            
            if st.button("Save Resume to MongoDB"):
                with st.spinner("Saving..."):
                    inserted_id = save_resume_to_mongodb(pdf_bytes, user_id)
                    if inserted_id:
                        st.success(f"Resume saved! Document ID: {inserted_id}")

    # Step 2: fetch resume data back from MongoDB
    st.header("Step 2: Retrieve Resume Data")
    user_id_to_fetch = st.text_input("Enter User ID to Fetch Data", "12345")

    if st.button("Fetch Resume"):
        with st.spinner("Fetching..."):
            resume_data = fetch_resume_from_mongodb(user_id_to_fetch)
            
            if resume_data:
                st.subheader(f"Resume Data for User ID: {user_id_to_fetch}")
                st.json(json.dumps(resume_data, default=str, indent=4))  # default=str makes ObjectId serializable
            else:
                st.warning(f"No resume found for User ID: {user_id_to_fetch}")