import json
import os

import faiss
import fitz  # PyMuPDF
import google.generativeai as genai
import nltk
import numpy as np
import streamlit as st
from nltk.tokenize import sent_tokenize, word_tokenize
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, OperationFailure

# NLTK resources: punkt/punkt_tab power word_tokenize and sent_tokenize;
# wordnet/omw-1.4 are downloaded here but not used elsewhere in this script.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

genai.configure(api_key=os.environ["AI_API_KEY"])
gemini_model = genai.GenerativeModel('gemini-1.5-flash')


# Function to extract text from the uploaded PDF using PyMuPDF (fitz)
def extract_text_from_pdf(pdf_file):
    try:
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return None


# Function to split text into overlapping chunks using NLTK tokenization
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    try:
        words = word_tokenize(text)
        chunks = []
        for i in range(0, len(words), chunk_size - overlap):
            chunks.append(" ".join(words[i:i + chunk_size]))
        return chunks
    except Exception as e:
        st.error(f"Error splitting text into chunks: {e}")
        return []


# Function to generate embeddings for a list of text chunks
def generate_embeddings(chunks, title="PDF Document"):
    embeddings = []
    for chunk in chunks:
        try:
            embedding = genai.embed_content(
                model="models/embedding-001",
                content=chunk,
                task_type="retrieval_document",
                title=title,
            )
            embeddings.append(embedding["embedding"])
        except Exception as e:
            # Abort on the first failure: silently skipping a chunk would
            # misalign the embeddings with the `chunks` list and make the
            # FAISS result indices point at the wrong text.
            st.error(f"Error generating embedding for chunk: {e}")
            return []
    return embeddings


# Function to store embeddings in FAISS (exact L2 search, in memory)
def store_embeddings_in_faiss(embeddings):
    try:
        embeddings_array = np.array(embeddings).astype('float32')
        dimension = embeddings_array.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings_array)
        return index
    except Exception as e:
        st.error(f"Error storing embeddings in FAISS: {e}")
        return None


# Function to retrieve relevant chunks using FAISS
def retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3):
    try:
        query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
        distances, indices = index.search(query_embedding, top_k)
        # FAISS pads with -1 when the index holds fewer than top_k vectors;
        # filter those out so we never wrap around to chunks[-1].
        return [chunks[i] for i in indices[0] if i != -1]
    except Exception as e:
        st.error(f"Error retrieving relevant chunks: {e}")
        return []


# Function to generate an answer using Gemini API
def generate_answer(query, context_chunks):
    try:
        context = "\n".join(context_chunks)
        prompt = f"""
        Context:
        {context}

        Question: {query}

        Answer the question based on the context provided above.
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        st.error(f"Error generating answer: {e}")
        return "Unable to generate an answer due to an error."
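
# --- Optional variant (a sketch, not wired into the app below): cosine-similarity
# retrieval. Text embeddings such as embedding-001's are commonly compared by
# cosine similarity; an inner-product index over L2-normalised vectors computes
# exactly that and often ranks results better than raw L2 distance. This is a
# drop-in alternative to store_embeddings_in_faiss, provided the query embedding
# is normalised the same way before searching.
def store_embeddings_in_faiss_cosine(embeddings):
    try:
        embeddings_array = np.array(embeddings).astype('float32')
        faiss.normalize_L2(embeddings_array)  # in-place L2 normalisation
        index = faiss.IndexFlatIP(embeddings_array.shape[1])  # inner product == cosine on unit vectors
        index.add(embeddings_array)
        return index
    except Exception as e:
        st.error(f"Error storing embeddings in FAISS: {e}")
        return None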
# Streamlit UI
with st.sidebar:
    st.title("Navigation")
    # Placeholder for custom CSS (e.g. hiding default Streamlit chrome);
    # the string is left empty in this script, so the markdown call is a no-op.
    hide_st_style = ''' '''
    st.markdown(hide_st_style, unsafe_allow_html=True)
    page = st.radio("Options", ["Home", "MongoDb", "Privacy Policy"], label_visibility="collapsed")

if page == "Home":
    st.title("Gemini RAG Application")
    st.markdown("Upload a PDF document and ask questions to get answers using Google's Gemini API.")
    pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
    if pdf_file is not None:
        with st.spinner("Extracting text..."):
            extracted_text = extract_text_from_pdf(pdf_file)
        if extracted_text:
            with st.spinner("Splitting text into overlapping chunks..."):
                chunks = split_text_into_chunks(extracted_text, chunk_size=500, overlap=100)
            if chunks:
                with st.status(f"Total chunks: {len(chunks)}"):
                    for i, chunk in enumerate(chunks):
                        st.subheader(f"Chunk {i + 1}")
                        st.text_area(f"Chunk {i + 1} Text", chunk, height=200, key=f"chunk_{i}")
                with st.spinner("Generating embeddings..."):
                    embeddings = generate_embeddings(chunks)
                if embeddings:
                    with st.spinner("Storing embeddings in FAISS..."):
                        index = store_embeddings_in_faiss(embeddings)
                    if index:
                        st.success("Embeddings have been successfully stored in the FAISS vector database.")
                        query = st.text_input("Enter your question:")
                        if query:
                            with st.spinner("Generating query embedding..."):
                                query_embedding = genai.embed_content(
                                    model="models/embedding-001",
                                    content=query,
                                    task_type="retrieval_query",
                                )["embedding"]
                            with st.spinner("Retrieving relevant chunks..."):
                                relevant_chunks = retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3)
                            if relevant_chunks:
                                with st.status("### Relevant Context Chunks:"):
                                    for i, chunk in enumerate(relevant_chunks):
                                        st.subheader(f"Chunk {i + 1}")
                                        st.text_area(f"Relevant Chunk {i + 1} Text", chunk, height=200, key=f"relevant_chunk_{i}")
                                with st.spinner("Generating answer..."):
                                    answer = generate_answer(query, relevant_chunks)
                                st.write("### Answer:")
                                st.write(answer)
                            else:
                                st.warning("No relevant chunks found.")
                    else:
                        st.error("Failed to store embeddings in FAISS.")
                else:
                    st.error("Failed to generate embeddings.")
            else:
                st.error("No chunks generated from the text.")
        else:
            st.error("No text extracted. The document might be image-based or corrupted.")

elif page == "MongoDb":
    try:
        # MONGO_API_KEY holds the MongoDB Atlas connection string.
        client = MongoClient(os.environ["MONGO_API_KEY"])
        # MongoClient connects lazily, so force a round trip here; otherwise
        # the except branch below would never fire for a bad connection string.
        client.admin.command('ping')
        db = client['resume_database']
        collection = db['resumes']
        st.success("Connected to MongoDB Atlas!")
    except ConnectionFailure:
        st.error("Failed to connect to MongoDB. Check your connection string.")
        st.stop()
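
    # --- Optional hardening (a sketch, not part of the original flow): a unique
    # index on user_id prevents duplicate resume documents for the same user.
    # Index creation is idempotent, so repeating it on each page load is
    # harmless. Note that with this index in place, re-saving the same user_id
    # via insert_one fails (caught as OperationFailure below); see the upsert
    # sketch at the end of the script for the complementary fix.
    try:
        collection.create_index("user_id", unique=True)
    except OperationFailure as e:
        st.warning(f"Could not create unique index on user_id: {e}")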
Check your connection string.") st.stop() def extract_text_from_pdf(pdf_bytes): """Extract text from a PDF file.""" try: doc = fitz.open(stream=pdf_bytes, filetype="pdf") text = "" for page in doc: text += page.get_text() return text except Exception as e: st.error(f"Error extracting text: {e}") return None # Split resume text into sections def split_resume_into_sections(resume_text): """Split the resume text into sections like Education, Experience, etc.""" sections = { 'education': [], 'experience': [], 'technical_skills': [], 'projects': [], 'certifications': [] } current_section = None for sentence in sent_tokenize(resume_text): # Split text into sentences sentence_upper = sentence.upper() # Convert to uppercase for easier matching if "EDUCATION" in sentence_upper: current_section = 'education' elif "EXPERIENCE" in sentence_upper: current_section = 'experience' elif "TECHNICAL SKILLS" in sentence_upper: current_section = 'technical_skills' elif "PROJECTS" in sentence_upper: current_section = 'projects' elif "CERTIFICATIONS" in sentence_upper: current_section = 'certifications' if current_section: # Add the sentence to the appropriate section sections[current_section].append(sentence.strip()) return sections # Save resume data to MongoDB def save_resume_to_mongodb(pdf_bytes, user_id): """Save the resume text and sections to MongoDB.""" try: resume_text = extract_text_from_pdf(pdf_bytes) if not resume_text: return None resume_sections = split_resume_into_sections(resume_text) # Prepare data to save resume_data = { 'user_id': user_id, 'resume': resume_sections } # Insert data into MongoDB result = collection.insert_one(resume_data) return result.inserted_id except OperationFailure as e: st.error(f"Error saving data: {e}") return None # Fetch resume data from MongoDB def fetch_resume_from_mongodb(user_id): """Fetch resume data from MongoDB using the user ID.""" try: resume_data = collection.find_one({"user_id": user_id}) return resume_data except OperationFailure as e: st.error(f"Error fetching data: {e}") return None st.title("Resume Extractor and MongoDB Storage") st.write("Upload a PDF resume, extract text, and store it in MongoDB.") st.header("Step 1: Upload and Store Resume") pdf_file = st.file_uploader("Upload a PDF Resume", type="pdf") if pdf_file: pdf_bytes = pdf_file.read() resume_text = extract_text_from_pdf(pdf_bytes) if resume_text: st.subheader("Extracted Text") st.write(resume_text) user_id = st.text_input("Enter User ID", "12345") if st.button("Save Resume to MongoDB"): with st.spinner("Saving..."): inserted_id = save_resume_to_mongodb(pdf_bytes, user_id) if inserted_id: st.success(f"Resume saved! Document ID: {inserted_id}") #Fetch resume data from MongoDB st.header("Step 2: Retrieve Resume Data") user_id_to_fetch = st.text_input("Enter User ID to Fetch Data", "12345") if st.button("Fetch Resume"): with st.spinner("Fetching..."): resume_data = fetch_resume_from_mongodb(user_id_to_fetch) if resume_data: st.subheader(f"Resume Data for User ID: {user_id_to_fetch}") st.json(json.dumps(resume_data, default=str, indent=4)) # Show data as JSON else: st.warning(f"No resume found for User ID: {user_id_to_fetch}")