import os
import json

import streamlit as st
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import google.generativeai as genai
import faiss
import numpy as np
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, OperationFailure


# NLTK resources: punkt/punkt_tab provide the tokenizers used below;
# wordnet and omw-1.4 are fetched as well, although only tokenization is used.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
genai.configure(api_key=os.environ["AI_API_KEY"]) |
|
|
gemini_model = genai.GenerativeModel('gemini-1.5-flash') |
|
|
|
|
|
|
|
|


def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF (a file-like object) using PyMuPDF."""
    try:
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return None


def split_text_into_chunks(text, chunk_size=500, overlap=100):
    """Split text into chunks of `chunk_size` words; consecutive chunks share `overlap` words."""
    try:
        words = word_tokenize(text)
        chunks = []
        # Step by (chunk_size - overlap) so each chunk overlaps the previous one.
        for i in range(0, len(words), chunk_size - overlap):
            chunks.append(" ".join(words[i:i + chunk_size]))
        return chunks
    except Exception as e:
        st.error(f"Error splitting text into chunks: {e}")
        return []
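

# A quick worked example of the overlap arithmetic (illustrative only):
# with chunk_size=5 and overlap=2 the stride is 3, so a 10-word text produces
# chunks starting at word indices 0, 3, 6, 9 -- e.g.
#   split_text_into_chunks("one two three four five six seven eight nine ten",
#                          chunk_size=5, overlap=2)
#   -> ["one two three four five", "four five six seven eight",
#       "seven eight nine ten", "ten"]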
def generate_embeddings(chunks, title="PDF Document"): |
|
|
embeddings = [] |
|
|
for chunk in chunks: |
|
|
try: |
|
|
embedding = genai.embed_content( |
|
|
model="models/embedding-001", |
|
|
content=chunk, |
|
|
task_type="retrieval_document", |
|
|
title=title |
|
|
) |
|
|
embeddings.append(embedding["embedding"]) |
|
|
except Exception as e: |
|
|
st.error(f"Error generating embedding for chunk: {e}") |
|
|
return embeddings |
|
|
|
|
|
|
|
|
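

# A hedged batching sketch (not wired into the app): the google-generativeai
# SDK also accepts a list of strings as `content`, which embeds all chunks in
# a single request instead of one round-trip per chunk. The response shape
# below is an assumption -- verify it against your installed SDK version.
def generate_embeddings_batched(chunks, title="PDF Document"):
    """Sketch: embed all chunks in one embed_content call."""
    result = genai.embed_content(
        model="models/embedding-001",
        content=chunks,               # list of strings instead of one string
        task_type="retrieval_document",
        title=title,
    )
    return result["embedding"]        # assumed: one vector per input chunk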


def store_embeddings_in_faiss(embeddings):
    """Build an exact (brute-force) L2-distance FAISS index over the vectors."""
    try:
        embeddings_array = np.array(embeddings).astype('float32')
        dimension = embeddings_array.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings_array)
        return index
    except Exception as e:
        st.error(f"Error storing embeddings in FAISS: {e}")
        return None
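

# A hedged alternative (not used by the app): text embeddings are often
# compared by cosine similarity rather than L2 distance. With FAISS that means
# L2-normalizing the vectors and searching an inner-product index; both
# faiss.normalize_L2 and faiss.IndexFlatIP are standard FAISS APIs.
def store_embeddings_in_faiss_cosine(embeddings):
    """Sketch: cosine-similarity index (inner product over unit vectors)."""
    embeddings_array = np.array(embeddings).astype('float32')
    faiss.normalize_L2(embeddings_array)  # normalizes rows in place
    index = faiss.IndexFlatIP(embeddings_array.shape[1])
    index.add(embeddings_array)
    return index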


def retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3):
    """Return the `top_k` chunks whose embeddings are nearest to the query embedding."""
    try:
        query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
        distances, indices = index.search(query_embedding, top_k)
        return [chunks[i] for i in indices[0]]
    except Exception as e:
        st.error(f"Error retrieving relevant chunks: {e}")
        return []


def generate_answer(query, context_chunks):
    """Ask Gemini to answer the query using only the retrieved context."""
    try:
        context = "\n".join(context_chunks)
        prompt = f"""
Context:
{context}

Question:
{query}

Answer the question based on the context provided above.
"""
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        st.error(f"Error generating answer: {e}")
        return "Unable to generate an answer due to an error."


with st.sidebar:
    st.title("Navigation")
    # Hide Streamlit's default menu, footer, and header via a CSS override.
    hide_st_style = '''
        <style>
        #MainMenu {visibility: hidden;}
        footer {visibility: hidden;}
        header {visibility: hidden;}
        </style>
    '''
    st.markdown(hide_st_style, unsafe_allow_html=True)
    page = st.radio("Options", ["Home", "MongoDb", "Privacy Policy"], label_visibility="collapsed")
if page == "Home": |
|
|
st.title("Gemini RAG Application") |
|
|
st.markdown("Upload a PDF document and ask questions to get answers using Google's Gemini API.") |
|
|
|
|
|
pdf_file = st.file_uploader("Choose a PDF file", type="pdf") |
|
|
|
|
|
if pdf_file is not None: |
|
|
with st.spinner("Extracting text..."): |
|
|
extracted_text = extract_text_from_pdf(pdf_file) |
|
|
|
|
|
if extracted_text: |
|
|
with st.spinner("Splitting text into overlapping chunks..."): |
|
|
chunks = split_text_into_chunks(extracted_text, chunk_size=500, overlap=100) |
|
|
|
|
|
if chunks: |
|
|
with st.status(f"Total chunks: {len(chunks)}"): |
|
|
for i, chunk in enumerate(chunks): |
|
|
st.subheader(f"Chunk {i + 1}") |
|
|
st.text_area(f"Chunk {i + 1} Text", chunk, height=200, key=f"chunk_{i}") |
|
|
|
|
|
with st.spinner("Generating embeddings..."): |
|
|
embeddings = generate_embeddings(chunks) |
|
|
|
|
|
if embeddings: |
|
|
with st.spinner("Storing embeddings in FAISS..."): |
|
|
index = store_embeddings_in_faiss(embeddings) |
|
|
|
|
|
if index: |
|
|
st.success("Embeddings have been successfully stored in the FAISS vector database.") |
|
|
|
|
|
query = st.text_input("Enter your question:") |
|
|
if query: |
|
|
with st.spinner("Generating query embedding..."): |
|
|
query_embedding = genai.embed_content( |
|
|
model="models/embedding-001", |
|
|
content=query, |
|
|
task_type="retrieval_query" |
|
|
)["embedding"] |
|
|
|
|
|
with st.spinner("Retrieving relevant chunks..."): |
|
|
relevant_chunks = retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3) |
|
|
|
|
|
if relevant_chunks: |
|
|
with st.status("### Relevant Context Chunks:"): |
|
|
for i, chunk in enumerate(relevant_chunks): |
|
|
st.subheader(f"Chunk {i + 1}") |
|
|
st.text_area(f"Relevant Chunk {i + 1} Text", chunk, height=200, key=f"relevant_chunk_{i}") |
|
|
|
|
|
with st.spinner("Generating answer..."): |
|
|
answer = generate_answer(query, relevant_chunks) |
|
|
st.write("### Answer:") |
|
|
st.write(answer) |
|
|
else: |
|
|
st.warning("No relevant chunks found.") |
|
|
else: |
|
|
st.error("Failed to store embeddings in FAISS.") |
|
|
else: |
|
|
st.error("Failed to generate embeddings.") |
|
|
else: |
|
|
st.error("No chunks generated from the text.") |
|
|
else: |
|
|
st.error("No text extracted. The document might be image-based or corrupted.") |
|
|
|
|
|
if page == "MongoDb": |
|
|
try: |
|
|
client = MongoClient(os.environ["MONGO_API_KEY"]) |
|
|
db = client['resume_database'] |
|
|
collection = db['resumes'] |
|
|
st.success("Connected to MongoDB Atlas!") |
|
|
except ConnectionFailure: |
|
|
st.error("Failed to connect to MongoDB. Check your connection string.") |
|
|
st.stop() |
|
|
|
|
|

    def extract_text_from_pdf(pdf_bytes):
        """Extract text from a PDF given as raw bytes (shadows the file-object version above)."""
        try:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            text = ""
            for page in doc:
                text += page.get_text()
            doc.close()
            return text
        except Exception as e:
            st.error(f"Error extracting text: {e}")
            return None

    def split_resume_into_sections(resume_text):
        """Split the resume text into sections like Education, Experience, etc.

        Section boundaries are detected by keyword headings; sentences before
        the first recognized heading are ignored.
        """
        sections = {
            'education': [],
            'experience': [],
            'technical_skills': [],
            'projects': [],
            'certifications': []
        }

        current_section = None
        for sentence in sent_tokenize(resume_text):
            sentence_upper = sentence.upper()
            if "EDUCATION" in sentence_upper:
                current_section = 'education'
            elif "EXPERIENCE" in sentence_upper:
                current_section = 'experience'
            elif "TECHNICAL SKILLS" in sentence_upper:
                current_section = 'technical_skills'
            elif "PROJECTS" in sentence_upper:
                current_section = 'projects'
            elif "CERTIFICATIONS" in sentence_upper:
                current_section = 'certifications'

            if current_section:
                sections[current_section].append(sentence.strip())

        return sections
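
    # Illustrative output shape (hypothetical input, shown for clarity only):
    # split_resume_into_sections("EDUCATION B.Sc. ... EXPERIENCE Built ...")
    # would yield something like
    #   {'education': ['EDUCATION B.Sc. ...'],
    #    'experience': ['EXPERIENCE Built ...'],
    #    'technical_skills': [], 'projects': [], 'certifications': []}
    # with each heading sentence stored inside its own section.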

    def save_resume_to_mongodb(pdf_bytes, user_id):
        """Save the resume text and sections to MongoDB."""
        try:
            resume_text = extract_text_from_pdf(pdf_bytes)
            if not resume_text:
                return None
            resume_sections = split_resume_into_sections(resume_text)

            resume_data = {
                'user_id': user_id,
                'resume': resume_sections
            }

            result = collection.insert_one(resume_data)
            return result.inserted_id
        except OperationFailure as e:
            st.error(f"Error saving data: {e}")
            return None

    def fetch_resume_from_mongodb(user_id):
        """Fetch resume data from MongoDB using the user ID."""
        try:
            return collection.find_one({"user_id": user_id})
        except OperationFailure as e:
            st.error(f"Error fetching data: {e}")
            return None
st.title("Resume Extractor and MongoDB Storage") |
|
|
st.write("Upload a PDF resume, extract text, and store it in MongoDB.") |
|
|
st.header("Step 1: Upload and Store Resume") |
|
|
pdf_file = st.file_uploader("Upload a PDF Resume", type="pdf") |
|
|
|
|
|
if pdf_file: |
|
|
pdf_bytes = pdf_file.read() |
|
|
resume_text = extract_text_from_pdf(pdf_bytes) |
|
|
|
|
|
if resume_text: |
|
|
st.subheader("Extracted Text") |
|
|
st.write(resume_text) |
|
|
|
|
|
user_id = st.text_input("Enter User ID", "12345") |
|
|
|
|
|
if st.button("Save Resume to MongoDB"): |
|
|
with st.spinner("Saving..."): |
|
|
inserted_id = save_resume_to_mongodb(pdf_bytes, user_id) |
|
|
if inserted_id: |
|
|
st.success(f"Resume saved! Document ID: {inserted_id}") |
|
|
|
|
|
|
|
|
st.header("Step 2: Retrieve Resume Data") |
|
|
user_id_to_fetch = st.text_input("Enter User ID to Fetch Data", "12345") |
|
|
|
|
|
if st.button("Fetch Resume"): |
|
|
with st.spinner("Fetching..."): |
|
|
resume_data = fetch_resume_from_mongodb(user_id_to_fetch) |
|
|
|
|
|
if resume_data: |
|
|
st.subheader(f"Resume Data for User ID: {user_id_to_fetch}") |
|
|
st.json(json.dumps(resume_data, default=str, indent=4)) |
|
|
else: |
|
|
st.warning(f"No resume found for User ID: {user_id_to_fetch}") |