Spaces:

tjwrld
/

Talk

Sleeping

App Files Files Community

Talk / app.py

tjwrld

Update app.py

fcf3d8f verified 10 months ago

raw

history blame contribute delete

11.7 kB

	import streamlit as st
	import fitz # PyMuPDF
	import nltk
	from nltk.tokenize import word_tokenize
	import google.generativeai as genai
	import faiss
	import numpy as np
	from pymongo import MongoClient
	from nltk.tokenize import sent_tokenize
	import json
	from pymongo.errors import ConnectionFailure, OperationFailure
	import os

	# Download the NLTK data packages used by this app, in a fixed order.
	for nltk_resource in ('punkt_tab', 'punkt', 'wordnet', 'omw-1.4'):
	    nltk.download(nltk_resource)

	# Configure the Gemini client; the API key is read from the AI_API_KEY
	# environment variable (a missing variable raises KeyError here).
	genai.configure(api_key=os.environ["AI_API_KEY"])
	gemini_model = genai.GenerativeModel('gemini-1.5-flash')

	# Function to extract text from the uploaded PDF using PyMuPDF (fitz)
	def extract_text_from_pdf(pdf_file):
	    """Return the concatenated text of every page of an uploaded PDF.

	    Args:
	        pdf_file: File-like object whose ``read()`` returns the raw PDF bytes.

	    Returns:
	        The text of all pages joined in page order, or ``None`` if the
	        document could not be opened or read (the error is shown with
	        ``st.error``).
	    """
	    try:
	        # Open as a context manager so the document is closed even when
	        # reading a page raises.
	        with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
	            return "".join(page.get_text() for page in doc)
	    except Exception as e:
	        st.error(f"Error extracting text from PDF: {e}")
	        return None

	# Function to split text into overlapping chunks using NLTK tokenization
	def split_text_into_chunks(text, chunk_size=500, overlap=100):
	    """Split text into overlapping chunks of whitespace-joined NLTK tokens.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of tokens per chunk; must be positive.
	        overlap: Number of tokens shared by consecutive chunks; must be
	            smaller than ``chunk_size``.

	    Returns:
	        A list of chunk strings. An empty list is returned for empty text,
	        and also (after showing the error with ``st.error``) when the
	        arguments are invalid or tokenization fails.
	    """
	    try:
	        step = chunk_size - overlap
	        if chunk_size <= 0 or step <= 0:
	            # A zero step makes range() raise a cryptic error and a negative
	            # step silently yields no chunks, so reject both explicitly.
	            raise ValueError(
	                "chunk_size must be positive and greater than overlap "
	                f"(got chunk_size={chunk_size}, overlap={overlap})"
	            )

	        words = word_tokenize(text)
	        chunks = []
	        for start in range(0, len(words), step):
	            chunks.append(" ".join(words[start:start + chunk_size]))
	            if start + chunk_size >= len(words):
	                # This window already reaches the last token; any further
	                # window would be fully contained in it.
	                break
	        return chunks
	    except Exception as e:
	        st.error(f"Error splitting text into chunks: {e}")
	        return []

	# Function to generate embeddings for a list of text chunks
	def generate_embeddings(chunks, title="PDF Document"):
	    """Embed every chunk with the ``models/embedding-001`` model.

	    Args:
	        chunks: Sequence of text chunks to embed.
	        title: Document title passed along with each embedding request.

	    Returns:
	        A list with exactly one embedding per chunk, in the same order as
	        ``chunks``. If any request fails, the error is shown with
	        ``st.error`` and an empty list is returned.
	    """
	    embeddings = []
	    for chunk in chunks:
	        try:
	            result = genai.embed_content(
	                model="models/embedding-001",
	                content=chunk,
	                task_type="retrieval_document",
	                title=title
	            )
	            embeddings.append(result["embedding"])
	        except Exception as e:
	            st.error(f"Error generating embedding for chunk: {e}")
	            # Skipping the failed chunk would shift every later embedding
	            # by one position, so position i would no longer describe
	            # chunks[i]. Abort rather than return a misaligned list.
	            return []
	    return embeddings

	# Function to store embeddings in FAISS
	def store_embeddings_in_faiss(embeddings):
	    """Build a FAISS ``IndexFlatL2`` holding the given embeddings.

	    Args:
	        embeddings: Sequence of equal-length numeric vectors.

	    Returns:
	        The populated index, or ``None`` if building it failed (the error
	        is shown with ``st.error``).
	    """
	    try:
	        vectors = np.array(embeddings).astype('float32')
	        # The index dimension is taken from the width of the vector matrix.
	        faiss_index = faiss.IndexFlatL2(vectors.shape[1])
	        faiss_index.add(vectors)
	    except Exception as e:
	        st.error(f"Error storing embeddings in FAISS: {e}")
	        return None
	    return faiss_index

	# Function to retrieve relevant chunks using FAISS
	def retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3):
	    """Return the chunks whose embeddings are nearest to the query embedding.

	    Args:
	        query_embedding: Query vector as a flat sequence of numbers.
	        index: Object with a FAISS-style ``search(vectors, k)`` method that
	            returns ``(distances, indices)``.
	        chunks: Chunk texts; position ``i`` must correspond to vector ``i``
	            in ``index``.
	        top_k: Maximum number of chunks to return.

	    Returns:
	        Up to ``top_k`` chunks in the order the index ranked them, or an
	        empty list if the search failed (the error is shown with
	        ``st.error``).
	    """
	    try:
	        query_vector = np.asarray(query_embedding, dtype='float32').reshape(1, -1)
	        _, neighbour_ids = index.search(query_vector, top_k)
	        # FAISS fills missing results with -1 when the index holds fewer than
	        # top_k vectors; chunks[-1] would silently return the last chunk, so
	        # those placeholders are dropped.
	        return [chunks[i] for i in neighbour_ids[0] if i >= 0]
	    except Exception as e:
	        st.error(f"Error retrieving relevant chunks: {e}")
	        return []

	# Function to generate an answer using Gemini API
	def generate_answer(query, context_chunks):
	    """Ask the Gemini model to answer a question from the supplied context.

	    Args:
	        query: The user's question.
	        context_chunks: Text chunks that are newline-joined and placed in
	            the prompt as context.

	    Returns:
	        The text of the model's response, or a fixed fallback message if
	        the request or reading the response fails (the error is shown with
	        ``st.error``).
	    """
	    fallback_answer = "Unable to generate an answer due to an error."
	    try:
	        context = "\n".join(context_chunks)
	        # The prompt literal is kept exactly as written; its whitespace is
	        # part of what is sent to the model.
	        prompt = f"""
	Context:
	{context}
	Question:
	{query}
	Answer the question based on the context provided above.
	"""
	        gemini_reply = gemini_model.generate_content(prompt)
	        # Reading .text stays inside the try block so a failure there is
	        # reported like any other error.
	        return gemini_reply.text
	    except Exception as e:
	        st.error(f"Error generating answer: {e}")
	        return fallback_answer

	# Streamlit UI
	# Sidebar: page selector plus a CSS snippet that hides Streamlit's default
	# menu, footer and header.
	with st.sidebar:
	    st.title("Navigation")
	    # "#MainMenu" is an id selector. The bare word "MainMenu" would be a type
	    # selector for a <MainMenu> element, which does not exist, so that rule
	    # never matched anything.
	    hide_st_style = '''
	<style>
	#MainMenu {visibility: hidden;}
	footer {visibility: hidden;}
	header {visibility: hidden;}
	</style>
	'''
	    st.markdown(hide_st_style, unsafe_allow_html=True)
	    # `page` selects which of the page sections below is rendered.
	    page = st.radio("Options", ["Home","MongoDb", "Privacy Policy"], label_visibility="collapsed")

	def _render_home_page():
	    """Render the "Home" page: upload a PDF, index it, and answer a question.

	    The stages run in order: text extraction, chunking, embedding, FAISS
	    indexing, retrieval, and answer generation. Each stage runs only if the
	    previous one produced a result; otherwise a message is shown and
	    rendering of this page stops for the current run.
	    """
	    st.title("Gemini RAG Application")
	    st.markdown("Upload a PDF document and ask questions to get answers using Google's Gemini API.")

	    pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
	    if pdf_file is None:
	        return

	    with st.spinner("Extracting text..."):
	        extracted_text = extract_text_from_pdf(pdf_file)
	    if not extracted_text:
	        st.error("No text extracted. The document might be image-based or corrupted.")
	        return

	    with st.spinner("Splitting text into overlapping chunks..."):
	        chunks = split_text_into_chunks(extracted_text, chunk_size=500, overlap=100)
	    if not chunks:
	        st.error("No chunks generated from the text.")
	        return

	    with st.status(f"Total chunks: {len(chunks)}"):
	        for i, chunk in enumerate(chunks):
	            st.subheader(f"Chunk {i + 1}")
	            st.text_area(f"Chunk {i + 1} Text", chunk, height=200, key=f"chunk_{i}")

	    with st.spinner("Generating embeddings..."):
	        embeddings = generate_embeddings(chunks)
	    if not embeddings:
	        st.error("Failed to generate embeddings.")
	        return

	    with st.spinner("Storing embeddings in FAISS..."):
	        index = store_embeddings_in_faiss(embeddings)
	    # store_embeddings_in_faiss signals failure with None; test for that
	    # explicitly instead of relying on the truthiness of the index object.
	    if index is None:
	        st.error("Failed to store embeddings in FAISS.")
	        return
	    st.success("Embeddings have been successfully stored in the FAISS vector database.")

	    query = st.text_input("Enter your question:")
	    if not query:
	        return

	    with st.spinner("Generating query embedding..."):
	        # Every other API call on this page reports failures with st.error;
	        # handle this one the same way instead of letting it raise.
	        try:
	            query_embedding = genai.embed_content(
	                model="models/embedding-001",
	                content=query,
	                task_type="retrieval_query"
	            )["embedding"]
	        except Exception as e:
	            st.error(f"Error generating query embedding: {e}")
	            return

	    with st.spinner("Retrieving relevant chunks..."):
	        relevant_chunks = retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3)
	    if not relevant_chunks:
	        st.warning("No relevant chunks found.")
	        return

	    with st.status("### Relevant Context Chunks:"):
	        for i, chunk in enumerate(relevant_chunks):
	            st.subheader(f"Chunk {i + 1}")
	            st.text_area(f"Relevant Chunk {i + 1} Text", chunk, height=200, key=f"relevant_chunk_{i}")

	    with st.spinner("Generating answer..."):
	        answer = generate_answer(query, relevant_chunks)
	    st.write("### Answer:")
	    st.write(answer)


	if page == "Home":
	    _render_home_page()

	# MongoDB page: open the connection used by the resume helpers below.
	if page == "MongoDb":
	    try:
	        # The connection string is read from the MONGO_API_KEY environment variable.
	        client = MongoClient(os.environ["MONGO_API_KEY"])
	        # MongoClient connects lazily, so constructing it proves nothing about
	        # reachability. A ping forces a round trip, letting an unreachable
	        # server raise ConnectionFailure here instead of at first use.
	        client.admin.command("ping")
	        db = client['resume_database']
	        collection = db['resumes']
	        st.success("Connected to MongoDB Atlas!")
	    except ConnectionFailure:
	        st.error("Failed to connect to MongoDB. Check your connection string.")
	        st.stop()

	def extract_text_from_pdf(pdf_bytes):
	    """Extract text from a PDF file.

	    Args:
	        pdf_bytes: The raw bytes of the PDF document.

	    Returns:
	        The text of all pages joined in page order, or ``None`` if the
	        document could not be opened or read (the error is shown with
	        ``st.error``).
	    """
	    try:
	        # Open as a context manager so the document is closed even when
	        # reading a page raises.
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	            return "".join(page.get_text() for page in doc)
	    except Exception as e:
	        st.error(f"Error extracting text: {e}")
	        return None

	# Split resume text into sections
	def split_resume_into_sections(resume_text):
	    """Group the sentences of a resume under the section heading they follow.

	    The text is split into sentences. A sentence containing a known heading
	    keyword (matched case-insensitively as a substring) switches the current
	    section, and is itself stored in that section. Sentences seen before
	    any heading are discarded.

	    Args:
	        resume_text: Full text of the resume.

	    Returns:
	        A dict with the keys ``education``, ``experience``,
	        ``technical_skills``, ``projects`` and ``certifications``, each
	        mapping to a list of stripped sentences.
	    """
	    # Ordered (heading keyword, section key) pairs; the first keyword found
	    # in a sentence wins.
	    heading_lookup = (
	        ("EDUCATION", 'education'),
	        ("EXPERIENCE", 'experience'),
	        ("TECHNICAL SKILLS", 'technical_skills'),
	        ("PROJECTS", 'projects'),
	        ("CERTIFICATIONS", 'certifications'),
	    )
	    sections = {section_key: [] for _, section_key in heading_lookup}

	    active_section = None
	    for sentence in sent_tokenize(resume_text):
	        shouted = sentence.upper()
	        # Keep the current section when the sentence names no heading.
	        active_section = next(
	            (section_key for heading, section_key in heading_lookup if heading in shouted),
	            active_section,
	        )
	        if active_section:
	            sections[active_section].append(sentence.strip())

	    return sections

	# Save resume data to MongoDB
	def save_resume_to_mongodb(pdf_bytes, user_id):
	    """Save the resume text and sections to MongoDB.

	    Extracts the PDF text, splits it into sections, and inserts one document
	    of the form ``{'user_id': ..., 'resume': <sections>}`` into the
	    module-level ``collection``.

	    Args:
	        pdf_bytes: The raw bytes of the resume PDF.
	        user_id: Identifier stored with the resume.

	    Returns:
	        The id of the inserted document, or ``None`` if no text could be
	        extracted or the insert failed (the error is shown with
	        ``st.error``).
	    """
	    try:
	        resume_text = extract_text_from_pdf(pdf_bytes)
	        if not resume_text:
	            return None

	        resume_data = {
	            'user_id': user_id,
	            'resume': split_resume_into_sections(resume_text),
	        }
	        result = collection.insert_one(resume_data)
	        return result.inserted_id
	    except (ConnectionFailure, OperationFailure) as e:
	        # Connection problems can surface on the insert itself, so they are
	        # reported the same way as server-side operation failures.
	        st.error(f"Error saving data: {e}")
	        return None

	# Fetch resume data from MongoDB
	def fetch_resume_from_mongodb(user_id):
	    """Fetch resume data from MongoDB using the user ID.

	    Args:
	        user_id: Value matched against the ``user_id`` field of the
	            module-level ``collection``.

	    Returns:
	        The first matching document, or ``None`` if none matches or the
	        query failed (the error is shown with ``st.error``).
	    """
	    try:
	        return collection.find_one({"user_id": user_id})
	    except (ConnectionFailure, OperationFailure) as e:
	        # Connection problems can surface on the query itself, so they are
	        # reported the same way as server-side operation failures.
	        st.error(f"Error fetching data: {e}")
	        return None

	def _render_mongodb_page():
	    """Render the MongoDB page: store an uploaded resume, then look one up.

	    Step 1 shows the text extracted from an uploaded PDF and offers to save
	    it under a user id. Step 2 fetches and displays the stored document for
	    a user id.
	    """
	    st.title("Resume Extractor and MongoDB Storage")
	    st.write("Upload a PDF resume, extract text, and store it in MongoDB.")

	    # Step 1: upload a resume and store it.
	    st.header("Step 1: Upload and Store Resume")
	    uploaded_resume = st.file_uploader("Upload a PDF Resume", type="pdf")

	    if uploaded_resume:
	        pdf_bytes = uploaded_resume.read()
	        resume_text = extract_text_from_pdf(pdf_bytes)

	        if resume_text:
	            st.subheader("Extracted Text")
	            st.write(resume_text)

	            user_id = st.text_input("Enter User ID", "12345")

	            save_clicked = st.button("Save Resume to MongoDB")
	            if save_clicked:
	                with st.spinner("Saving..."):
	                    inserted_id = save_resume_to_mongodb(pdf_bytes, user_id)
	                if inserted_id:
	                    st.success(f"Resume saved! Document ID: {inserted_id}")

	    # Step 2: fetch stored resume data.
	    st.header("Step 2: Retrieve Resume Data")
	    user_id_to_fetch = st.text_input("Enter User ID to Fetch Data", "12345")

	    fetch_clicked = st.button("Fetch Resume")
	    if fetch_clicked:
	        with st.spinner("Fetching..."):
	            resume_data = fetch_resume_from_mongodb(user_id_to_fetch)

	        if not resume_data:
	            st.warning(f"No resume found for User ID: {user_id_to_fetch}")
	        else:
	            st.subheader(f"Resume Data for User ID: {user_id_to_fetch}")
	            # default=str stringifies values json cannot encode natively.
	            st.json(json.dumps(resume_data, default=str, indent=4))


	if page == "MongoDb":
	    _render_mongodb_page()