Spaces:

tjwrld
/

Talk

Sleeping

App Files Files Community

tjwrld committed on Mar 11, 2025

Commit

168f067

verified ·

1 Parent(s): 9e0a359

Update app.py

Browse files

Files changed (1) hide show

app.py +337 -26

app.py CHANGED Viewed

@@ -1,3 +1,203 @@
 import streamlit as st
 import fitz  # PyMuPDF
 import nltk
@@ -5,6 +205,10 @@ from nltk.tokenize import word_tokenize
 import google.generativeai as genai
 import faiss
 import numpy as np
 import os
 nltk.download('punkt_tab')
@@ -12,28 +216,6 @@ nltk.download('punkt')
 nltk.download('wordnet')
 nltk.download('omw-1.4')
-# # Ensure NLTK resources are downloaded
-# # Set NLTK data path to a writable directory
-# nltk_data_dir = "/tmp/nltk_data"
-# os.environ["NLTK_DATA"] = nltk_data_dir
-# nltk.data.path.append(nltk_data_dir)
-# # Ensure NLTK resources are downloaded
-# try:
-#     # Check if punkt is already downloaded
-#     if not os.path.exists(os.path.join(nltk_data_dir, "tokenizers/punkt")):
-#         st.write("Downloading NLTK punkt data...")
-#         nltk.download("punkt", download_dir=nltk_data_dir)
-#     else:
-#         st.write("NLTK punkt data already exists.")
-# except Exception as e:
-#     st.error(f"Error downloading NLTK data: {e}")
-#     st.stop()
-# Configure Gemini API (use environment variable or Streamlit secrets for API key)
-# GEMINI_API_KEY = ""  # Replace with your actual API key
-# genai.configure(api_key=GEMINI_API_KEY)
 genai.configure(api_key=os.environ["AI_API_KEY"])
 gemini_model = genai.GenerativeModel('gemini-1.5-flash')
@@ -110,10 +292,8 @@ def generate_answer(query, context_chunks):
         prompt = f"""
         Context:
         {context}
         Question:
         {query}
         Answer the question based on the context provided above.
         """
         response = gemini_model.generate_content(prompt)
@@ -133,7 +313,7 @@ with st.sidebar:
         </style>
     '''
     st.markdown(hide_st_style, unsafe_allow_html=True)
-    page = st.radio("Options", ["Home", "Privacy Policy"], label_visibility="collapsed")
 if page == "Home":
     st.title("Gemini RAG Application")
@@ -196,4 +376,135 @@ if page == "Home":
             else:
                 st.error("No chunks generated from the text.")
         else:
-            st.error("No text extracted. The document might be image-based or corrupted.")

+# import streamlit as st
+# import fitz  # PyMuPDF
+# import nltk
+# from nltk.tokenize import word_tokenize
+# import google.generativeai as genai
+# import faiss
+# import numpy as np
+# import os
+# nltk.download('punkt_tab')
+# nltk.download('punkt')
+# nltk.download('wordnet')
+# nltk.download('omw-1.4')
+# # # Ensure NLTK resources are downloaded
+# # # Set NLTK data path to a writable directory
+# # nltk_data_dir = "/tmp/nltk_data"
+# # os.environ["NLTK_DATA"] = nltk_data_dir
+# # nltk.data.path.append(nltk_data_dir)
+# # # Ensure NLTK resources are downloaded
+# # try:
+# #     # Check if punkt is already downloaded
+# #     if not os.path.exists(os.path.join(nltk_data_dir, "tokenizers/punkt")):
+# #         st.write("Downloading NLTK punkt data...")
+# #         nltk.download("punkt", download_dir=nltk_data_dir)
+# #     else:
+# #         st.write("NLTK punkt data already exists.")
+# # except Exception as e:
+# #     st.error(f"Error downloading NLTK data: {e}")
+# #     st.stop()
+# # Configure Gemini API (use environment variable or Streamlit secrets for API key)
+# # GEMINI_API_KEY = ""  # Replace with your actual API key
+# # genai.configure(api_key=GEMINI_API_KEY)
+# genai.configure(api_key=os.environ["AI_API_KEY"])
+# gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+# # Function to extract text from the uploaded PDF using PyMuPDF (fitz)
+# def extract_text_from_pdf(pdf_file):
+#     try:
+#         doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+#         text = ""
+#         for page_num in range(len(doc)):
+#             page = doc.load_page(page_num)
+#             text += page.get_text()
+#         return text
+#     except Exception as e:
+#         st.error(f"Error extracting text from PDF: {e}")
+#         return None
+# # Function to split text into overlapping chunks using NLTK tokenization
+# def split_text_into_chunks(text, chunk_size=500, overlap=100):
+#     try:
+#         words = word_tokenize(text)
+#         chunks = []
+#         for i in range(0, len(words), chunk_size - overlap):
+#             chunk = " ".join(words[i:i + chunk_size])
+#             chunks.append(chunk)
+#         return chunks
+#     except Exception as e:
+#         st.error(f"Error splitting text into chunks: {e}")
+#         return []
+# # Function to generate embeddings for a list of text chunks
+# def generate_embeddings(chunks, title="PDF Document"):
+#     embeddings = []
+#     for chunk in chunks:
+#         try:
+#             embedding = genai.embed_content(
+#                 model="models/embedding-001",
+#                 content=chunk,
+#                 task_type="retrieval_document",
+#                 title=title
+#             )
+#             embeddings.append(embedding["embedding"])
+#         except Exception as e:
+#             st.error(f"Error generating embedding for chunk: {e}")
+#     return embeddings
+# # Function to store embeddings in FAISS
+# def store_embeddings_in_faiss(embeddings):
+#     try:
+#         embeddings_array = np.array(embeddings).astype('float32')
+#         dimension = embeddings_array.shape[1]
+#         index = faiss.IndexFlatL2(dimension)
+#         index.add(embeddings_array)
+#         return index
+#     except Exception as e:
+#         st.error(f"Error storing embeddings in FAISS: {e}")
+#         return None
+# # Function to retrieve relevant chunks using FAISS
+# def retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3):
+#     try:
+#         query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
+#         distances, indices = index.search(query_embedding, top_k)
+#         relevant_chunks = [chunks[i] for i in indices[0]]
+#         return relevant_chunks
+#     except Exception as e:
+#         st.error(f"Error retrieving relevant chunks: {e}")
+#         return []
+# # Function to generate an answer using Gemini API
+# def generate_answer(query, context_chunks):
+#     try:
+#         context = "\n".join(context_chunks)
+#         prompt = f"""
+#         Context:
+#         {context}
+#         Question:
+#         {query}
+#         Answer the question based on the context provided above.
+#         """
+#         response = gemini_model.generate_content(prompt)
+#         return response.text
+#     except Exception as e:
+#         st.error(f"Error generating answer: {e}")
+#         return "Unable to generate an answer due to an error."
+# # Streamlit UI
+# with st.sidebar:
+#     st.title("Navigation")
+#     hide_st_style = '''
+#         <style>
+#         MainMenu {visibility: hidden;}
+#         footer {visibility: hidden;}
+#         header {visibility: hidden;}
+#         </style>
+#     '''
+#     st.markdown(hide_st_style, unsafe_allow_html=True)
+#     page = st.radio("Options", ["Home", "Privacy Policy"], label_visibility="collapsed")
+# if page == "Home":
+#     st.title("Gemini RAG Application")
+#     st.markdown("Upload a PDF document and ask questions to get answers using Google's Gemini API.")
+#     pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
+#     if pdf_file is not None:
+#         with st.spinner("Extracting text..."):
+#             extracted_text = extract_text_from_pdf(pdf_file)
+#         if extracted_text:
+#             with st.spinner("Splitting text into overlapping chunks..."):
+#                 chunks = split_text_into_chunks(extracted_text, chunk_size=500, overlap=100)
+#             if chunks:
+#                 with st.status(f"Total chunks: {len(chunks)}"):
+#                     for i, chunk in enumerate(chunks):
+#                         st.subheader(f"Chunk {i + 1}")
+#                         st.text_area(f"Chunk {i + 1} Text", chunk, height=200, key=f"chunk_{i}")
+#                 with st.spinner("Generating embeddings..."):
+#                     embeddings = generate_embeddings(chunks)
+#                 if embeddings:
+#                     with st.spinner("Storing embeddings in FAISS..."):
+#                         index = store_embeddings_in_faiss(embeddings)
+#                     if index:
+#                         st.success("Embeddings have been successfully stored in the FAISS vector database.")
+#                         query = st.text_input("Enter your question:")
+#                         if query:
+#                             with st.spinner("Generating query embedding..."):
+#                                 query_embedding = genai.embed_content(
+#                                     model="models/embedding-001",
+#                                     content=query,
+#                                     task_type="retrieval_query"
+#                                 )["embedding"]
+#                             with st.spinner("Retrieving relevant chunks..."):
+#                                 relevant_chunks = retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3)
+#                             if relevant_chunks:
+#                                 with st.status("### Relevant Context Chunks:"):
+#                                     for i, chunk in enumerate(relevant_chunks):
+#                                         st.subheader(f"Chunk {i + 1}")
+#                                         st.text_area(f"Relevant Chunk {i + 1} Text", chunk, height=200, key=f"relevant_chunk_{i}")
+#                                 with st.spinner("Generating answer..."):
+#                                     answer = generate_answer(query, relevant_chunks)
+#                                     st.write("### Answer:")
+#                                     st.write(answer)
+#                             else:
+#                                 st.warning("No relevant chunks found.")
+#                     else:
+#                         st.error("Failed to store embeddings in FAISS.")
+#                 else:
+#                     st.error("Failed to generate embeddings.")
+#             else:
+#                 st.error("No chunks generated from the text.")
+#         else:
+#             st.error("No text extracted. The document might be image-based or corrupted.")
 import streamlit as st
 import fitz  # PyMuPDF
 import nltk
 import google.generativeai as genai
 import faiss
 import numpy as np
+from pymongo import MongoClient
+from nltk.tokenize import sent_tokenize
+import json
+from pymongo.errors import ConnectionFailure, OperationFailure
 import os
 nltk.download('punkt_tab')
 nltk.download('wordnet')
 nltk.download('omw-1.4')
 genai.configure(api_key=os.environ["AI_API_KEY"])
 gemini_model = genai.GenerativeModel('gemini-1.5-flash')
         prompt = f"""
         Context:
         {context}
         Question:
         {query}
         Answer the question based on the context provided above.
         """
         response = gemini_model.generate_content(prompt)
         </style>
     '''
     st.markdown(hide_st_style, unsafe_allow_html=True)
+    page = st.radio("Options", ["Home","MongoDb", "Privacy Policy"], label_visibility="collapsed")
 if page == "Home":
     st.title("Gemini RAG Application")
             else:
                 st.error("No chunks generated from the text.")
         else:
+            st.error("No text extracted. The document might be image-based or corrupted.")
+if page == "MongoDb":
+    # Connect to MongoDB Atlas. MongoClient() connects lazily and does not
+    # raise for an unreachable server at construction time, so a cheap "ping"
+    # command is issued to verify the connection before reporting success.
+    try:
+        client = MongoClient(os.environ["MONGO_API_KEY"])
+        client.admin.command("ping")
+        db = client['resume_database']
+        collection = db['resumes']
+        st.success("Connected to MongoDB Atlas!")
+    except KeyError:
+        # The connection string is read from the MONGO_API_KEY environment variable.
+        st.error("MONGO_API_KEY environment variable is not set.")
+        st.stop()
+    except (ConnectionFailure, OperationFailure):
+        # ConnectionFailure: the server could not be reached.
+        # OperationFailure: the server rejected the command (e.g. bad credentials).
+        st.error("Failed to connect to MongoDB Atlas. Please check your connection string.")
+        st.stop()
+    # Function to extract text from the uploaded PDF
+    def extract_text_from_pdf(pdf_bytes):
+        """Extract the text of every page of a PDF.
+
+        Args:
+            pdf_bytes: Raw bytes of the PDF, handed to ``fitz.open`` as a stream.
+
+        Returns:
+            The concatenated text of all pages, or ``None`` if extraction
+            raised (the error is reported to the page with ``st.error``).
+        """
+        try:
+            # The context manager closes the document once the text is read,
+            # including when reading a page raises.
+            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+                return ''.join(page.get_text() for page in doc)
+        except Exception as e:
+            st.error(f"Error extracting text from PDF: {e}")
+            return None
+    # Function to chunk the resume text into sections based on keywords
+    def chunk_resume_text(resume_text):
+        """Group the sentences of a resume under section names.
+
+        The text is split with ``sent_tokenize``. A sentence whose upper-cased
+        form contains a section keyword switches the current section; that
+        sentence and all following ones are appended (stripped) to the current
+        section until another keyword appears. Sentences seen before any
+        keyword are not stored.
+
+        Args:
+            resume_text: Full resume text.
+
+        Returns:
+            Dict mapping 'education', 'experience', 'technical_skills',
+            'projects' and 'certifications' to lists of sentences.
+        """
+        # Checked in this order; the first keyword found in a sentence wins.
+        heading_keywords = (
+            ("EDUCATION", 'education'),
+            ("EXPERIENCE", 'experience'),
+            ("TECHNICAL SKILLS", 'technical_skills'),
+            ("PROJECTS", 'projects'),
+            ("CERTIFICATIONS", 'certifications'),
+        )
+        sections = {name: [] for _, name in heading_keywords}
+        active = None
+        for sentence in sent_tokenize(resume_text):
+            shouted = sentence.upper()
+            # Keep the previous section when the sentence names no heading.
+            active = next(
+                (name for keyword, name in heading_keywords if keyword in shouted),
+                active,
+            )
+            if active:
+                sections[active].append(sentence.strip())
+        return sections
+    # Function to store the extracted resume data into MongoDB
+    def store_resume_in_mongodb(pdf_bytes, user_id):
+        """Extract, chunk and insert a resume into the ``collection``.
+
+        Args:
+            pdf_bytes: Raw bytes of the resume PDF.
+            user_id: Value stored under the document's ``user_id`` key.
+
+        Returns:
+            The ``inserted_id`` of the new document, or ``None`` when no text
+            could be extracted or the insert failed (the failure is reported
+            with ``st.error``).
+        """
+        resume_text = extract_text_from_pdf(pdf_bytes)
+        if not resume_text:
+            return None
+        resume_data = {
+            'user_id': user_id,
+            'resume': chunk_resume_text(resume_text)
+        }
+        # Only the database call is guarded. MongoClient connects lazily, so
+        # an unreachable server surfaces here as ConnectionFailure, which is
+        # handled alongside OperationFailure.
+        try:
+            result = collection.insert_one(resume_data)
+        except (ConnectionFailure, OperationFailure) as e:
+            st.error(f"Error storing data in MongoDB: {e}")
+            return None
+        return result.inserted_id
+    # Function to fetch resume data from MongoDB
+    def fetch_resume_from_mongodb(user_id):
+        """Fetch one resume document from ``collection`` by ``user_id``.
+
+        Args:
+            user_id: Value matched against the document's ``user_id`` key.
+
+        Returns:
+            The result of ``collection.find_one`` for that ``user_id``, or
+            ``None`` if the query failed (the failure is reported with
+            ``st.error``).
+        """
+        # MongoClient connects lazily, so an unreachable server surfaces here
+        # as ConnectionFailure; it is handled alongside OperationFailure.
+        try:
+            return collection.find_one({"user_id": user_id})
+        except (ConnectionFailure, OperationFailure) as e:
+            st.error(f"Error fetching data from MongoDB: {e}")
+            return None
+    # Streamlit UI for the MongoDb page: one expander to upload and store a
+    # resume, and one to look a stored resume up by user ID.
+    st.title("Resume Extractor and MongoDB Storage")
+    st.write("Upload a PDF to extract text and store it in MongoDB.")
+    # Step 1: Upload PDF and store it in MongoDB
+    with st.expander("Step 1: Upload and Store Resume"):
+        pdf_file = st.file_uploader("Upload Resume PDF", type="pdf")
+        if pdf_file:
+            # Read the upload into bytes; the same bytes feed the preview below
+            # and are passed to store_resume_in_mongodb.
+            pdf_bytes = pdf_file.read()
+            resume_text = extract_text_from_pdf(pdf_bytes)
+            if resume_text:
+                # Preview: list every sentence from sent_tokenize, numbered from 1.
+                tokenized_sentences = sent_tokenize(resume_text)
+                st.subheader("Tokenized Sentences")
+                for idx, sentence in enumerate(tokenized_sentences):
+                    st.write(f"{idx + 1}. {sentence}")
+                # User ID stored with the resume (input is pre-filled with "12345")
+                user_id = st.text_input("Enter User ID", "12345")
+                if st.button("Store Resume in MongoDB"):
+                    with st.spinner("Storing resume in MongoDB..."):
+                        # store_resume_in_mongodb extracts the text from pdf_bytes again itself.
+                        inserted_id = store_resume_in_mongodb(pdf_bytes, user_id)
+                        # A falsy result means nothing was stored; no message is shown here for that case.
+                        if inserted_id:
+                            st.success(f"Resume stored successfully with ID: {inserted_id}")
+    # Step 2: Fetch resume data from MongoDB
+    with st.expander("Step 2: Retrieve Resume Data"):
+        st.write("Enter the User ID to fetch the resume data from MongoDB.")
+        # User ID to look up (input is pre-filled with "12345")
+        user_id_to_fetch = st.text_input("Enter User ID to fetch data", "12345")
+        if st.button("Fetch Resume Data"):
+            with st.spinner("Fetching resume data..."):
+                resume_data = fetch_resume_from_mongodb(user_id_to_fetch)
+                if resume_data:
+                    # Display resume data in JSON format
+                    st.subheader(f"Resume Data for User ID: {user_id_to_fetch}")
+                    # Serialize the fetched document to an indented JSON string before display
+                    json_data = json.dumps(resume_data, default=str, indent=4)  # default=str to handle ObjectId
+                    st.json(json_data)  # Display JSON in a readable format
+                else:
+                    st.warning(f"No resume found for User ID: {user_id_to_fetch}")