tjwrld committed
Commit 9e21a30 · verified · 1 Parent(s): c9df7b8

Update app.py

Files changed (1)
  1. app.py +58 -270
app.py CHANGED
@@ -1,203 +1,3 @@
- # import streamlit as st
- # import fitz  # PyMuPDF
- # import nltk
- # from nltk.tokenize import word_tokenize
- # import google.generativeai as genai
- # import faiss
- # import numpy as np
- # import os
-
- # nltk.download('punkt_tab')
- # nltk.download('punkt')
- # nltk.download('wordnet')
- # nltk.download('omw-1.4')
-
- # # # Ensure NLTK resources are downloaded
- # # # Set NLTK data path to a writable directory
- # # nltk_data_dir = "/tmp/nltk_data"
- # # os.environ["NLTK_DATA"] = nltk_data_dir
- # # nltk.data.path.append(nltk_data_dir)
-
- # # # Ensure NLTK resources are downloaded
- # # try:
- # #     # Check if punkt is already downloaded
- # #     if not os.path.exists(os.path.join(nltk_data_dir, "tokenizers/punkt")):
- # #         st.write("Downloading NLTK punkt data...")
- # #         nltk.download("punkt", download_dir=nltk_data_dir)
- # #     else:
- # #         st.write("NLTK punkt data already exists.")
- # # except Exception as e:
- # #     st.error(f"Error downloading NLTK data: {e}")
- # #     st.stop()
-
- # # Configure Gemini API (use environment variable or Streamlit secrets for API key)
-
- # # GEMINI_API_KEY = ""  # Replace with your actual API key
- # # genai.configure(api_key=GEMINI_API_KEY)
-
- # genai.configure(api_key=os.environ["AI_API_KEY"])
- # gemini_model = genai.GenerativeModel('gemini-1.5-flash')
-
- # # Function to extract text from the uploaded PDF using PyMuPDF (fitz)
- # def extract_text_from_pdf(pdf_file):
- #     try:
- #         doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
- #         text = ""
- #         for page_num in range(len(doc)):
- #             page = doc.load_page(page_num)
- #             text += page.get_text()
- #         return text
- #     except Exception as e:
- #         st.error(f"Error extracting text from PDF: {e}")
- #         return None
-
- # # Function to split text into overlapping chunks using NLTK tokenization
- # def split_text_into_chunks(text, chunk_size=500, overlap=100):
- #     try:
- #         words = word_tokenize(text)
- #         chunks = []
- #         for i in range(0, len(words), chunk_size - overlap):
- #             chunk = " ".join(words[i:i + chunk_size])
- #             chunks.append(chunk)
- #         return chunks
- #     except Exception as e:
- #         st.error(f"Error splitting text into chunks: {e}")
- #         return []
-
- # # Function to generate embeddings for a list of text chunks
- # def generate_embeddings(chunks, title="PDF Document"):
- #     embeddings = []
- #     for chunk in chunks:
- #         try:
- #             embedding = genai.embed_content(
- #                 model="models/embedding-001",
- #                 content=chunk,
- #                 task_type="retrieval_document",
- #                 title=title
- #             )
- #             embeddings.append(embedding["embedding"])
- #         except Exception as e:
- #             st.error(f"Error generating embedding for chunk: {e}")
- #     return embeddings
-
- # # Function to store embeddings in FAISS
- # def store_embeddings_in_faiss(embeddings):
- #     try:
- #         embeddings_array = np.array(embeddings).astype('float32')
- #         dimension = embeddings_array.shape[1]
- #         index = faiss.IndexFlatL2(dimension)
- #         index.add(embeddings_array)
- #         return index
- #     except Exception as e:
- #         st.error(f"Error storing embeddings in FAISS: {e}")
- #         return None
-
- # # Function to retrieve relevant chunks using FAISS
- # def retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3):
- #     try:
- #         query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
- #         distances, indices = index.search(query_embedding, top_k)
- #         relevant_chunks = [chunks[i] for i in indices[0]]
- #         return relevant_chunks
- #     except Exception as e:
- #         st.error(f"Error retrieving relevant chunks: {e}")
- #         return []
-
- # # Function to generate an answer using Gemini API
- # def generate_answer(query, context_chunks):
- #     try:
- #         context = "\n".join(context_chunks)
- #         prompt = f"""
- #         Context:
- #         {context}
-
- #         Question:
- #         {query}
-
- #         Answer the question based on the context provided above.
- #         """
- #         response = gemini_model.generate_content(prompt)
- #         return response.text
- #     except Exception as e:
- #         st.error(f"Error generating answer: {e}")
- #         return "Unable to generate an answer due to an error."
-
- # # Streamlit UI
- # with st.sidebar:
- #     st.title("Navigation")
- #     hide_st_style = '''
- #         <style>
- #         #MainMenu {visibility: hidden;}
- #         footer {visibility: hidden;}
- #         header {visibility: hidden;}
- #         </style>
- #     '''
- #     st.markdown(hide_st_style, unsafe_allow_html=True)
- #     page = st.radio("Options", ["Home", "Privacy Policy"], label_visibility="collapsed")
-
- # if page == "Home":
- #     st.title("Gemini RAG Application")
- #     st.markdown("Upload a PDF document and ask questions to get answers using Google's Gemini API.")
-
- #     pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
-
- #     if pdf_file is not None:
- #         with st.spinner("Extracting text..."):
- #             extracted_text = extract_text_from_pdf(pdf_file)
-
- #         if extracted_text:
- #             with st.spinner("Splitting text into overlapping chunks..."):
- #                 chunks = split_text_into_chunks(extracted_text, chunk_size=500, overlap=100)
-
- #             if chunks:
- #                 with st.status(f"Total chunks: {len(chunks)}"):
- #                     for i, chunk in enumerate(chunks):
- #                         st.subheader(f"Chunk {i + 1}")
- #                         st.text_area(f"Chunk {i + 1} Text", chunk, height=200, key=f"chunk_{i}")
-
- #                 with st.spinner("Generating embeddings..."):
- #                     embeddings = generate_embeddings(chunks)
-
- #                 if embeddings:
- #                     with st.spinner("Storing embeddings in FAISS..."):
- #                         index = store_embeddings_in_faiss(embeddings)
-
- #                     if index:
- #                         st.success("Embeddings have been successfully stored in the FAISS vector database.")
-
- #                         query = st.text_input("Enter your question:")
- #                         if query:
- #                             with st.spinner("Generating query embedding..."):
- #                                 query_embedding = genai.embed_content(
- #                                     model="models/embedding-001",
- #                                     content=query,
- #                                     task_type="retrieval_query"
- #                                 )["embedding"]
-
- #                             with st.spinner("Retrieving relevant chunks..."):
- #                                 relevant_chunks = retrieve_relevant_chunks(query_embedding, index, chunks, top_k=3)
-
- #                             if relevant_chunks:
- #                                 with st.status("### Relevant Context Chunks:"):
- #                                     for i, chunk in enumerate(relevant_chunks):
- #                                         st.subheader(f"Chunk {i + 1}")
- #                                         st.text_area(f"Relevant Chunk {i + 1} Text", chunk, height=200, key=f"relevant_chunk_{i}")
-
- #                                 with st.spinner("Generating answer..."):
- #                                     answer = generate_answer(query, relevant_chunks)
- #                                     st.write("### Answer:")
- #                                     st.write(answer)
- #                             else:
- #                                 st.warning("No relevant chunks found.")
- #                     else:
- #                         st.error("Failed to store embeddings in FAISS.")
- #                 else:
- #                     st.error("Failed to generate embeddings.")
- #             else:
- #                 st.error("No chunks generated from the text.")
- #         else:
- #             st.error("No text extracted. The document might be image-based or corrupted.")
-
  import streamlit as st
  import fitz  # PyMuPDF
  import nltk
@@ -380,27 +180,29 @@ if page == "Home":
 
  if page == "MongoDb":
      try:
-         client = MongoClient(os.environ["MONGO_API_KEY"])
+         client = MongoClient("mongodb+srv://gojochan31:simple1234@cluster0.b0msc.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
          db = client['resume_database']
          collection = db['resumes']
          st.success("Connected to MongoDB Atlas!")
      except ConnectionFailure:
-         st.error("Failed to connect to MongoDB Atlas. Please check your connection string.")
+         st.error("Failed to connect to MongoDB. Check your connection string.")
          st.stop()
 
-     # Function to extract text from the uploaded PDF
      def extract_text_from_pdf(pdf_bytes):
-         """Extract text from the PDF."""
+         """Extract text from a PDF file."""
          try:
-             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-             return ''.join(page.get_text() for page in doc)
+             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+             text = ""
+             for page in doc:
+                 text += page.get_text()
+             return text
          except Exception as e:
-             st.error(f"Error extracting text from PDF: {e}")
+             st.error(f"Error extracting text: {e}")
              return None
 
-     # Function to chunk the resume text into sections based on keywords
-     def chunk_resume_text(resume_text):
-         """Chunk the resume text into sections based on keywords."""
+     # Split resume text into sections
+     def split_resume_into_sections(resume_text):
+         """Split the resume text into sections like Education, Experience, etc."""
          sections = {
              'education': [],
              'experience': [],
@@ -410,8 +212,8 @@ if page == "MongoDb":
          }
 
          current_section = None
-         for sentence in sent_tokenize(resume_text):
-             sentence_upper = sentence.upper()
+         for sentence in sent_tokenize(resume_text):  # Split text into sentences
+             sentence_upper = sentence.upper()  # Convert to uppercase for easier matching
              if "EDUCATION" in sentence_upper:
                  current_section = 'education'
              elif "EXPERIENCE" in sentence_upper:
@@ -423,88 +225,74 @@ if page == "MongoDb":
              elif "CERTIFICATIONS" in sentence_upper:
                  current_section = 'certifications'
 
-             if current_section:
+             if current_section:  # Add the sentence to the appropriate section
                  sections[current_section].append(sentence.strip())
 
          return sections
 
-     # Function to store the extracted resume data into MongoDB
-     def store_resume_in_mongodb(pdf_bytes, user_id):
-         """Store the extracted and chunked resume data in MongoDB."""
+     # Save resume data to MongoDB
+     def save_resume_to_mongodb(pdf_bytes, user_id):
+         """Save the resume text and sections to MongoDB."""
          try:
              resume_text = extract_text_from_pdf(pdf_bytes)
              if not resume_text:
                  return None
-
-             chunked_resume = chunk_resume_text(resume_text)
+             resume_sections = split_resume_into_sections(resume_text)
 
+             # Prepare data to save
              resume_data = {
                  'user_id': user_id,
-                 'resume': chunked_resume
+                 'resume': resume_sections
              }
 
+             # Insert data into MongoDB
              result = collection.insert_one(resume_data)
-             return result.inserted_id
+             return result.inserted_id
          except OperationFailure as e:
-             st.error(f"Error storing data in MongoDB: {e}")
+             st.error(f"Error saving data: {e}")
              return None
 
-     # Function to fetch resume data from MongoDB
+     # Fetch resume data from MongoDB
      def fetch_resume_from_mongodb(user_id):
-         """Fetch resume data from MongoDB based on user_id."""
+         """Fetch resume data from MongoDB using the user ID."""
          try:
              resume_data = collection.find_one({"user_id": user_id})
              return resume_data
          except OperationFailure as e:
-             st.error(f"Error fetching data from MongoDB: {e}")
+             st.error(f"Error fetching data: {e}")
              return None
 
-     # Streamlit UI
      st.title("Resume Extractor and MongoDB Storage")
-     st.write("Upload a PDF to extract text and store it in MongoDB.")
-
-     # Step 1: Upload PDF and store it in MongoDB
-     with st.expander("Step 1: Upload and Store Resume"):
-         pdf_file = st.file_uploader("Upload Resume PDF", type="pdf")
-
-         if pdf_file:
-             # Extract text and display the tokenized sentences
-             pdf_bytes = pdf_file.read()
-             resume_text = extract_text_from_pdf(pdf_bytes)
-
-             if resume_text:
-                 tokenized_sentences = sent_tokenize(resume_text)
-
-                 st.subheader("Tokenized Sentences")
-                 for idx, sentence in enumerate(tokenized_sentences):
-                     st.write(f"{idx + 1}. {sentence}")
-
-                 # User ID input
-                 user_id = st.text_input("Enter User ID", "12345")
-
-                 if st.button("Store Resume in MongoDB"):
-                     with st.spinner("Storing resume in MongoDB..."):
-                         inserted_id = store_resume_in_mongodb(pdf_bytes, user_id)
-                         if inserted_id:
-                             st.success(f"Resume stored successfully with ID: {inserted_id}")
+     st.write("Upload a PDF resume, extract text, and store it in MongoDB.")
+     st.header("Step 1: Upload and Store Resume")
+     pdf_file = st.file_uploader("Upload a PDF Resume", type="pdf")
 
-     # Step 2: Fetch resume data from MongoDB
-     with st.expander("Step 2: Retrieve Resume Data"):
-         st.write("Enter the User ID to fetch the resume data from MongoDB.")
+     if pdf_file:
+         pdf_bytes = pdf_file.read()
+         resume_text = extract_text_from_pdf(pdf_bytes)
 
-         # User input for user_id
-         user_id_to_fetch = st.text_input("Enter User ID to fetch data", "12345")
-
-         if st.button("Fetch Resume Data"):
-             with st.spinner("Fetching resume data..."):
-                 resume_data = fetch_resume_from_mongodb(user_id_to_fetch)
-
-                 if resume_data:
-                     # Display resume data in JSON format
-                     st.subheader(f"Resume Data for User ID: {user_id_to_fetch}")
-
-                     # Convert MongoDB result to JSON string and display it
-                     json_data = json.dumps(resume_data, default=str, indent=4)  # default=str to handle ObjectId
-                     st.json(json_data)  # Display JSON in a readable format
-                 else:
-                     st.warning(f"No resume found for User ID: {user_id_to_fetch}")
+         if resume_text:
+             st.subheader("Extracted Text")
+             st.write(resume_text)
+
+             user_id = st.text_input("Enter User ID", "12345")
+
+             if st.button("Save Resume to MongoDB"):
+                 with st.spinner("Saving..."):
+                     inserted_id = save_resume_to_mongodb(pdf_bytes, user_id)
+                     if inserted_id:
+                         st.success(f"Resume saved! Document ID: {inserted_id}")
+
+     # Fetch resume data from MongoDB
+     st.header("Step 2: Retrieve Resume Data")
+     user_id_to_fetch = st.text_input("Enter User ID to Fetch Data", "12345")
+
+     if st.button("Fetch Resume"):
+         with st.spinner("Fetching..."):
+             resume_data = fetch_resume_from_mongodb(user_id_to_fetch)
+
+             if resume_data:
+                 st.subheader(f"Resume Data for User ID: {user_id_to_fetch}")
+                 st.json(json.dumps(resume_data, default=str, indent=4))  # Show data as JSON
+             else:
+                 st.warning(f"No resume found for User ID: {user_id_to_fetch}")
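
This revision inlines the Atlas connection string that the previous one read from `os.environ["MONGO_API_KEY"]`. A minimal sketch of keeping the URI out of the source, assuming a `MONGO_API_KEY` entry in the environment or in Streamlit secrets (the secrets key name is an assumption, not part of this commit):

```python
import os
import streamlit as st
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

try:
    # st.secrets behaves like a dict; fall back to the environment locally.
    mongo_uri = st.secrets.get("MONGO_API_KEY", os.environ.get("MONGO_API_KEY"))
    client = MongoClient(mongo_uri)
    client.admin.command("ping")  # force a round trip so a bad URI fails here
except ConnectionFailure:
    st.error("Failed to connect to MongoDB. Check your connection string.")
    st.stop()
```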
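`extract_text_from_pdf` only needs raw bytes, so the PyMuPDF path can be checked outside Streamlit. A short sketch using a local file in place of `st.file_uploader` (the path is a placeholder):

```python
import fitz  # PyMuPDF

with open("resume.pdf", "rb") as f:  # placeholder path
    pdf_bytes = f.read()

# Same call the app makes on the uploaded bytes.
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
text = "".join(page.get_text() for page in doc)
print(text[:500])
```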
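`split_resume_into_sections` is plain keyword routing over NLTK sentence tokens, so it can be tested without a PDF at all. A self-contained sketch covering only the branches visible in this hunk (the diff elides the middle keywords), with a made-up two-sentence resume:

```python
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)      # sentence tokenizer model
nltk.download("punkt_tab", quiet=True)  # needed by newer NLTK releases

def split_resume_into_sections(resume_text):
    # Only the sections visible in the hunk; the full file defines more.
    sections = {'education': [], 'experience': [], 'certifications': []}
    current_section = None
    for sentence in sent_tokenize(resume_text):
        sentence_upper = sentence.upper()
        if "EDUCATION" in sentence_upper:
            current_section = 'education'
        elif "EXPERIENCE" in sentence_upper:
            current_section = 'experience'
        elif "CERTIFICATIONS" in sentence_upper:
            current_section = 'certifications'
        if current_section:  # everything after a heading lands in that section
            sections[current_section].append(sentence.strip())
    return sections

sample = ("Education: B.Sc. in Computer Science, 2020. "
          "Experience: built Streamlit apps backed by MongoDB Atlas.")
print(split_resume_into_sections(sample))
```

Note that sentences seen before the first recognized heading are dropped, and each heading match reroutes all following sentences until the next match.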
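The save/fetch round trip is two driver calls, `insert_one` and `find_one` keyed on `user_id`, plus `json.dumps(..., default=str)` to cope with the non-serializable `ObjectId` in `_id`. A sketch against an in-memory stand-in (mongomock is an assumption here; a real Atlas client behaves the same for these calls):

```python
import json
import mongomock  # pip install mongomock; swap in pymongo.MongoClient for Atlas

client = mongomock.MongoClient()
collection = client["resume_database"]["resumes"]

resume_data = {"user_id": "12345", "resume": {"education": ["B.Sc., 2020"]}}
inserted_id = collection.insert_one(resume_data).inserted_id
print(f"Resume saved! Document ID: {inserted_id}")

fetched = collection.find_one({"user_id": "12345"})
print(json.dumps(fetched, default=str, indent=4))  # default=str handles ObjectId
```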