import streamlit as st
import faiss
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS

def process_pdf_from_path(file_path):
    """
    Processes a PDF from a given path by:
    - Loading the PDF
    - Splitting it into manageable chunks
    - Creating embeddings with Gemini
    - Saving the FAISS vector index to disk

    Parameters:
        file_path (str): Path to the uploaded PDF file
    """
    all_documents = []
    try:
        # Initialize the Gemini embeddings model
        st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        main_placeholder = st.empty()
        main_placeholder.text("Loading PDF and preparing text...")

        # Load the PDF document
        loader = PyPDFLoader(file_path)
        documents = loader.load()

        # Split the document into smaller, overlapping chunks
        main_placeholder.text("Splitting text into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        final_documents = text_splitter.split_documents(documents)
        all_documents.extend(final_documents)

        if all_documents:
            main_placeholder.text("Creating vector embeddings...")
            # Build the FAISS vector store from the chunked documents
            st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
            st.session_state.docs = all_documents

            # Persist the raw FAISS index to disk
            faiss_index = st.session_state.vectors.index
            faiss.write_index(faiss_index, "/tmp/faiss_index.bin")
            main_placeholder.text("Vector database created successfully!")
        else:
            st.error("No valid documents found in the uploaded PDF.")
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")