import tempfile

import streamlit as st
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss

# Legacy multi-file version: spooled each uploaded PDF to a temporary file,
# loaded and split it, then built one FAISS store over all the documents.
# def process_pdf(uploaded_files):
#     all_documents = []
#     st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
#     main_placeholder = st.empty()
#     main_placeholder.text("Data Loading...Started...✅✅✅")
#     # Spool each uploaded PDF to a temporary file so PyPDFLoader can read it from disk
#     for uploaded_file in uploaded_files:
#         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
#             temp_file.write(uploaded_file.read())  # Write the upload to the temporary file
#             temp_file_path = temp_file.name        # Get the temporary file path
#         loader = PyPDFLoader(temp_file_path)  # Load the PDF from the temporary file path
#         doc = loader.load()
#         main_placeholder.text("Text Splitter...Started...✅✅✅")
#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#         final_documents = text_splitter.split_documents(doc)
#         all_documents.extend(final_documents)
#     if all_documents:
#         main_placeholder.text("Embedding Vector Started Building...✅✅✅")
#         st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
#         st.session_state.docs = all_documents
#         faiss_index = st.session_state.vectors.index  # Extract the raw FAISS index
#         faiss.write_index(faiss_index, "faiss_index.bin")  # Save the index to a binary file
#         main_placeholder.text("Vector database created!...✅✅✅")
#     else:
#         st.error("No documents found after processing the uploaded files, or a PDF is corrupted/unsupported.")


def process_pdf(file_path):
    """Load a PDF from a file path, split it, and build a FAISS vector store."""
    # Gemini embedding model; expects GOOGLE_API_KEY to be set in the environment
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...✅✅✅")

    # Load the PDF from the given file path
    loader = PyPDFLoader(file_path)
    doc = loader.load()

    main_placeholder.text("Text Splitter...Started...✅✅✅")
    # ~1000-character chunks with 200-character overlap to preserve context across chunk boundaries
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = text_splitter.split_documents(doc)

    if final_documents:
        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
        st.session_state.vectors = FAISS.from_documents(final_documents, st.session_state.embeddings)
        st.session_state.docs = final_documents
        # Save the raw FAISS index to disk
        faiss_index = st.session_state.vectors.index
        faiss.write_index(faiss_index, "faiss_index.bin")
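        # Note: faiss.write_index persists only the raw vector index, not the
        # docstore that maps vectors back to page content, so "faiss_index.bin"
        # alone cannot be loaded back into a usable LangChain store. A minimal
        # sketch of a full round-trip (an assumption about intended reuse, not
        # part of this app) would use the wrapper's own persistence instead:
        #
        #     st.session_state.vectors.save_local("faiss_store")  # index + docstore
        #     vectors = FAISS.load_local(
        #         "faiss_store",
        #         st.session_state.embeddings,
        #         allow_dangerous_deserialization=True,  # required by recent LangChain versions
        #     )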
main_placeholder.text("Vector database created!...β
β
β
")
else:
st.error("No documents found or the PDF is corrupted.")