DocumentInteractor / src /PDFprocess_sample.py
Uzaiir's picture
Update src/PDFprocess_sample.py
04c1b26 verified
# import tempfile
# import streamlit as st
# import pickle
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_community.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# import faiss
# import os
# def process_pdf(uploaded_file):
# all_documents = []
# # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# main_placeholder = st.empty()
# # Creating a temporary file to store the uploaded PDF's
# main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
# for uploaded_file in uploaded_file:
# with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
# temp_file.write(uploaded_file) ## write file to temporary
# temp_file_path = temp_file.name # Get the temporary file path
# # temp_file_path = os.path.join("/tmp", uploaded_file.name)
# # with open(temp_file_path, "wb") as f:
# # f.write(uploaded_file.read())
# # st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
# # Load the PDF's from the temporary file path
# loader = PyPDFLoader(temp_file_path) # Document loader
# doc= loader.load() # load Document
# main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
# #final_documents = text_splitter.split_documents(doc)# splitting
# final_documents = text_splitter.split_documents(doc)
# all_documents.extend(final_documents)
# if all_documents:
# main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
# st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
# st.session_state.docs = all_documents
# # Save FAISS vector store to disk
# faiss_index = st.session_state.vectors.index # Extract FAISS index
# faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
# main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
# else:
# st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
# # def process_pdf(uploaded_files):
# # all_documents = []
# # main_placeholder = st.empty()
# # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
# # for uploaded_file in uploaded_files:
# # temp_file_path = os.path.join("/tmp", uploaded_file.name)
# # with open(temp_file_path, "wb") as f:
# # f.write(uploaded_file.read())
# # st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
# # loader = PyPDFLoader(temp_file_path)
# # doc = loader.load()
# # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
# # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# # final_documents = text_splitter.split_documents(doc)
# # all_documents.extend(final_documents)
# # if all_documents:
# # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
# # # ⏬ Move embedding initialization here
# # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# # st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
# # st.session_state.docs = all_documents
# # faiss_index = st.session_state.vectors.index
# # faiss.write_index(faiss_index, "faiss_index.bin")
# # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
# # else:
# # st.error("No documents found or the PDF is corrupted.")
import tempfile
import streamlit as st
import pickle
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss
import os
def process_pdf(uploaded_file):
    """Build a FAISS vector store from the uploaded PDF files.

    Every upload is written to a temporary ``.pdf`` file (``PyPDFLoader`` is
    given a filesystem path), loaded, and split into overlapping text chunks.
    The chunks from all uploads are then embedded into a single FAISS store.

    Args:
        uploaded_file: Iterable of file-like objects whose ``read()`` returns
            the PDF bytes. The singular name is kept for backward
            compatibility with existing callers. ``None`` is treated like an
            empty iterable.

    Returns:
        None. Results are published through ``st.session_state``:
        ``embeddings`` (the embedding model), ``vectors`` (the FAISS store)
        and ``docs`` (the list of chunks). The raw FAISS index is also written
        to ``/tmp/faiss_index.bin``. When no chunks are produced, an error is
        shown with ``st.error`` and ``vectors``/``docs`` are left untouched.
    """
    all_documents = []
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...✅✅✅")

    # The splitter is stateless configuration; build it once, not per file.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # A distinct loop name avoids shadowing the ``uploaded_file`` parameter.
    for pdf_upload in uploaded_file or ():
        pages = _load_pdf_pages(pdf_upload)
        main_placeholder.text("Text Splitter...Started...✅✅✅")
        all_documents.extend(text_splitter.split_documents(pages))

    if not all_documents:
        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
        return

    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
    st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
    st.session_state.docs = all_documents

    # Persist only the raw FAISS index (not the docstore) to a writable path.
    output_path = "/tmp/faiss_index.bin"
    faiss.write_index(st.session_state.vectors.index, output_path)
    main_placeholder.text("Vector database created!...✅✅✅")


def _load_pdf_pages(pdf_upload):
    """Spool one upload to a temporary PDF, load it, and delete the file."""
    # delete=False so the file can be closed (flushed) before the loader
    # reopens it by path; we therefore remove it ourselves in ``finally``.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with temp_file:
            temp_file.write(pdf_upload.read())
        # ``load()`` is called before the ``finally`` cleanup runs, so the
        # file still exists on disk for the whole load.
        return PyPDFLoader(temp_file.name).load()
    finally:
        os.remove(temp_file.name)