import os
import tempfile
import pickle
import streamlit as st
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss
def process_pdf(uploaded_files):
    all_documents = []
    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...✅✅✅")
    st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
    for uploaded_file in uploaded_files:
        # Write each upload to a temporary file so PyPDFLoader can read it from disk
        temp_file_path = os.path.join("/tmp", uploaded_file.name)
        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.read())
        loader = PyPDFLoader(temp_file_path)  # document loader
        doc = loader.load()  # load the pages of this PDF
        main_placeholder.text("Text Splitter...Started...✅✅✅")
        # Split the pages into overlapping chunks for embedding
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        final_documents = text_splitter.split_documents(doc)
        all_documents.extend(final_documents)
    if all_documents:
        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
        # ⬇ Embedding initialization happens here, once documents actually exist
        st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
        st.session_state.docs = all_documents
        # Save the FAISS vector store to disk
        faiss_index = st.session_state.vectors.index  # extract the raw FAISS index
        faiss.write_index(faiss_index, "faiss_index.bin")  # save the index to a binary file
        main_placeholder.text("Vector database created!...✅✅✅")
    else:
        st.error("No documents found after processing the uploaded files, or a PDF is corrupted / unsupported.")
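# --- Sketch (not part of the original file): minimal Streamlit entry point. ---
# The widget labels and button flow below are illustrative assumptions about
# how process_pdf is meant to be driven from the UI.
st.title("PDF Q&A - build the vector database")
files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)
if files and st.button("Process PDFs"):
    process_pdf(files)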