# NOTE(review): the lines that were here were Hugging Face Spaces page-scrape
# artifacts (commit hashes and viewer gutter numbers), not source code; they
# have been commented out so the module parses.
import os
import pickle
import tempfile

import faiss
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
def process_pdf(uploaded_file):
    """Load uploaded PDF files, split them into chunks, and build a FAISS vector store.

    Each uploaded file is written to the system temp directory (PyPDFLoader
    requires a real file path), loaded, and split into overlapping chunks.
    The combined chunks are embedded with ``st.session_state.embeddings``
    (assumed to be initialized by the caller — TODO confirm) and stored in
    ``st.session_state.vectors`` / ``st.session_state.docs``; the raw FAISS
    index is also persisted to ``faiss_index.bin`` for later reloading.

    Args:
        uploaded_file: iterable of Streamlit ``UploadedFile`` objects
            (e.g. the return value of ``st.file_uploader(accept_multiple_files=True)``).

    Side effects:
        Writes temp PDF files, writes ``faiss_index.bin``, mutates
        ``st.session_state`` and renders status text via Streamlit.
    """
    all_documents = []
    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...✅✅✅")
    # Report all uploads once, before processing (the original iterated the
    # shadowed per-file loop variable here, which is broken).
    st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
    # Splitter is loop-invariant — build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    for file in uploaded_file:
        # Persist the upload to disk because PyPDFLoader needs a file path;
        # tempfile.gettempdir() is portable where the hard-coded "/tmp" was not.
        temp_file_path = os.path.join(tempfile.gettempdir(), file.name)
        with open(temp_file_path, "wb") as f:
            f.write(file.read())
        loader = PyPDFLoader(temp_file_path)  # document loader
        doc = loader.load()
        main_placeholder.text("Text Splitter...Started...✅✅✅")
        final_documents = text_splitter.split_documents(doc)
        all_documents.extend(final_documents)
    if all_documents:
        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
        st.session_state.docs = all_documents
        # Save the FAISS index to disk so it can be reloaded without re-embedding.
        faiss_index = st.session_state.vectors.index
        faiss.write_index(faiss_index, "faiss_index.bin")
        main_placeholder.text("Vector database created!...✅✅✅")
    else:
        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
# def process_pdf(uploaded_files):
# all_documents = []
# main_placeholder = st.empty()
# main_placeholder.text("Data Loading...Started...β
β
β
")
# for uploaded_file in uploaded_files:
# temp_file_path = os.path.join("/tmp", uploaded_file.name)
# with open(temp_file_path, "wb") as f:
# f.write(uploaded_file.read())
# st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
# loader = PyPDFLoader(temp_file_path)
# doc = loader.load()
# main_placeholder.text("Text Splitter...Started...β
β
β
")
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# final_documents = text_splitter.split_documents(doc)
# all_documents.extend(final_documents)
# if all_documents:
# main_placeholder.text("Embedding Vector Started Building...β
β
β
")
# # β¬ Move embedding initialization here
# st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
# st.session_state.docs = all_documents
# faiss_index = st.session_state.vectors.index
# faiss.write_index(faiss_index, "faiss_index.bin")
# main_placeholder.text("Vector database created!...β
β
β
")
# else:
# st.error("No documents found or the PDF is corrupted.")
|