Spaces:
Sleeping
Sleeping
File size: 6,494 Bytes
9acac0b 9f33131 37567cd 9f33131 1359360 9f33131 7eceee8 9f33131 7eceee8 9acac0b 9f33131 7eceee8 04c37c4 9acac0b 04c37c4 9f33131 7eceee8 9f33131 7eceee8 9f33131 7eceee8 9f33131 7eceee8 04c1b26 7eceee8 9f33131 7eceee8 9f33131 cb666fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# import tempfile
# import streamlit as st
# import pickle
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_community.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# import faiss
# import os
# def process_pdf(uploaded_file):
# all_documents = []
# # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# main_placeholder = st.empty()
# # Creating a temporary file to store the uploaded PDF's
# main_placeholder.text("Data Loading...Started...β
β
β
")
# for uploaded_file in uploaded_file:
# with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
# temp_file.write(uploaded_file) ## write file to temporary
# temp_file_path = temp_file.name # Get the temporary file path
# # temp_file_path = os.path.join("/tmp", uploaded_file.name)
# # with open(temp_file_path, "wb") as f:
# # f.write(uploaded_file.read())
# # st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
# # Load the PDF's from the temporary file path
# loader = PyPDFLoader(temp_file_path) # Document loader
# doc= loader.load() # load Document
# main_placeholder.text("Text Splitter...Started...β
β
β
")
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
# #final_documents = text_splitter.split_documents(doc)# splitting
# final_documents = text_splitter.split_documents(doc)
# all_documents.extend(final_documents)
# if all_documents:
# main_placeholder.text("Embedding Vector Started Building...β
β
β
")
# st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
# st.session_state.docs = all_documents
# # Save FAISS vector store to disk
# faiss_index = st.session_state.vectors.index # Extract FAISS index
# faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
# main_placeholder.text("Vector database created!...β
β
β
")
# else:
# st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
# # def process_pdf(uploaded_files):
# # all_documents = []
# # main_placeholder = st.empty()
# # main_placeholder.text("Data Loading...Started...β
β
β
")
# # for uploaded_file in uploaded_files:
# # temp_file_path = os.path.join("/tmp", uploaded_file.name)
# # with open(temp_file_path, "wb") as f:
# # f.write(uploaded_file.read())
# # st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
# # loader = PyPDFLoader(temp_file_path)
# # doc = loader.load()
# # main_placeholder.text("Text Splitter...Started...β
β
β
")
# # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# # final_documents = text_splitter.split_documents(doc)
# # all_documents.extend(final_documents)
# # if all_documents:
# # main_placeholder.text("Embedding Vector Started Building...β
β
β
")
# # # β¬ Move embedding initialization here
# # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# # st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
# # st.session_state.docs = all_documents
# # faiss_index = st.session_state.vectors.index
# # faiss.write_index(faiss_index, "faiss_index.bin")
# # main_placeholder.text("Vector database created!...β
β
β
")
# # else:
# # st.error("No documents found or the PDF is corrupted.")
import tempfile
import streamlit as st
import pickle
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss
import os
def process_pdf(uploaded_file):
    """Build a FAISS vector store from uploaded PDF files.

    Each upload is written to a temporary ``.pdf`` file (``PyPDFLoader``
    needs a filesystem path), loaded, split into 1000-char chunks with
    200-char overlap, embedded with Google Generative AI embeddings, and
    indexed with FAISS.

    Parameters
    ----------
    uploaded_file : iterable
        Streamlit ``UploadedFile`` objects for the PDFs to ingest.
        (Presumably file-like objects exposing ``.read()`` — confirm
        against the Streamlit uploader in the caller.)

    Side effects
    ------------
    Sets ``st.session_state.embeddings``, ``st.session_state.vectors`` and
    ``st.session_state.docs``; writes the raw FAISS index to
    ``/tmp/faiss_index.bin``; renders progress / error messages in the UI.
    """
    all_documents = []
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...")

    # BUG FIX: the loop variable previously shadowed the ``uploaded_file``
    # parameter (``for uploaded_file in uploaded_file``); use a distinct name.
    for pdf_file in uploaded_file:
        # Persist the upload to a named temp file so the loader can open it
        # by path. ``delete=False`` is required on platforms where the file
        # cannot be re-opened while the handle is held.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(pdf_file.read())
            temp_file_path = temp_file.name
        try:
            loader = PyPDFLoader(temp_file_path)
            doc = loader.load()
            main_placeholder.text("Text Splitter...Started...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            all_documents.extend(text_splitter.split_documents(doc))
        finally:
            # BUG FIX: ``delete=False`` leaked one temp file per upload;
            # remove it once the loader has consumed it.
            os.remove(temp_file_path)

    if all_documents:
        main_placeholder.text("Embedding Vector Started Building...")
        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
        st.session_state.docs = all_documents
        # Persist the raw FAISS index so a later run can reload it from disk.
        faiss_index = st.session_state.vectors.index
        output_path = "/tmp/faiss_index.bin"  # /tmp is writable on Spaces-style hosts
        faiss.write_index(faiss_index, output_path)
        main_placeholder.text("Vector database created!...")
    else:
        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
|