Spaces:
Sleeping
Sleeping
Update src/PDFprocess_sample.py
Browse files- src/PDFprocess_sample.py +15 -15
src/PDFprocess_sample.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import tempfile
|
| 2 |
import streamlit as st
|
| 3 |
import pickle
|
| 4 |
-
|
| 5 |
from langchain_community.document_loaders import PyPDFLoader
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.vectorstores import FAISS
|
|
@@ -73,22 +73,22 @@ def process_pdf(uploaded_files):
|
|
| 73 |
doc = loader.load()
|
| 74 |
main_placeholder.text("Text Splitter...Started...✅✅✅")
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
|
|
|
|
| 1 |
import tempfile
|
| 2 |
import streamlit as st
|
| 3 |
import pickle
|
| 4 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 5 |
from langchain_community.document_loaders import PyPDFLoader
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.vectorstores import FAISS
|
|
|
|
| 73 |
doc = loader.load()
|
| 74 |
main_placeholder.text("Text Splitter...Started...✅✅✅")
|
| 75 |
|
| 76 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 77 |
+
final_documents = text_splitter.split_documents(doc)
|
| 78 |
+
all_documents.extend(final_documents)
|
| 79 |
|
| 80 |
+
if all_documents:
|
| 81 |
+
main_placeholder.text("Embedding Vector Started Building...✅✅✅")
|
| 82 |
|
| 83 |
+
# ⬇ Move embedding initialization here
|
| 84 |
+
st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
| 85 |
+
st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
|
| 86 |
+
st.session_state.docs = all_documents
|
| 87 |
|
| 88 |
+
faiss_index = st.session_state.vectors.index
|
| 89 |
+
faiss.write_index(faiss_index, "faiss_index.bin")
|
| 90 |
+
main_placeholder.text("Vector database created!...✅✅✅")
|
| 91 |
|
| 92 |
+
else:
|
| 93 |
+
st.error("No documents found or the PDF is corrupted.")
|
| 94 |
|