Spaces:
Build error
Build error
Update concat_vector_store_정리된.py
Browse files — concat_vector_store_정리된.py (+16 −16)
concat_vector_store_정리된.py
CHANGED
|
@@ -5,11 +5,11 @@ from e5_embeddings import E5Embeddings
|
|
| 5 |
from langchain_community.vectorstores import FAISS
|
| 6 |
from document_processor import load_pdf_with_pymupdf, split_documents
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
FOLDER = "
|
| 10 |
VECTOR_STORE_PATH = "vector_db"
|
| 11 |
|
| 12 |
-
# 1.
|
| 13 |
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
|
| 14 |
return E5Embeddings(
|
| 15 |
model_name=model_name,
|
|
@@ -17,39 +17,39 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
|
|
| 17 |
encode_kwargs={'normalize_embeddings': True}
|
| 18 |
)
|
| 19 |
|
| 20 |
-
# 2.
|
| 21 |
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
|
| 22 |
if not os.path.exists(load_path):
|
| 23 |
-
raise FileNotFoundError(f"
|
| 24 |
return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
|
| 25 |
|
| 26 |
-
# 3.
|
| 27 |
def embed_cleaned_pdfs(folder, vectorstore, embeddings):
|
| 28 |
-
pattern = os.path.join(folder, "
|
| 29 |
pdf_files = glob.glob(pattern)
|
| 30 |
-
print(f"
|
| 31 |
|
| 32 |
new_documents = []
|
| 33 |
for pdf_path in pdf_files:
|
| 34 |
-
print(f"
|
| 35 |
text = load_pdf_with_pymupdf(pdf_path)
|
| 36 |
if text.strip():
|
| 37 |
new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))
|
| 38 |
|
| 39 |
-
print(f"
|
| 40 |
|
| 41 |
chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
|
| 42 |
-
print(f"
|
| 43 |
|
| 44 |
-
print(f"
|
| 45 |
vectorstore.add_documents(chunks)
|
| 46 |
-
print(f"
|
| 47 |
|
| 48 |
vectorstore.save_local(VECTOR_STORE_PATH)
|
| 49 |
-
print(f"
|
| 50 |
|
| 51 |
-
#
|
| 52 |
if __name__ == "__main__":
|
| 53 |
embeddings = get_embeddings()
|
| 54 |
vectorstore = load_vector_store(embeddings)
|
| 55 |
-
embed_cleaned_pdfs(FOLDER, vectorstore, embeddings)
|
|
|
|
| 5 |
from langchain_community.vectorstores import FAISS
|
| 6 |
from document_processor import load_pdf_with_pymupdf, split_documents
|
| 7 |
|
| 8 |
# Path configuration
FOLDER = "cleaned_pdfs"  # Folder containing the cleaned PDFs
VECTOR_STORE_PATH = "vector_db"  # Directory the FAISS index is loaded from and saved to
|
| 12 |
+
# 1. Load the embedding model
|
| 13 |
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
|
| 14 |
return E5Embeddings(
|
| 15 |
model_name=model_name,
|
|
|
|
| 17 |
encode_kwargs={'normalize_embeddings': True}
|
| 18 |
)
|
| 19 |
|
| 20 |
# 2. Load existing vector store
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
    """Load a persisted FAISS index from *load_path* with *embeddings*.

    Raises:
        FileNotFoundError: if *load_path* does not exist on disk.
    """
    store_present = os.path.exists(load_path)
    if store_present:
        # The store is trusted local data, hence the deserialization opt-in.
        return FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    raise FileNotFoundError(f"Cannot find vector store: {load_path}")
| 25 |
|
| 26 |
# 3. Embed only the cleaned PDFs
def embed_cleaned_pdfs(folder, vectorstore, embeddings):
    """Index every ``cleaned*.pdf`` in *folder* into *vectorstore*.

    Each matching PDF is read with ``load_pdf_with_pymupdf``, wrapped in a
    ``Document``, chunked with ``split_documents`` (300 chars, 50 overlap),
    added to the FAISS store, and the store is persisted to
    ``VECTOR_STORE_PATH``.

    NOTE(review): *embeddings* is not used inside this function — the store
    already carries its embedding function; the parameter is kept for
    interface compatibility with callers.

    Returns:
        None.
    """
    pattern = os.path.join(folder, "cleaned*.pdf")
    pdf_files = glob.glob(pattern)
    print(f"Number of target PDFs: {len(pdf_files)}")

    new_documents = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path}")
        text = load_pdf_with_pymupdf(pdf_path)
        # Skip PDFs whose extracted text is empty or whitespace-only.
        if text.strip():
            new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))

    print(f"Number of documents: {len(new_documents)}")

    # Robustness fix: with no new documents there is nothing to chunk or add —
    # adding an empty batch can raise inside FAISS, and re-saving the
    # untouched store would be pointless.
    if not new_documents:
        print("No new documents to embed; skipping add/save.")
        return

    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
    print(f"Number of chunks: {len(chunks)}")

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Save completed: {VECTOR_STORE_PATH}")
| 50 |
|
| 51 |
# Execution: build the embedder, open the existing store, index cleaned PDFs.
if __name__ == "__main__":
    e5_embedder = get_embeddings()
    faiss_store = load_vector_store(e5_embedder)
    embed_cleaned_pdfs(FOLDER, faiss_store, e5_embedder)