import os
from pathlib import Path

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings

# Load variables from a local .env file (if any) into the process environment
# before the rest of the script runs.
load_dotenv()

# Folder that is scanned for the input PDF files.
SOURCE_DIRECTORY = "source_data"
# Folder in which the Chroma vector store is persisted.
PERSIST_DIRECTORY = "data"

# Model name handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# chunk_size / chunk_overlap handed to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100


def _find_pdf_files(directory) -> list:
    """Return the PDF files directly inside *directory* as sorted ``Path`` objects.

    The extension check is case-insensitive, so ``REPORT.PDF`` is found as
    well, and entries that are not regular files are ignored. Sorting makes
    the processing order independent of the order the OS lists the folder in.

    Raises:
        OSError: If *directory* does not exist or cannot be listed.
    """
    return sorted(
        entry
        for entry in Path(directory).iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )


def _load_pdf_documents(pdf_paths) -> list:
    """Load every PDF in *pdf_paths* and return all loaded documents in one list.

    A file that fails to load is reported on stdout and skipped so that one
    broken PDF does not abort the whole run.
    """
    documents = []
    for pdf_path in pdf_paths:
        try:
            pages = PyPDFLoader(str(pdf_path)).load()
        except Exception as e:
            # Deliberately broad: the loader may fail in many library-specific
            # ways, and this is a best-effort, per-file step.
            print(f"-> GAGAL memuat PDF '{pdf_path.name}': {e}")
            continue
        documents.extend(pages)
        print(f"-> Berhasil memuat {len(pages)} halaman dari '{pdf_path.name}'")
    return documents


def create_vector_store() -> None:
    """Build a persistent Chroma vector store from the PDFs in SOURCE_DIRECTORY.

    The function loads the PDFs, splits them into chunks using CHUNK_SIZE and
    CHUNK_OVERLAP, embeds the chunks with EMBEDDING_MODEL and writes the result
    to PERSIST_DIRECTORY.

    Progress and errors are printed to stdout. The function returns ``None``
    in every case; it stops early when the source folder cannot be read, when
    no PDF is found, when nothing could be loaded, or when the embedding model
    cannot be initialised.
    """
    # Step 1: locate the input files. A missing or unreadable folder is
    # reported instead of crashing with a traceback.
    try:
        pdf_paths = _find_pdf_files(SOURCE_DIRECTORY)
    except OSError as e:
        print(f"GALAT: Tidak dapat membaca folder '{SOURCE_DIRECTORY}': {e}")
        return
    if not pdf_paths:
        print(f"Tidak ada file PDF yang ditemukan di folder '{SOURCE_DIRECTORY}'.")
        return

    # Step 2: load the documents from every PDF.
    print("Memulai proses memuat dokumen PDF...")
    all_docs = _load_pdf_documents(pdf_paths)
    if not all_docs:
        print("Tidak ada data yang berhasil dimuat dari PDF. Proses dihentikan.")
        return

    # Step 3: split the loaded documents into overlapping chunks.
    print("\nMembagi dokumen menjadi potongan teks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    docs_split = text_splitter.split_documents(all_docs)
    print(f"Total potongan dokumen yang dibuat: {len(docs_split)}")

    # Step 4: initialise the embedding model.
    print(f"\nMenginisialisasi model embedding: {EMBEDDING_MODEL}...")
    try:
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    except Exception as e:
        print(f"GALAT: Gagal menginisialisasi model embedding: {e}")
        print("Pastikan Anda memiliki koneksi internet dan library 'sentence-transformers' terinstal.")
        return

    # Step 5: embed the chunks and persist them.
    # NOTE(review): running this again while PERSIST_DIRECTORY already holds a
    # store presumably adds the same chunks a second time - confirm, and clear
    # the folder first if a clean rebuild is intended.
    print(f"\nMembuat dan menyimpan vector store di direktori '{PERSIST_DIRECTORY}'...")
    try:
        Chroma.from_documents(
            documents=docs_split,
            embedding=embeddings,
            persist_directory=PERSIST_DIRECTORY,
        )
    except Exception as e:
        print(f"GALAT: Gagal membuat vector store Chroma: {e}")
    else:
        print("\n--- PROSES SELESAI ---")
        print("Database vektor berhasil dibuat dan disimpan.")
        print("Anda sekarang dapat menjalankan 'app.py' untuk memulai chatbot.")


if __name__ == "__main__":
    # Build the vector store only when this file is executed as a script,
    # not when it is imported as a module.
    create_vector_store()