Spaces:
Running
Running
| import os | |
| import sys | |
| root_dir = os.path.abspath(os.path.dirname(__file__)) | |
| sys.path.append(root_dir) | |
| from src.ingestion.loader import load_data | |
| from src.ingestion.chunker import split_documents | |
| from src.retrieval.vector_store import get_vector_store | |
| SOURCES = [ | |
| "https://www.dgwfertilizer.co.id/8-hama-dan-penyakit-penting-pada-tanaman-cabai/", | |
| "https://mitrabertani.com/artikel/detail/Budidaya-Cabai-Sederhana-tapi-Penting-Cara-Tepat-Tanam-Cabai", | |
| "https://digitani.ipb.ac.id/bagaimana-langkah-langkah-budidaya-cabai/", | |
| "data/cabai.pdf", | |
| "data/budidaya_cabai_rawit.pdf", | |
| "data/penyakit.pdf" | |
| ] | |
| def run_ingestion_pipeline(): | |
| print("Memulai Data Ingestion Pipeline (Mode Semantik)\n") | |
| all_chunks = [] | |
| for source in SOURCES: | |
| try: | |
| print(f"Membaca sumber: {source}") | |
| raw_docs = load_data(source) | |
| # MENGGUNAKAN SEMANTIC CHUNKING KHUSUS UNTUK WEB & PDF | |
| chunks = split_documents(raw_docs, chunk_size=1000, chunk_overlap=200) | |
| all_chunks.extend(chunks) | |
| print(f"Berhasil memproses menjadi {len(chunks)} semantic chunks.\n") | |
| except Exception as e: | |
| print(f"Gagal memproses {source}: {e}\n") | |
| if all_chunks: | |
| print(f"Menyiapkan penyisipan {len(all_chunks)} semantic chunks ke ChromaDB...") | |
| db = get_vector_store() | |
| db.add_documents(all_chunks) | |
| print("\nSelesai! Semua data web & PDF telah diproses secara semantik.") | |
| print("Database vektor siap digunakan oleh API FastAPI.") | |
| else: | |
| print("\nTidak ada data yang berhasil diproses.") | |
| if __name__ == "__main__": | |
| run_ingestion_pipeline() |