| import json |
| import os |
| import glob |
| from dotenv import load_dotenv |
| from qdrant_client import models |
| from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode |
| from langchain_huggingface import HuggingFaceEmbeddings |
| from langchain_core.documents import Document |
| from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
| |
# Resolve all paths relative to this script's location so the ingestion
# works regardless of the caller's current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))

# The .env file is expected one directory above this script.
env_path = os.path.join(current_dir, '..', '.env')
load_dotenv(env_path)

# Qdrant connection settings. Only the collection name has a fallback;
# URL and API key are mandatory and are validated in main().
QDRANT_COLLECTION_NAME = os.getenv('QDRANT_COLLECTION_NAME', 'jrg_bot_collection')
QDRANT_URL = os.getenv('QDRANT_URL')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
|
|
def load_all_json_files(data_dir):
    """Load every ``*.json`` file in *data_dir* and return chunked Documents.

    Each JSON file is expected to hold a top-level list of objects shaped
    like ``{"content": str, "metadata": dict}``. Items without usable
    content are skipped, as are files with JSON syntax errors or an
    unexpected top-level structure.

    Args:
        data_dir: Directory scanned (non-recursively) for ``.json`` files.

    Returns:
        list[Document]: Chunks produced by RecursiveCharacterTextSplitter
        (chunk_size=800, chunk_overlap=100).
    """
    raw_documents = []

    # Sort for a deterministic ingestion order: glob's order is
    # filesystem-dependent and would make runs non-reproducible.
    json_files = sorted(glob.glob(os.path.join(data_dir, "*.json")))

    print(f"--- STARTING DATA INGESTION: FOUND {len(json_files)} JSON FILES ---")

    for file_path in json_files:
        file_name = os.path.basename(file_path)
        print(f"Reading file: {file_name}...")

        # Keep the try body minimal: only json.load can raise
        # JSONDecodeError, so item-level problems are not misreported
        # as syntax errors.
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                print(f"ERROR: JSON syntax error in file: {file_name}")
                continue

        # Guard: the top-level value must be a list of item dicts;
        # anything else previously crashed the whole ingestion.
        if not isinstance(data, list):
            print(f"WARNING: Skipping {file_name}: expected a top-level JSON list.")
            continue

        for item in data:
            if not isinstance(item, dict):
                continue  # e.g. stray strings/numbers in the list

            # Coerce to str so a numeric "content" value doesn't crash
            # on .strip(); empty/None content is skipped as before.
            content = str(item.get('content') or '').strip()
            if not content:
                continue

            raw_documents.append(Document(
                page_content=content,
                metadata=item.get('metadata') or {}
            ))

    print(f"Successfully loaded {len(raw_documents)} raw documents.")

    print("Initializing Text Splitter for chunking...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100
    )

    final_docs = text_splitter.split_documents(raw_documents)
    print(f"Data has been split into {len(final_docs)} optimal chunks ready for VectorDB.")

    return final_docs
|
|
def main():
    """Entry point: load the JSON knowledge base, embed it, and upload to Qdrant."""
    # Fail fast when the Qdrant credentials are missing from the environment.
    if not (QDRANT_URL and QDRANT_API_KEY):
        print("CRITICAL ERROR: QDRANT_URL or QDRANT_API_KEY not found in .env file.")
        return

    docs = load_all_json_files(current_dir)
    if not docs:
        print("WARNING: No data found to ingest. Please check your JSON files.")
        return

    print("Initializing Hybrid Embedding Models (Dense & Sparse)...")

    # Dense (semantic) embeddings — Vietnamese bi-encoder from HuggingFace.
    dense_model_name = 'bkai-foundation-models/vietnamese-bi-encoder'
    print(f"Loading Dense Model: {dense_model_name}...")
    dense_embeddings = HuggingFaceEmbeddings(model_name=dense_model_name)

    # Sparse (lexical) embeddings — BM25 via FastEmbed, for hybrid retrieval.
    print("Loading Sparse Model: Qdrant/BM25...")
    bm25_embeddings = FastEmbedSparse(model_name="Qdrant/BM25")

    print(f"Uploading vectors to Qdrant Collection: '{QDRANT_COLLECTION_NAME}'...")
    print("This process may take a few minutes depending on the data size. Please do not close the terminal...")

    # NOTE: force_recreate=True drops and rebuilds the collection on every run.
    QdrantVectorStore.from_documents(
        docs,
        dense_embeddings,
        sparse_embedding=bm25_embeddings,
        retrieval_mode=RetrievalMode.HYBRID,
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
        collection_name=QDRANT_COLLECTION_NAME,
        distance=models.Distance.COSINE,
        force_recreate=True
    )

    print("SUCCESS: The entire Knowledge Base has been uploaded to Qdrant Cloud!")
|
|
# Run ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()