import os
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain.schema import Document
from sentence_transformers import SentenceTransformer

# Imported from helpers
from helpers import (
    list_docx_files,      # List .docx files
    get_splits,           # Split a docx file into chunks
    get_json_splits_only,  # Process JSON (FAQ) files
    get_web_documents,    # Process data scraped from the web
)

# --- Configuration -----------------------------------------------------------
# SECURITY NOTE(review): the cloud id / password below were hardcoded in the
# original source. They remain as defaults for backward compatibility, but can
# (and should) be overridden via environment variables; rotate the leaked
# password and move these secrets out of version control.
EMBEDDING_MODEL_NAME = "sentence-transformers/msmarco-MiniLM-L12-cos-v5"
ES_INDEX_NAME = "daai_assistant_v3"
DOCUMENTS_JSON_PATH = "processed_documents_docx_v3.json"
ES_CLOUD_ID = os.environ.get(
    "ES_CLOUD_ID",
    "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
)
ES_USER = os.environ.get("ES_USER", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ")

# Metadata fields copied verbatim from each raw document dict into the
# langchain Document metadata. A missing key raises KeyError, matching the
# original behavior of explicit doc["..."] lookups.
_METADATA_KEYS = (
    "department_brief",
    "department_name",
    "program_brief",
    "program_name",
    "degree",
    "file_name",
    "file_path",
    "level",
    "major_name",
    "major_code",
)


def _to_document(raw: dict) -> Document:
    """Convert one raw document dict into a langchain Document.

    ``raw["content"]`` becomes the page content; the fields in
    ``_METADATA_KEYS`` become the metadata. Raises KeyError if any
    expected field is absent.
    """
    return Document(
        page_content=raw["content"],
        metadata={key: raw[key] for key in _METADATA_KEYS},
    )


def _build_embedding() -> HuggingFaceEmbeddings:
    """Instantiate the sentence-transformers embedding model."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


def _build_vectorstore(embedding) -> ElasticsearchStore:
    """Connect to the Elastic Cloud index with the given embedding function."""
    return ElasticsearchStore(
        es_cloud_id=ES_CLOUD_ID,
        index_name=ES_INDEX_NAME,
        embedding=embedding,
        es_user=ES_USER,
        es_password=ES_PASSWORD,
    )


def get_vectorstore():
    """Build the vectorstore and index all documents from the JSON dump.

    Loads ``processed_documents_docx_v3.json`` (a list whose items are
    either document dicts or lists of document dicts), converts every
    entry into a langchain Document, and adds them all to the
    Elasticsearch index. Returns the populated ElasticsearchStore.
    """
    # Original log message named the wrong model; it now reports the one
    # actually loaded.
    print(f'Get embedding model {EMBEDDING_MODEL_NAME}')
    embedding = _build_embedding()

    print('Setting up vectorstore Elasticsearch')
    vectorstore = _build_vectorstore(embedding)

    print("Loading documents from JSON...")
    with open(DOCUMENTS_JSON_PATH, 'r', encoding='utf-8') as f:
        document_lists = json.load(f)

    # Convert to langchain Documents. The JSON mixes flat dicts and
    # nested lists of dicts, so handle both shapes.
    documents = []
    for entry in document_lists:
        if isinstance(entry, list):
            documents.extend(_to_document(raw) for raw in entry)
        else:
            documents.append(_to_document(entry))

    print(f"Adding {len(documents)} documents to Elasticsearch...")
    vectorstore.add_documents(documents)
    return vectorstore


def load_vectorstore():
    """Connect to the existing Elasticsearch index without re-indexing."""
    return _build_vectorstore(_build_embedding())


if __name__ == "__main__":
    get_vectorstore()