Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| from tqdm import tqdm | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| # from langchain_google_genai import GoogleGenerativeAIEmbeddings | |
| from elasticsearch import Elasticsearch | |
| from langchain_community.vectorstores import ElasticsearchStore | |
| from langchain.schema import Document | |
| from sentence_transformers import SentenceTransformer | |
# Imports from the local helpers module
| from helpers import ( | |
| list_docx_files, # Lấy danh sách file .docx | |
| get_splits, # Xử lý file docx thành splits | |
| get_json_splits_only, # Xử lý file JSON (FAQ) | |
| get_web_documents, # Xử lý dữ liệu từ web | |
| ) | |
def get_vectorstore():
    """Build the Elasticsearch vector store and index the processed documents.

    Loads pre-processed document dicts from ``processed_documents_docx_v3.json``
    (entries may be a single dict or a list of dicts), converts each to a
    LangChain ``Document``, and bulk-adds them to the ``daai_assistant_v3``
    index on Elasticsearch Cloud.

    Returns:
        ElasticsearchStore: the populated vector store.

    Raises:
        FileNotFoundError: if the JSON file is missing.
        KeyError: if a document dict lacks one of the expected fields.
    """
    # NOTE: the model name printed here previously claimed
    # paraphrase-multilingual-MiniLM-L12-v2 while msmarco was actually loaded.
    print('Get embedding model sentence-transformers/msmarco-MiniLM-L12-cos-v5')
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )

    print('Setting up vectorstore Elasticsearch')
    # SECURITY: credentials were hard-coded; they remain here only as a
    # backward-compatible fallback. Set ES_CLOUD_ID / ES_USER / ES_PASSWORD
    # in the environment and rotate the leaked password.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )

    print("Loading documents from JSON...")
    with open("processed_documents_docx_v3.json", 'r', encoding='utf-8') as f:
        document_lists = json.load(f)

    # Metadata fields copied verbatim from each source dict.
    metadata_keys = (
        "department_brief", "department_name", "program_brief", "program_name",
        "degree", "file_name", "file_path", "level", "major_name", "major_code",
    )

    def _to_document(doc):
        # Single place to build a Document (replaces two duplicated branches).
        return Document(
            page_content=doc["content"],
            metadata={key: doc[key] for key in metadata_keys},
        )

    # Convert to LangChain Documents; entries may be a list of dicts or one dict.
    documents = []
    for entry in document_lists:
        if isinstance(entry, list):
            documents.extend(_to_document(doc) for doc in entry)
        else:
            documents.append(_to_document(entry))

    print(f"Adding {len(documents)} documents to Elasticsearch...")
    vectorstore.add_documents(documents)
    return vectorstore
def load_vectorstore():
    """Connect to the existing Elasticsearch vector store without re-indexing.

    Uses the same embedding model and index name as ``get_vectorstore`` so
    query-time embeddings match the indexed ones.

    Returns:
        ElasticsearchStore: a handle on the ``daai_assistant_v3`` index.
    """
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )
    # SECURITY: credentials were hard-coded; they remain here only as a
    # backward-compatible fallback. Set ES_CLOUD_ID / ES_USER / ES_PASSWORD
    # in the environment and rotate the leaked password.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )
    return vectorstore
| if __name__ == "__main__": | |
| get_vectorstore() | |