import os
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
# Import từ helpers
from helpers import (
list_docx_files, # Lấy danh sách file .docx
get_splits, # Xử lý file docx thành splits
get_json_splits_only, # Xử lý file JSON (FAQ)
get_web_documents, # Xử lý dữ liệu từ web
)
# Metadata fields copied verbatim from each raw JSON record into Document.metadata.
_METADATA_KEYS = (
    "department_brief",
    "department_name",
    "program_brief",
    "program_name",
    "degree",
    "file_name",
    "file_path",
    "level",
    "major_name",
    "major_code",
)


def _to_document(doc):
    """Convert one raw JSON record (dict) into a LangChain Document.

    Raises:
        KeyError: if the record is missing "content" or any metadata key.
    """
    return Document(
        page_content=doc["content"],
        metadata={key: doc[key] for key in _METADATA_KEYS},
    )


def get_vectorstore():
    """Build and populate the Elasticsearch vector store.

    Loads pre-processed records from ``processed_documents_docx_v3.json``
    (a list whose entries are either a single record dict or a list of
    record dicts), converts them to LangChain ``Document`` objects, and
    indexes them into Elasticsearch with a sentence-transformers
    embedding model.

    Returns:
        ElasticsearchStore: the populated vector store.
    """
    print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )
    print('Setting up vectorstore Elasticsearch')
    # SECURITY: credentials were previously hard-coded in source. They can
    # now be overridden via ES_CLOUD_ID / ES_USER / ES_PASSWORD environment
    # variables; the committed fallbacks are kept for backward compatibility
    # but the exposed password/cloud id should be rotated ASAP.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )
    print("Loading documents from JSON...")
    with open("processed_documents_docx_v3.json", 'r', encoding='utf-8') as f:
        document_lists = json.load(f)
    # Flatten: top-level entries may be a list of records or a bare record.
    documents = []
    for doc_list in document_lists:
        if isinstance(doc_list, list):
            documents.extend(_to_document(doc) for doc in doc_list)
        else:
            documents.append(_to_document(doc_list))
    print(f"Adding {len(documents)} documents to Elasticsearch...")
    vectorstore.add_documents(documents)
    return vectorstore
def load_vectorstore():
    """Connect to the existing Elasticsearch vector store without indexing.

    Uses the same embedding model and index as :func:`get_vectorstore`,
    but performs no document loading — intended for query-time use.

    Returns:
        ElasticsearchStore: a handle to the already-populated index.
    """
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )
    # SECURITY: credentials were previously hard-coded in source. They can
    # now be overridden via ES_CLOUD_ID / ES_USER / ES_PASSWORD environment
    # variables; the committed fallbacks are kept for backward compatibility
    # but the exposed password/cloud id should be rotated ASAP.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )
    return vectorstore
# Script entry point: build and populate the Elasticsearch index.
if __name__ == "__main__":
    get_vectorstore()