File size: 4,510 Bytes
e94340a
 
 
 
 
 
 
 
 
a0c2313
e94340a
 
 
 
 
 
 
 
 
 
 
 
a0c2313
 
 
e94340a
a0c2313
e94340a
 
 
 
e56661f
 
a0c2313
e94340a
 
e56661f
 
e94340a
 
 
a0c2313
e94340a
 
 
 
 
 
 
 
 
 
 
 
a3378d0
e94340a
a3378d0
 
e94340a
 
 
 
 
 
 
 
 
 
 
 
 
 
a3378d0
e94340a
a3378d0
 
e94340a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0c2313
 
 
e94340a
a0c2313
e94340a
 
 
e56661f
 
a0c2313
e94340a
 
e56661f
 
e94340a
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain.schema import Document
from sentence_transformers import SentenceTransformer

# Imports from the helpers module
from helpers import (
    list_docx_files,  # Lấy danh sách file .docx
    get_splits,  # Xử lý file docx thành splits
    get_json_splits_only,  # Xử lý file JSON (FAQ)
    get_web_documents,  # Xử lý dữ liệu từ web
)


# Metadata fields copied verbatim from each source record onto the Document.
# Keeping them in one place removes the duplicated 10-key dict literals that
# previously appeared in both branches of the flattening loop.
_METADATA_KEYS = (
    "department_brief",
    "department_name",
    "program_brief",
    "program_name",
    "degree",
    "file_name",
    "file_path",
    "level",
    "major_name",
    "major_code",
)


def _record_to_document(record):
    """Convert one raw JSON record into a LangChain Document.

    The record must contain a "content" key plus every key listed in
    _METADATA_KEYS; a missing key raises KeyError, exactly as the
    original inline construction did.
    """
    return Document(
        page_content=record["content"],
        metadata={key: record[key] for key in _METADATA_KEYS},
    )


def get_vectorstore():
    """Build the Elasticsearch vectorstore and index the processed documents.

    Loads records from processed_documents_docx_v3.json (the file may mix
    bare record dicts and lists of record dicts), converts every record to a
    Document, and adds them all to the "daai_assistant_v3" index.

    Returns:
        ElasticsearchStore: the populated vectorstore.
    """
    print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )

    print('Setting up vectorstore Elasticsearch')
    # SECURITY: cloud id / user / password were hard-coded in source control.
    # They can now be supplied via ES_CLOUD_ID / ES_USER / ES_PASSWORD; the
    # hard-coded values remain only as backward-compatible fallbacks and
    # should be rotated and removed from the repository.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )

    print("Loading documents from JSON...")
    with open("processed_documents_docx_v3.json", 'r', encoding='utf-8') as f:
        document_lists = json.load(f)

    # Flatten: top-level entries are either a single record or a list of them.
    documents = []
    for entry in document_lists:
        if isinstance(entry, list):
            documents.extend(_record_to_document(record) for record in entry)
        else:
            documents.append(_record_to_document(entry))

    print(f"Adding {len(documents)} documents to Elasticsearch...")
    vectorstore.add_documents(documents)

    return vectorstore


def load_vectorstore():
    """Connect to the existing Elasticsearch index without re-indexing.

    Uses the same embedding model and index name as get_vectorstore(); it
    only opens a handle on the "daai_assistant_v3" index and does not add
    any documents.

    Returns:
        ElasticsearchStore: a handle on the existing index.
    """
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )

    # SECURITY: cloud id / user / password were hard-coded in source control.
    # They can now be supplied via ES_CLOUD_ID / ES_USER / ES_PASSWORD; the
    # hard-coded values remain only as backward-compatible fallbacks and
    # should be rotated and removed from the repository.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )

    return vectorstore


def main():
    """Script entry point: build and populate the vectorstore index."""
    get_vectorstore()


if __name__ == "__main__":
    main()