File size: 4,510 Bytes
e94340a
 
 
 
 
 
 
 
 
a0c2313
e94340a
 
 
 
 
 
 
 
 
 
 
 
a0c2313
 
 
e94340a
a0c2313
e94340a
 
 
 
e56661f
 
a0c2313
e94340a
 
e56661f
 
e94340a
 
 
a0c2313
e94340a
 
 
 
 
 
 
 
 
 
 
 
a3378d0
e94340a
a3378d0
 
e94340a
 
 
 
 
 
 
 
 
 
 
 
 
 
a3378d0
e94340a
a3378d0
 
e94340a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0c2313
 
 
e94340a
a0c2313
e94340a
 
 
e56661f
 
a0c2313
e94340a
 
e56661f
 
e94340a
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
from elasticsearch import Elasticsearch
from langchain_community.vectorstores import ElasticsearchStore
from langchain.schema import Document
from sentence_transformers import SentenceTransformer

# Imports from the helpers module
from helpers import (
    list_docx_files,  # Lấy danh sách file .docx
    get_splits,  # Xử lý file docx thành splits
    get_json_splits_only,  # Xử lý file JSON (FAQ)
    get_web_documents,  # Xử lý dữ liệu từ web
)


# Metadata fields copied verbatim from each source record onto the Document.
# Keeping them in one place removes the duplicated 10-key dict literals that
# previously appeared in both branches of the flattening loop.
_METADATA_KEYS = (
    "department_brief",
    "department_name",
    "program_brief",
    "program_name",
    "degree",
    "file_name",
    "file_path",
    "level",
    "major_name",
    "major_code",
)


def _record_to_document(record):
    """Convert one raw JSON record into a LangChain Document.

    The record must contain a "content" key plus every key listed in
    _METADATA_KEYS; a missing key raises KeyError, exactly as the
    original inline construction did.
    """
    return Document(
        page_content=record["content"],
        metadata={key: record[key] for key in _METADATA_KEYS},
    )


def get_vectorstore():
    """Build the Elasticsearch vectorstore and index the processed documents.

    Loads records from processed_documents_docx_v3.json (the file may mix
    bare record dicts and lists of record dicts), converts every record to a
    Document, and adds them all to the "daai_assistant_v3" index.

    Returns:
        ElasticsearchStore: the populated vectorstore.
    """
    print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )

    print('Setting up vectorstore Elasticsearch')
    # SECURITY: cloud id / user / password were hard-coded in source control.
    # They can now be supplied via ES_CLOUD_ID / ES_USER / ES_PASSWORD; the
    # hard-coded values remain only as backward-compatible fallbacks and
    # should be rotated and removed from the repository.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )

    print("Loading documents from JSON...")
    with open("processed_documents_docx_v3.json", 'r', encoding='utf-8') as f:
        document_lists = json.load(f)

    # Flatten: top-level entries are either a single record or a list of them.
    documents = []
    for entry in document_lists:
        if isinstance(entry, list):
            documents.extend(_record_to_document(record) for record in entry)
        else:
            documents.append(_record_to_document(entry))

    print(f"Adding {len(documents)} documents to Elasticsearch...")
    vectorstore.add_documents(documents)

    return vectorstore


def load_vectorstore():
    """Connect to the existing Elasticsearch index without re-indexing.

    Uses the same embedding model and index name as get_vectorstore(); it
    only opens a handle on the "daai_assistant_v3" index and does not add
    any documents.

    Returns:
        ElasticsearchStore: a handle on the existing index.
    """
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )

    # SECURITY: cloud id / user / password were hard-coded in source control.
    # They can now be supplied via ES_CLOUD_ID / ES_USER / ES_PASSWORD; the
    # hard-coded values remain only as backward-compatible fallbacks and
    # should be rotated and removed from the repository.
    vectorstore = ElasticsearchStore(
        es_cloud_id=os.environ.get(
            "ES_CLOUD_ID",
            "NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        ),
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user=os.environ.get("ES_USER", "elastic"),
        es_password=os.environ.get("ES_PASSWORD", "SPID6t3YsGbtt3e9yqA1ChmJ"),
    )

    return vectorstore


def main():
    """Script entry point: build and populate the vectorstore index."""
    get_vectorstore()


if __name__ == "__main__":
    main()