# NCT_chatbot_QA / file_loader.py
import json

from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import ElasticsearchStore
from langchain.schema import Document
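# Note: recent LangChain releases ship this integration as a partner package;
# the community import above still works but is deprecated in newer versions.
# from langchain_elasticsearch import ElasticsearchStore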
# Imports from helpers (currently unused in this module)
from helpers import (
    list_docx_files,        # Get the list of .docx files
    get_splits,             # Split .docx files into chunks
    get_json_splits_only,   # Process the JSON (FAQ) file
    get_web_documents,      # Process data from the web
)

def get_vectorstore():
    """Build the Elasticsearch index by ingesting the processed documents."""
    print("Setting up Elasticsearch vectorstore")
    # The embedding model and connection settings are identical to those in
    # load_vectorstore(), so reuse it rather than duplicating the configuration.
    vectorstore = load_vectorstore()
print("Loading documents from JSON...")
with open("processed_documents_docx_v3.json", 'r', encoding='utf-8') as f:
document_lists = json.load(f)
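    # Expected JSON shape, inferred from the conversion loop below (the file
    # itself is not documented in this repo): a top-level list whose items are
    # either a single record or a list of records, each record roughly
    # {"content": "...", "department_brief": "...", "department_name": "...",
    #  "program_brief": "...", "program_name": "...", "degree": "...",
    #  "file_name": "...", "file_path": "...", "level": "...",
    #  "major_name": "...", "major_code": "..."}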
    # Convert the raw records into LangChain Documents. Every record carries
    # the same metadata fields, so build the metadata dict from a key list
    # instead of spelling it out twice.
    metadata_keys = [
        "department_brief", "department_name", "program_brief", "program_name",
        "degree", "file_name", "file_path", "level", "major_name", "major_code",
    ]

    def to_document(doc):
        return Document(
            page_content=doc["content"],
            metadata={key: doc[key] for key in metadata_keys},
        )

    documents = []
    for doc_list in document_lists:
        # Top-level entries are either a list of records or a single record.
        if isinstance(doc_list, list):
            documents.extend(to_document(doc) for doc in doc_list)
        else:
            documents.append(to_document(doc_list))
print(f"Adding {len(documents)} documents to Elasticsearch...")
vectorstore.add_documents(documents)
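    # For a large corpus it may be safer to index in batches so individual
    # Elasticsearch bulk requests stay small; a hedged sketch (the batch size
    # of 500 is an assumption, not from the original):
    # for start in range(0, len(documents), 500):
    #     vectorstore.add_documents(documents[start:start + 500])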
    return vectorstore

def load_vectorstore():
    """Connect to the existing Elasticsearch index without re-ingesting anything."""
    print("Loading embedding model sentence-transformers/msmarco-MiniLM-L12-cos-v5")
    # The active model is an English MS MARCO retrieval model; the commented
    # alternative is multilingual (likely a better fit for Vietnamese queries).
    # embedding = HuggingFaceEmbeddings(
    #     model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # )
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/msmarco-MiniLM-L12-cos-v5"
    )
    # NOTE: the cloud ID and password are hardcoded (and therefore public in
    # this repo); see the environment-variable sketch below this function.
    vectorstore = ElasticsearchStore(
        # es_url="http://localhost:9200",
        es_cloud_id="NCT_chatbot:YXAtc291dGhlYXN0LTEuYXdzLmZvdW5kLmlvJGEzYjVlOTgzYjM0ODRmZmU4YTVhYmQwOGQ4YTRmODM0JDVkMDQzOGY0MjRhMTQ4MDU5NTZmNTcyZmI0ZmFkOTQ4",
        index_name="daai_assistant_v3",
        embedding=embedding,
        es_user="elastic",
        # es_password="changeme"
        es_password="SPID6t3YsGbtt3e9yqA1ChmJ",
    )
    return vectorstore
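
# Hardcoding credentials in source is unsafe once the repo is shared. A minimal
# sketch of reading them from the environment instead; the variable names
# ES_CLOUD_ID / ES_USER / ES_PASSWORD are assumptions, not part of this repo:
#
# import os
# vectorstore = ElasticsearchStore(
#     es_cloud_id=os.environ["ES_CLOUD_ID"],
#     index_name="daai_assistant_v3",
#     embedding=embedding,
#     es_user=os.environ.get("ES_USER", "elastic"),
#     es_password=os.environ["ES_PASSWORD"],
# )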

if __name__ == "__main__":
    get_vectorstore()
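    # A hedged usage sketch: once the index is populated, queries go through
    # the standard LangChain retriever interface (the query string here is
    # only an illustration):
    # retriever = load_vectorstore().as_retriever(search_kwargs={"k": 4})
    # docs = retriever.invoke("Admission requirements for the AI program")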