Spaces:

buianh0803
/

NCT_chatbot_QA

Runtime error

App Files Files Community

NCT_chatbot_QA / create_json_file.py

buianh0803

Upload 4 files

a0c2313 verified 11 months ago

raw

history blame contribute delete

15.9 kB

	import json
	import os
	import unicodedata
	from pathlib import Path

	from docx import Document
	from langchain_docling import DoclingLoader
	from langchain_docling.loader import ExportType
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from tqdm import tqdm
	from transformers import AutoTokenizer


	class DocumentProcessor:
	    """Turn curriculum .docx files into metadata-tagged text chunks.

	    Each document is converted to Markdown with Docling, split into
	    overlapping character chunks, and every chunk is paired with the
	    department/program metadata of the program folder found in the file's
	    path. ``process_directory`` writes the result to a JSON file.

	    Attributes:
	        tokenizer: Hugging Face tokenizer used to count tokens per chunk.
	        embedding_model: Embedding model loaded in ``__init__``. No method
	            of this class calls it; it is kept as a public attribute.
	        text_splitter: Character-based splitter applied to the full text.
	        max_tokens: Largest token count a chunk may have before it is
	            split further.
	        folder_mappings: Per-instance copy of ``FOLDER_MAPPINGS``, keyed by
	            program folder name.
	    """

	    EMBEDDING_MODEL_NAME = (
	        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
	    )

	    # Upper bound for the overlap used when an oversized chunk is re-split.
	    FALLBACK_CHUNK_OVERLAP = 2000

	    # Programs whose "degree" and "major_name" are blank in FOLDER_MAPPINGS
	    # and are instead derived from the file name (Bachelor vs. Engineer).
	    # The value is the English major title appended after "<degree> of ".
	    DERIVED_MAJOR_TITLES = {
	        "Chương trình Khoa học dữ liệu": "Data Science in Finance and E-commerce",
	        "Chương trình Trí tuệ nhân tạo": "Artificial Intelligence",
	    }

	    # Program folder name -> metadata attached to every chunk of a file
	    # stored under that folder.
	    FOLDER_MAPPINGS = {
	        "Chương trình An toàn thông tin": {
	            "department_brief": "FIT",
	            "department_name": "Khoa Công nghệ thông tin",
	            "program_brief": "IS",
	            "program_name": "Chương trình An toàn thông tin",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân An toàn thông tin/Bachelor of Information Security",
	            "major_code": "7480202",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình An toàn thông tin/CTĐT ngành An toàn thông tin.docx",
	        },
	        "Chương trình Công nghệ thông tin": {
	            "department_brief": "FIT",
	            "department_name": "Khoa Công nghệ thông tin",
	            "program_brief": "IT",
	            "program_name": "Chương trình Công nghệ thông tin",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Công nghệ thông tin/Bachelor of Information Technology",
	            "major_code": "7480201",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Công nghệ thông tin/Công nghệ thông tin_CTĐT_2023.docx",
	        },
	        "Chương trình Khoa học máy tính": {
	            "department_brief": "FIT",
	            "department_name": "Khoa Công nghệ thông tin",
	            "program_brief": "CS",
	            "program_name": "Chương trình Khoa học máy tính",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Khoa học máy tính/Bachelor of Computer Science",
	            "major_code": "7480101",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Khoa học máy tính/Khoa học máy tính_CTĐT_2023.docx",
	        },
	        "Chương trình Kỹ thuật phần mềm": {
	            "department_brief": "FIT",
	            "department_name": "Khoa Công nghệ thông tin",
	            "program_brief": "SE",
	            "program_name": "Chương trình Kỹ thuật phần mềm",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Kỹ thuật phần mềm/Bachelor of Software Engineering",
	            "major_code": "7480103",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Kỹ thuật phần mềm/CTĐT ngành Kỹ thuật phần mềm.docx",
	        },
	        "Chương trình Định phí bảo hiểm và Quản trị rủi ro": {
	            "department_brief": "MFE",
	            "department_name": "Khoa Toán kinh tế",
	            "program_brief": "Actuary",
	            "program_name": "Chương trình Định phí bảo hiểm và Quản trị rủi ro",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Định phí bảo hiểm và Quản trị rủi ro/Bachelor of Actuarial Science",
	            "major_code": "7310108",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Định phí bảo hiểm và Quản trị rủi ro/CTĐT ngành Định phí bảo hiểm và Quản trị rủi ro.docx",
	        },
	        "Chương trình Hệ thống thông tin": {
	            "department_brief": "MIS",
	            "department_name": "Khoa Hệ thống thông tin quản lý",
	            "program_brief": "IS",
	            "program_name": "Chương trình Hệ thống thông tin",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Hệ thống thông tin/Bachelor of Information Systems",
	            "major_code": "7480104",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Hệ thống thông tin/CTĐT ngành HTTT.docx",
	        },
	        "Chương trình Hệ thống thông tin quản lý": {
	            "department_brief": "MIS",
	            "department_name": "Khoa Hệ thống thông tin quản lý",
	            "program_brief": "MIS",
	            "program_name": "Chương trình Hệ thống thông tin quản lý",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Hệ thống thông tin quản lý/Bachelor of Management Information Systems",
	            "major_code": "7340405",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Hệ thống thông tin quản lý/Hệ thống thông tin quản lý_CTĐT_2023.docx",
	        },
	        "Chương trình Phân tích dữ liệu trong Kinh tế": {
	            "department_brief": "MFE",
	            "department_name": "Khoa Toán kinh tế",
	            "program_brief": "DSEB",
	            "program_name": "Chương trình Phân tích dữ liệu trong Kinh tế",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Khoa học dữ liệu trong kinh tế và kinh doanh/Bachelor of Data Science in Economics and Business",
	            "major_code": "7310108",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Phân tích dữ liệu trong Kinh tế/DSEB_khung_chuong_trinh.docx",
	        },
	        "Chương trình Thống kê kinh tế": {
	            "department_brief": "KTK",
	            "department_name": "Khoa thống kê",
	            "program_brief": "ES",
	            "program_name": "Chương trình Thống kê kinh tế",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Thống kê kinh tế/Bachelor of Economic Statistics",
	            "major_code": "7310107",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Thống kê kinh tế/Thống kê kinh tế_CTĐT_2023.docx",
	        },
	        "Chương trình Toán kinh tế": {
	            "department_brief": "MFE",
	            "department_name": "Khoa Toán kinh tế",
	            "program_brief": "TOKT",
	            "program_name": "Chương trình Toán kinh tế",
	            "degree": "Cử nhân/Bachelor",
	            "level": "Đại học/Undergraduate",
	            "major_name": "Cử nhân Toán kinh tế/Bachelor of Mathematical Economics",
	            "major_code": "7310108",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Toán kinh tế/Toán kinh tế_CTĐT_2023.docx",
	        },
	        "Chương trình Trí tuệ nhân tạo": {
	            "department_brief": "FDA",
	            "department_name": "Khoa Khoa học dữ liệu và Trí tuệ nhân tạo",
	            "program_brief": "TTNT",
	            "program_name": "Chương trình Trí tuệ nhân tạo",
	            # Blank on purpose: derived per file (see DERIVED_MAJOR_TITLES).
	            "degree": "",
	            "level": "Đại học/Undergraduate",
	            "major_name": "",
	            "major_code": "7480107",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Trí tuệ nhân tạo/22.5.1.CTĐT_ngành TTNT_cử nhân.docx",
	        },
	        "Chương trình Khoa học dữ liệu": {
	            "department_brief": "FDA",
	            "department_name": "Khoa Khoa học dữ liệu và Trí tuệ nhân tạo",
	            "program_brief": "KHDL",
	            "program_name": "Chương trình Khoa học dữ liệu",
	            # Blank on purpose: derived per file (see DERIVED_MAJOR_TITLES).
	            "degree": "",
	            "level": "Đại học/Undergraduate",
	            "major_name": "",
	            "major_code": "7460108",
	            "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Khoa học dữ liệu/22.5.1.CTĐT_ngành KHDL_cử nhân.docx",
	        },
	    }

	    # Metadata returned when no path component names a known program folder.
	    UNKNOWN_PROGRAM_INFO = {
	        "department_brief": "Unknown",
	        "department_name": "Unknown",
	        "program_brief": "Unknown",
	        "program_name": "Unknown",
	        "degree": "Unknown",
	        "level": "Unknown",
	        "major_name": "Unknown",
	        "major_code": "Unknown",
	        "file_path": "Unknown",
	    }

	    def __init__(
	        self,
	        model_name=EMBEDDING_MODEL_NAME,
	        chunk_size=10000,
	        chunk_overlap=3000,
	        max_tokens=10000,
	    ):
	        """Load the tokenizer and embedding model and set up the splitter.

	        Args:
	            model_name: Hugging Face model id used for both the tokenizer
	                and the embedding model.
	            chunk_size: Maximum chunk length in characters (not tokens).
	            chunk_overlap: Characters shared between consecutive chunks.
	            max_tokens: Token count above which a chunk is split further.
	        """
	        print('Initializing embedding model...')
	        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
	        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap,
	        )
	        self.max_tokens = max_tokens
	        # Per-instance copy so callers may edit it without touching the
	        # class-level table shared by all instances.
	        self.folder_mappings = {
	            name: dict(meta) for name, meta in self.FOLDER_MAPPINGS.items()
	        }

	    @staticmethod
	    def _nfc(text):
	        """Return *text* in Unicode NFC form.

	        File systems and archive tools may hand back decomposed (NFD)
	        Vietnamese names, which would not compare equal to composed
	        literals without this step.
	        """
	        return unicodedata.normalize("NFC", text)

	    @staticmethod
	    def _degree_labels(file_name):
	        """Return the (Vietnamese, English) degree labels for a file name.

	        A name containing "kỹ sư" (case-insensitive) means an Engineer
	        programme; anything else is treated as Bachelor.
	        """
	        normalized_name = unicodedata.normalize("NFC", file_name.lower())
	        if unicodedata.normalize("NFC", "kỹ sư") in normalized_name:
	            return "Kỹ sư", "Engineer"
	        return "Cử nhân", "Bachelor"

	    @staticmethod
	    def _find_docx_files(directory):
	        """Return all .docx files below *directory*, recursively and sorted.

	        The suffix check is case-insensitive, and Word lock files (names
	        starting with "~$") are skipped because they are not documents.
	        """
	        return sorted(
	            path
	            for path in Path(directory).rglob("*")
	            if path.is_file()
	            and path.suffix.lower() == ".docx"
	            and not path.name.startswith("~$")
	        )

	    def _enforce_token_limit(self, chunks):
	        """Return *chunks* with every over-limit chunk split until it fits.

	        A chunk whose token count exceeds ``self.max_tokens`` is re-split
	        at half its character length, and the pieces are checked again.
	        Original order is preserved. A chunk that cannot be split any
	        further is kept as is rather than dropped.
	        """
	        accepted = []
	        # Stack of chunks still to check; reversed so pop() yields them in
	        # their original order.
	        pending = list(reversed(chunks))
	        while pending:
	            chunk = pending.pop()
	            if len(self.tokenizer.encode(chunk)) <= self.max_tokens:
	                accepted.append(chunk)
	                continue

	            half = len(chunk) // 2
	            if half < 1:
	                accepted.append(chunk)
	                continue

	            # The overlap must stay below the halved chunk size, otherwise
	            # the splitter rejects the configuration with a ValueError.
	            splitter = RecursiveCharacterTextSplitter(
	                chunk_size=half,
	                chunk_overlap=min(self.FALLBACK_CHUNK_OVERLAP, half // 2),
	            )
	            pieces = splitter.split_text(chunk)
	            made_progress = len(pieces) > 1 and all(
	                len(piece) < len(chunk) for piece in pieces
	            )
	            if not made_progress:
	                # Guard against looping forever on an unsplittable chunk.
	                accepted.extend(pieces)
	                continue
	            pending.extend(reversed(pieces))
	        return accepted

	    def extract_text_from_docx(self, file_path):
	        """Extract text from a .docx file using Docling and return chunks.

	        Args:
	            file_path: Path of the .docx file (``str`` or ``Path``).

	        Returns:
	            A list of text chunks. The list is empty when the loader yields
	            no documents or when any step raises.
	        """
	        try:
	            loader = DoclingLoader(
	                file_path=str(file_path),
	                export_type=ExportType.MARKDOWN,
	            )
	            docs = loader.load()
	            if not docs:
	                return []

	            full_text = "\n".join(doc.page_content for doc in docs)
	            chunks = self.text_splitter.split_text(full_text)
	            return self._enforce_token_limit(chunks)
	        except Exception as e:
	            # Deliberate best effort: one unreadable file must not abort a
	            # whole directory run, so report the error and move on.
	            print(f"Error processing DOCX {file_path}: {e}")
	            return []

	    def get_faculty_program_from_path(self, file_path):
	        """Get faculty and program metadata based on the folder structure.

	        Args:
	            file_path: Path whose components are searched, left to right,
	                for a key of ``self.folder_mappings``.

	        Returns:
	            The mapping entry of the first matching component, or a dict
	            with every field set to "Unknown" when none matches.
	        """
	        normalized_mappings = {
	            self._nfc(name): meta
	            for name, meta in self.folder_mappings.items()
	        }
	        for part in Path(file_path).parts:
	            meta = normalized_mappings.get(self._nfc(part))
	            if meta is not None:
	                return meta
	        return dict(self.UNKNOWN_PROGRAM_INFO)

	    def _build_chunk_records(self, chunks, file_path):
	        """Pair every chunk with the metadata of the file it came from.

	        Args:
	            chunks: Text chunks extracted from the file.
	            file_path: ``Path`` of the source file.

	        Returns:
	            One dict per chunk, in chunk order.
	        """
	        info = self.get_faculty_program_from_path(file_path)
	        program_name = info["program_name"]

	        # Major and degree depend only on the file, not on the chunk, so
	        # they are resolved once up front.
	        english_title = self.DERIVED_MAJOR_TITLES.get(program_name)
	        if english_title is None:
	            major_name = info["major_name"]
	            degree = info["degree"]
	        else:
	            degree_prefix, eng_prefix = self._degree_labels(file_path.name)
	            major_name = (
	                f"{degree_prefix} {program_name}/{eng_prefix} of {english_title}"
	            )
	            degree = f"{degree_prefix}/{eng_prefix}"

	        return [
	            {
	                "content": chunk,
	                "department_brief": info["department_brief"],
	                "department_name": info["department_name"],
	                "program_brief": info["program_brief"],
	                "program_name": program_name,
	                "degree": degree,
	                "file_name": file_path.name,
	                "file_path": str(file_path),
	                "level": "Đại học/Undergraduate",
	                "major_name": major_name,
	                "major_code": info["major_code"],
	            }
	            for chunk in chunks
	        ]

	    def process_document(self, file_path):
	        """Process a single document and return its chunk records.

	        Args:
	            file_path: Path of the document (``str`` or ``Path``).

	        Returns:
	            A list with one metadata dict per chunk (possibly empty), or
	            ``None`` when the file is not a .docx file.
	        """
	        file_path = Path(file_path)
	        file_ext = file_path.suffix.lower()
	        if file_ext != '.docx':
	            print(f"Unsupported file type: {file_ext}")
	            return None

	        chunks = self.extract_text_from_docx(file_path)
	        return self._build_chunk_records(chunks, file_path)

	    def process_directory(self, directory_path, output_file):
	        """Process all .docx files under a directory and save them as JSON.

	        Args:
	            directory_path: Root folder, searched recursively.
	            output_file: Path of the UTF-8 JSON file to write.
	        """
	        files = self._find_docx_files(directory_path)
	        print(f"Found {len(files)} documents to process")

	        # Each entry is the list of chunk records of one file, so the JSON
	        # output is a list of lists. Files that produced nothing are left
	        # out.
	        all_documents = []
	        for file_path in tqdm(files, desc="Processing documents"):
	            doc_metadata = self.process_document(file_path)
	            if doc_metadata:
	                all_documents.append(doc_metadata)

	        print(f"Saving {len(all_documents)} documents to {output_file}")
	        with open(output_file, 'w', encoding='utf-8') as f:
	            json.dump(all_documents, f, ensure_ascii=False, indent=2)


	if __name__ == "__main__":
	    # Script entry point: build the processor, then convert one folder of
	    # documents into a single JSON file.
	    processor = DocumentProcessor()

	    # Edit these two values to point at your own input folder and output.
	    # NOTE(review): the "file_path" entries in the folder mappings start
	    # with "syllabus_nct_word_format/", not this folder name - confirm
	    # which one is current.
	    input_directory, output_file = (
	        "syllabus_nct_docx_format/",
	        "processed_documents_docx_v3.json",
	    )

	    processor.process_directory(input_directory, output_file)