# Page header captured together with this file (not part of the program):
#   Spaces: buianh0803 / NCT_chatbot_QA
#   Runtime error
#   File size: 15,854 Bytes

import json
import os
import unicodedata
from pathlib import Path

from docx import Document
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
from transformers import AutoTokenizer


class DocumentProcessor:
    """Turn .docx curriculum files into metadata-tagged text chunks.

    Each document is converted to Markdown with Docling, split into
    overlapping character chunks, and every chunk is paired with the
    department/program metadata of the program folder it was found in.
    ``process_directory`` writes the result to a JSON file.
    """

    # Hugging Face model id used for both the tokenizer and the embeddings.
    EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # Primary splitter settings, measured in characters (not tokens).
    CHUNK_SIZE = 10000
    CHUNK_OVERLAP = 3000
    # A chunk whose token count exceeds this is split once more.
    # NOTE(review): confirm this ceiling against the embedding model's
    # actual maximum sequence length.
    MAX_TOKENS_PER_CHUNK = 10000
    # Preferred overlap (characters) when re-splitting an oversized chunk.
    FALLBACK_CHUNK_OVERLAP = 2000

    def __init__(self):
        """Load the tokenizer and embedding model and build the lookup tables.

        The embedding model is created here but is not used by any method
        of this class as shown.
        """
        print('Initializing embedding model...')
        self.tokenizer = AutoTokenizer.from_pretrained(self.EMBEDDING_MODEL_NAME)
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.EMBEDDING_MODEL_NAME
        )
        # Define text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.CHUNK_SIZE,  # Characters, not tokens
            chunk_overlap=self.CHUNK_OVERLAP
        )

        # Program folder name -> metadata attached to every chunk found
        # under a folder with that name.
        self.folder_mappings = {
            "Chương trình An toàn thông tin": {
                "department_brief": "FIT",
                "department_name": "Khoa Công nghệ thông tin",
                "program_brief": "IS",
                "program_name": "Chương trình An toàn thông tin",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân An toàn thông tin/Bachelor of Information Security",
                "major_code": "7480202",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình An toàn thông tin/CTĐT ngành An toàn thông tin.docx"
            },
            "Chương trình Công nghệ thông tin": {
                "department_brief": "FIT",
                "department_name": "Khoa Công nghệ thông tin",
                "program_brief": "IT",
                "program_name": "Chương trình Công nghệ thông tin",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Công nghệ thông tin/Bachelor of Information Technology",
                "major_code": "7480201",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Công nghệ thông tin/Công nghệ thông tin_CTĐT_2023.docx"
            },
            "Chương trình Khoa học máy tính": {
                "department_brief": "FIT",
                "department_name": "Khoa Công nghệ thông tin",
                "program_brief": "CS",
                "program_name": "Chương trình Khoa học máy tính",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Khoa học máy tính/Bachelor of Computer Science",
                "major_code": "7480101",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Khoa học máy tính/Khoa học máy tính_CTĐT_2023.docx"
            },
            "Chương trình Kỹ thuật phần mềm": {
                "department_brief": "FIT",
                "department_name": "Khoa Công nghệ thông tin",
                "program_brief": "SE",
                "program_name": "Chương trình Kỹ thuật phần mềm",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Kỹ thuật phần mềm/Bachelor of Software Engineering",
                "major_code": "7480103",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Kỹ thuật phần mềm/CTĐT ngành Kỹ thuật phần mềm.docx"
            },
            "Chương trình Định phí bảo hiểm và Quản trị rủi ro": {
                "department_brief": "MFE",
                "department_name": "Khoa Toán kinh tế",
                "program_brief": "Actuary",
                "program_name": "Chương trình Định phí bảo hiểm và Quản trị rủi ro",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Định phí bảo hiểm và Quản trị rủi ro/Bachelor of Actuarial Science",
                "major_code": "7310108",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Định phí bảo hiểm và Quản trị rủi ro/CTĐT ngành Định phí bảo hiểm và Quản trị rủi ro.docx"
            },
            "Chương trình Hệ thống thông tin": {
                "department_brief": "MIS",
                "department_name": "Khoa Hệ thống thông tin quản lý",
                "program_brief": "IS",
                "program_name": "Chương trình Hệ thống thông tin",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Hệ thống thông tin/Bachelor of Information Systems",
                "major_code": "7480104",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Hệ thống thông tin/CTĐT ngành HTTT.docx"
            },
            "Chương trình Hệ thống thông tin quản lý": {
                "department_brief": "MIS",
                "department_name": "Khoa Hệ thống thông tin quản lý",
                "program_brief": "MIS",
                "program_name": "Chương trình Hệ thống thông tin quản lý",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Hệ thống thông tin quản lý/Bachelor of Management Information Systems",
                "major_code": "7340405",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Hệ thống thông tin quản lý/Hệ thống thông tin quản lý_CTĐT_2023.docx"
            },
            "Chương trình Phân tích dữ liệu trong Kinh tế": {
                "department_brief": "MFE",
                "department_name": "Khoa Toán kinh tế",
                "program_brief": "DSEB",
                "program_name": "Chương trình Phân tích dữ liệu trong Kinh tế",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Khoa học dữ liệu trong kinh tế và kinh doanh/Bachelor of Data Science in Economics and Business",
                "major_code": "7310108",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Phân tích dữ liệu trong Kinh tế/DSEB_khung_chuong_trinh.docx"
            },
            "Chương trình Thống kê kinh tế": {
                "department_brief": "KTK",
                "department_name": "Khoa thống kê",
                "program_brief": "ES",
                "program_name": "Chương trình Thống kê kinh tế",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Thống kê kinh tế/Bachelor of Economic Statistics",
                "major_code": "7310107",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Thống kê kinh tế/Thống kê kinh tế_CTĐT_2023.docx"
            },
            "Chương trình Toán kinh tế": {
                "department_brief": "MFE",
                "department_name": "Khoa Toán kinh tế",
                "program_brief": "TOKT",
                "program_name": "Chương trình Toán kinh tế",
                "degree": "Cử nhân/Bachelor",
                "level": "Đại học/Undergraduate",
                "major_name": "Cử nhân Toán kinh tế/Bachelor of Mathematical Economics",
                "major_code": "7310108",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Toán kinh tế/Toán kinh tế_CTĐT_2023.docx"
            },
            "Chương trình Trí tuệ nhân tạo": {
                "department_brief": "FDA",
                "department_name": "Khoa Khoa học dữ liệu và Trí tuệ nhân tạo",
                "program_brief": "TTNT",
                "program_name": "Chương trình Trí tuệ nhân tạo",
                # "degree" and "major_name" are left empty on purpose:
                # process_document() derives both from the file name
                # (engineer vs. bachelor) for this program.
                "degree": "",
                "level": "Đại học/Undergraduate",
                "major_name": "",
                "major_code": "7480107",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Trí tuệ nhân tạo/22.5.1.CTĐT_ngành TTNT_cử nhân.docx"
            },
            "Chương trình Khoa học dữ liệu": {
                "department_brief": "FDA",
                "department_name": "Khoa Khoa học dữ liệu và Trí tuệ nhân tạo",
                "program_brief": "KHDL",
                "program_name": "Chương trình Khoa học dữ liệu",
                # "degree" and "major_name" are left empty on purpose:
                # process_document() derives both from the file name
                # (engineer vs. bachelor) for this program.
                "degree": "",
                "level": "Đại học/Undergraduate",
                "major_name": "",
                "major_code": "7460108",
                "file_path": "syllabus_nct_word_format/Trường Công nghệ/Chương trình Khoa học dữ liệu/22.5.1.CTĐT_ngành KHDL_cử nhân.docx"
            }
        }

    @staticmethod
    def _nfc(text):
        """Return *text* in Unicode NFC form so equal-looking names compare equal."""
        return unicodedata.normalize("NFC", text)

    def _split_oversized_chunk(self, chunk):
        """Split one over-long chunk roughly in half and return the pieces.

        The overlap is clamped below the chunk size; a fixed overlap that is
        larger than the chunk size makes the splitter raise ``ValueError``.
        The resulting pieces are not re-checked against the token limit.
        """
        half = max(len(chunk) // 2, 1)
        overlap = min(self.FALLBACK_CHUNK_OVERLAP, half - 1)
        smaller_splitter = RecursiveCharacterTextSplitter(
            chunk_size=half,
            chunk_overlap=overlap,
        )
        return smaller_splitter.split_text(chunk)

    def extract_text_from_docx(self, file_path):
        """Extract text from a .docx file using Docling and return chunks.

        Args:
            file_path: Location of the .docx file.

        Returns:
            A list of text chunks (Markdown export). The list is empty when
            Docling yields no documents or when any step raises; in the
            latter case the error is printed rather than propagated so one
            bad file does not abort a whole directory run.
        """
        try:
            loader = DoclingLoader(
                file_path=file_path,
                export_type=ExportType.MARKDOWN,
            )
            docs = loader.load()
            if not docs:
                return []

            # Join every loaded part first so chunk boundaries are chosen
            # over the whole document.
            full_text = "\n".join(doc.page_content for doc in docs)

            valid_chunks = []
            for chunk in self.text_splitter.split_text(full_text):
                token_count = len(self.tokenizer.encode(chunk))
                if token_count <= self.MAX_TOKENS_PER_CHUNK:
                    valid_chunks.append(chunk)
                else:
                    valid_chunks.extend(self._split_oversized_chunk(chunk))
            return valid_chunks

        except Exception as e:
            # Deliberate best-effort: report and skip this document.
            print(f"Error processing DOCX {file_path}: {e}")
            return []

    def get_faculty_program_from_path(self, file_path):
        """Get faculty and program metadata based on the folder structure.

        Every component of *file_path* is compared with the keys of
        ``self.folder_mappings``; the first component (in path order) that
        matches wins. Names are compared in Unicode NFC form, so a folder
        name stored decomposed (NFD) on disk still matches its composed key.

        Args:
            file_path: Path (str or ``Path``) of the document.

        Returns:
            The mapping entry of the matching program folder (the stored
            dict itself, not a copy), or a dict with the same keys whose
            values are all ``"Unknown"`` when no component matches.
        """
        by_normalized_name = {
            self._nfc(folder): info
            for folder, info in self.folder_mappings.items()
        }

        for part in Path(file_path).parts:
            info = by_normalized_name.get(self._nfc(part))
            if info is not None:
                return info

        return {
            "department_brief": "Unknown",
            "department_name": "Unknown",
            "program_brief": "Unknown",
            "program_name": "Unknown",
            "degree": "Unknown",
            "level": "Unknown",
            "major_name": "Unknown",
            "major_code": "Unknown",
            "file_path": "Unknown"
        }

    def process_document(self, file_path):
        """Process a single document and return one record per text chunk.

        Args:
            file_path: Path (str or ``Path``) of the document.

        Returns:
            ``None`` for an unsupported file type (only ``.docx`` is
            handled); otherwise a list of dicts, one per chunk, holding the
            chunk text under ``"content"`` plus program metadata. The list
            is empty when no text could be extracted.
        """
        file_path = Path(file_path)
        file_ext = file_path.suffix.lower()

        # Extract text chunks based on file type
        if file_ext not in ('.docx',):
            print(f"Unsupported file type: {file_ext}")
            return None
        chunks = self.extract_text_from_docx(file_path)

        # Get faculty and program info
        faculty_program_info = self.get_faculty_program_from_path(file_path)

        # Determine degree type from the file name. Compare in NFC form so a
        # decomposed file name is still recognised.
        file_name = self._nfc(file_path.name.lower())
        if self._nfc("kỹ sư") in file_name:
            degree_prefix = "Kỹ sư"
            eng_prefix = "Engineer"
        else:
            degree_prefix = "Cử nhân"
            eng_prefix = "Bachelor"

        # major_name/degree depend only on the program and the file name,
        # so they are computed once rather than once per chunk.
        program_name = faculty_program_info["program_name"]
        if program_name == "Chương trình Khoa học dữ liệu":
            major_name = f"{degree_prefix} {program_name}/{eng_prefix} of Data Science in Finance and E-commerce"
            degree = f"{degree_prefix}/{eng_prefix}"
        elif program_name == "Chương trình Trí tuệ nhân tạo":
            major_name = f"{degree_prefix} {program_name}/{eng_prefix} of Artificial Intelligence"
            degree = f"{degree_prefix}/{eng_prefix}"
        else:
            major_name = faculty_program_info["major_name"]
            degree = faculty_program_info["degree"]

        return [
            {
                "content": chunk,
                "department_brief": faculty_program_info["department_brief"],
                "department_name": faculty_program_info["department_name"],
                "program_brief": faculty_program_info["program_brief"],
                "program_name": program_name,
                "degree": degree,
                "file_name": file_path.name,
                "file_path": str(file_path),
                # Fixed value; the mapping's own "level" entry is not used.
                "level": "Đại học/Undergraduate",
                "major_name": major_name,
                "major_code": faculty_program_info["major_code"]
            }
            for chunk in chunks
        ]

    def process_directory(self, directory_path, output_file):
        """Process all .docx documents in a directory tree and save to JSON.

        Args:
            directory_path: Root folder searched recursively for ``*.docx``.
            output_file: Path of the UTF-8 JSON file to write.

        The JSON holds one entry per file that produced at least one chunk;
        each entry is that file's list of chunk records.
        """
        directory = Path(directory_path)
        all_documents = []

        # Materialised so the count can be printed and tqdm knows the total.
        files = list(directory.glob("**/*.docx"))

        print(f"Found {len(files)} documents to process")

        # Process each file
        for file_path in tqdm(files, desc="Processing documents"):
            doc_metadata = self.process_document(file_path)
            if doc_metadata:
                # NOTE(review): this nests one list per file instead of
                # flattening the chunks; kept because consumers of the JSON
                # may rely on that shape - confirm before changing.
                all_documents.append(doc_metadata)

        # Save to JSON file
        print(f"Saving {len(all_documents)} documents to {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_documents, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    # Folder holding the .docx syllabi; point this at your own documents
    # directory.
    source_dir = "syllabus_nct_docx_format/"
    # JSON file that receives the processed chunks.
    json_target = "processed_documents_docx_v3.json"

    # Build the processor, then convert the whole folder in one call.
    DocumentProcessor().process_directory(source_dir, json_target)