File size: 4,091 Bytes
f9b0dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import json
import glob
import uuid
import re
from typing import List

# Absolute Windows paths: cleaned input documents live in PROCESSED_DIR,
# one-JSON-per-chunk output is written to CHUNKS_DIR.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
CHUNKS_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/chunks"

def split_into_sentences(text: str) -> List[str]:
    """
    Break *text* into sentences on '.', '!' or '?' followed by whitespace.

    The terminating punctuation stays attached to its sentence, and
    whitespace-only fragments are discarded.
    """
    result: List[str] = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        cleaned = fragment.strip()
        if cleaned:
            result.append(cleaned)
    return result

def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """
    Split *text* into word-count-bounded chunks on sentence boundaries.

    Sentences are accumulated until adding the next one would push the
    buffer past ``chunk_size`` words; the buffer is then flushed and a
    new one is seeded with trailing sentences totalling at least
    ``overlap`` words (so adjacent chunks share context).

    Args:
        text: Raw document text; empty input yields an empty list.
        chunk_size: Approximate maximum words per chunk.
        overlap: Minimum words of context carried into the next chunk.

    Returns:
        List of space-joined sentence groups.
    """
    if not text:
        return []

    sentences = split_into_sentences(text)
    chunks: List[str] = []
    buffer: List[str] = []
    buffer_words = 0

    for idx, sent in enumerate(sentences):
        words = len(sent.split())  # word count approximates token count

        if buffer and buffer_words + words > chunk_size:
            # Buffer is full: flush it, then seed the next buffer with
            # enough trailing sentences to cover the overlap budget.
            chunks.append(" ".join(buffer))

            carried: List[str] = []
            carried_words = 0
            back = idx - 1
            while back >= 0 and carried_words < overlap:
                carried.insert(0, sentences[back])
                carried_words += len(sentences[back].split())
                back -= 1

            buffer = carried + [sent]
            buffer_words = carried_words + words
        else:
            buffer.append(sent)
            buffer_words += words

    if buffer:
        chunks.append(" ".join(buffer))

    return chunks

def assign_topic(text: str, title: str) -> str:
    """Assign a coarse topic label by keyword matching; first hit wins."""
    haystack = (title + " " + text).lower()
    # Ordered (label, keywords) pairs — order matters: e.g. 'lịch' claims
    # a document for "tiêm chủng" before the epidemiology keywords run.
    topic_keywords = [
        ("tiêm chủng", ('tiêm', 'vaccine', 'vắc xin', 'lịch')),
        ("lãnh đạo", ('lãnh đạo', 'giám đốc', 'viện trưởng', 'chức năng')),
        ("dịch tễ", ('dịch', 'bệnh', 'sốt xuất huyết', 'tay chân miệng', 'covid')),
        ("hành chính", ('hành chính', 'liên hệ', 'địa chỉ')),
    ]
    for label, keywords in topic_keywords:
        if any(keyword in haystack for keyword in keywords):
            return label
    return "khác"

def main():
    """
    Chunk every processed document and persist one JSON file per chunk.

    Reads each ``*.json`` document from PROCESSED_DIR (expects 'text',
    'title' and 'url' keys), splits its text with chunk_text(), and
    writes each chunk as ``<uuid>.json`` into CHUNKS_DIR with metadata
    (topic, source, parent file). Progress is reported on stdout.
    """
    # exist_ok avoids the check-then-create race of the exists()/makedirs() pair.
    os.makedirs(CHUNKS_DIR, exist_ok=True)

    files = glob.glob(os.path.join(PROCESSED_DIR, "*.json"))
    print(f"Chunking {len(files)} files...")

    total_chunks = 0

    for filepath in files:
        with open(filepath, 'r', encoding='utf-8') as f:
            doc = json.load(f)

        # Fixed: dropped unused enumerate index from the original loop.
        for chunk_content in chunk_text(doc['text']):
            chunk_id = str(uuid.uuid4())
            chunk_data = {
                "id": chunk_id,
                "text": chunk_content,
                "title": doc['title'],
                "url": doc['url'],
                "topic": assign_topic(chunk_content, doc['title']),
                "date": "2024-01-30",  # Default for now, as extraction logic for date is complex
                "source": "NIHE",
                "language": "vi",
                "parent_id": os.path.basename(filepath),
            }

            out_name = f"{chunk_id}.json"
            with open(os.path.join(CHUNKS_DIR, out_name), 'w', encoding='utf-8') as cf:
                json.dump(chunk_data, cf, ensure_ascii=False, indent=2)

            total_chunks += 1

    print(f"Chunking complete. Created {total_chunks} chunks.")

# Run the chunking pipeline only when executed directly as a script.
if __name__ == "__main__":
    main()