File size: 4,091 Bytes
f9b0dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import json
import glob
import uuid
import re
from typing import List

# Absolute Windows paths: cleaned input documents live in PROCESSED_DIR,
# one-JSON-per-chunk output is written to CHUNKS_DIR.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
CHUNKS_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/chunks"

def split_into_sentences(text: str) -> List[str]:
    """
    Break *text* into sentences on '.', '!' or '?' followed by whitespace.

    The terminating punctuation stays attached to its sentence, and
    whitespace-only fragments are discarded.
    """
    result: List[str] = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        cleaned = fragment.strip()
        if cleaned:
            result.append(cleaned)
    return result

def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """
    Split *text* into word-count-bounded chunks on sentence boundaries.

    Sentences are accumulated until adding the next one would push the
    buffer past ``chunk_size`` words; the buffer is then flushed and a
    new one is seeded with trailing sentences totalling at least
    ``overlap`` words (so adjacent chunks share context).

    Args:
        text: Raw document text; empty input yields an empty list.
        chunk_size: Approximate maximum words per chunk.
        overlap: Minimum words of context carried into the next chunk.

    Returns:
        List of space-joined sentence groups.
    """
    if not text:
        return []

    sentences = split_into_sentences(text)
    chunks: List[str] = []
    buffer: List[str] = []
    buffer_words = 0

    for idx, sent in enumerate(sentences):
        words = len(sent.split())  # word count approximates token count

        if buffer and buffer_words + words > chunk_size:
            # Buffer is full: flush it, then seed the next buffer with
            # enough trailing sentences to cover the overlap budget.
            chunks.append(" ".join(buffer))

            carried: List[str] = []
            carried_words = 0
            back = idx - 1
            while back >= 0 and carried_words < overlap:
                carried.insert(0, sentences[back])
                carried_words += len(sentences[back].split())
                back -= 1

            buffer = carried + [sent]
            buffer_words = carried_words + words
        else:
            buffer.append(sent)
            buffer_words += words

    if buffer:
        chunks.append(" ".join(buffer))

    return chunks

def assign_topic(text: str, title: str) -> str:
    """Assign a coarse topic label by keyword matching; first hit wins."""
    haystack = (title + " " + text).lower()
    # Ordered (label, keywords) pairs — order matters: e.g. 'lịch' claims
    # a document for "tiêm chủng" before the epidemiology keywords run.
    topic_keywords = [
        ("tiêm chủng", ('tiêm', 'vaccine', 'vắc xin', 'lịch')),
        ("lãnh đạo", ('lãnh đạo', 'giám đốc', 'viện trưởng', 'chức năng')),
        ("dịch tễ", ('dịch', 'bệnh', 'sốt xuất huyết', 'tay chân miệng', 'covid')),
        ("hành chính", ('hành chính', 'liên hệ', 'địa chỉ')),
    ]
    for label, keywords in topic_keywords:
        if any(keyword in haystack for keyword in keywords):
            return label
    return "khác"

def main():
    """
    Chunk every processed document and persist one JSON file per chunk.

    Reads each ``*.json`` document from PROCESSED_DIR (expects 'text',
    'title' and 'url' keys), splits its text with chunk_text(), and
    writes each chunk as ``<uuid>.json`` into CHUNKS_DIR with metadata
    (topic, source, parent file). Progress is reported on stdout.
    """
    # exist_ok avoids the check-then-create race of the exists()/makedirs() pair.
    os.makedirs(CHUNKS_DIR, exist_ok=True)

    files = glob.glob(os.path.join(PROCESSED_DIR, "*.json"))
    print(f"Chunking {len(files)} files...")

    total_chunks = 0

    for filepath in files:
        with open(filepath, 'r', encoding='utf-8') as f:
            doc = json.load(f)

        # Fixed: dropped unused enumerate index from the original loop.
        for chunk_content in chunk_text(doc['text']):
            chunk_id = str(uuid.uuid4())
            chunk_data = {
                "id": chunk_id,
                "text": chunk_content,
                "title": doc['title'],
                "url": doc['url'],
                "topic": assign_topic(chunk_content, doc['title']),
                "date": "2024-01-30",  # Default for now, as extraction logic for date is complex
                "source": "NIHE",
                "language": "vi",
                "parent_id": os.path.basename(filepath),
            }

            out_name = f"{chunk_id}.json"
            with open(os.path.join(CHUNKS_DIR, out_name), 'w', encoding='utf-8') as cf:
                json.dump(chunk_data, cf, ensure_ascii=False, indent=2)

            total_chunks += 1

    print(f"Chunking complete. Created {total_chunks} chunks.")

# Run the chunking pipeline only when executed directly as a script.
if __name__ == "__main__":
    main()