import os
import json
import glob
import uuid
import re
from typing import List

# Input: cleaned per-document JSON files; output: one JSON file per chunk.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
CHUNKS_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/chunks"


def split_into_sentences(text: str) -> List[str]:
    """
    Split *text* into sentences using regex, respecting common
    Vietnamese punctuation.

    Splits on '.', '!', '?' followed by whitespace; the terminating
    punctuation stays attached to its sentence (lookbehind split).
    Empty/whitespace-only fragments are dropped.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]


def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """
    Chunk *text* respecting sentence boundaries to avoid cut-off context.

    Strategy:
      1. Split into sentences.
      2. Group sentences until ``chunk_size`` (approximate word count)
         is reached.
      3. Start each new chunk with trailing sentences from the previous
         one until at least ``overlap`` words of context are included.

    Returns an empty list for empty input. A single sentence longer than
    ``chunk_size`` becomes its own (oversized) chunk rather than being cut.
    """
    if not text:
        return []

    sentences = split_into_sentences(text)
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0  # running word count of current_chunk

    i = 0
    while i < len(sentences):
        sentence = sentences[i]
        # Approximate token count by whitespace-separated words.
        sentence_len = len(sentence.split())

        if current_length + sentence_len > chunk_size and current_chunk:
            # Current chunk is full: save it.
            chunks.append(" ".join(current_chunk))

            # Start the next chunk with overlap: walk backwards over the
            # already-emitted sentences until >= `overlap` words captured.
            overlap_len = 0
            overlap_chunk: List[str] = []
            back_idx = i - 1
            while back_idx >= 0 and overlap_len < overlap:
                overlap_chunk.insert(0, sentences[back_idx])
                overlap_len += len(sentences[back_idx].split())
                back_idx -= 1

            current_chunk = overlap_chunk + [sentence]
            current_length = overlap_len + sentence_len
        else:
            current_chunk.append(sentence)
            current_length += sentence_len

        i += 1

    # Flush the final partial chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def assign_topic(text: str, title: str) -> str:
    """
    Heuristic topic assignment by keyword search over title + text
    (case-insensitive). First matching category wins; falls back to
    "khác" (other).
    """
    combined = (title + " " + text).lower()
    if any(k in combined for k in ['tiêm', 'vaccine', 'vắc xin', 'lịch']):
        return "tiêm chủng"
    if any(k in combined for k in ['lãnh đạo', 'giám đốc', 'viện trưởng', 'chức năng']):
        return "lãnh đạo"
    if any(k in combined for k in ['dịch', 'bệnh', 'sốt xuất huyết', 'tay chân miệng', 'covid']):
        return "dịch tễ"
    if any(k in combined for k in ['hành chính', 'liên hệ', 'địa chỉ']):
        return "hành chính"
    return "khác"


def main():
    """
    Chunk every processed document in PROCESSED_DIR and write one JSON
    file per chunk into CHUNKS_DIR, with topic/source metadata attached.
    """
    # exist_ok avoids the check-then-create race of the original
    # `if not exists: makedirs` pattern.
    os.makedirs(CHUNKS_DIR, exist_ok=True)

    files = glob.glob(os.path.join(PROCESSED_DIR, "*.json"))
    print(f"Chunking {len(files)} files...")

    total_chunks = 0
    for filepath in files:
        with open(filepath, 'r', encoding='utf-8') as f:
            doc = json.load(f)

        chunks = chunk_text(doc['text'])
        for i, chunk_content in enumerate(chunks):
            chunk_id = str(uuid.uuid4())
            chunk_data = {
                "id": chunk_id,
                "text": chunk_content,
                "title": doc['title'],
                "url": doc['url'],
                "topic": assign_topic(chunk_content, doc['title']),
                "date": "2024-01-30",  # Default for now, as extraction logic for date is complex
                "source": "NIHE",
                "language": "vi",
                # Link each chunk back to the document file it came from.
                "parent_id": os.path.basename(filepath)
            }
            out_name = f"{chunk_id}.json"
            with open(os.path.join(CHUNKS_DIR, out_name), 'w', encoding='utf-8') as cf:
                json.dump(chunk_data, cf, ensure_ascii=False, indent=2)
            total_chunks += 1

    print(f"Chunking complete. Created {total_chunks} chunks.")


if __name__ == "__main__":
    main()