Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import glob | |
| import uuid | |
| import re | |
| from typing import List | |
# Absolute paths for this pipeline step: cleaned documents come in from
# PROCESSED_DIR, one JSON file per chunk goes out to CHUNKS_DIR.
# NOTE(review): Windows-specific hard-coded paths — consider making these configurable.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
CHUNKS_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/chunks"
def split_into_sentences(text: str) -> List[str]:
    """Break *text* into sentences at '.', '!' or '?' followed by whitespace.

    The terminating punctuation stays attached to its sentence (lookbehind
    split), and blank fragments are discarded.
    """
    result: List[str] = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        fragment = fragment.strip()
        if fragment:
            result.append(fragment)
    return result
def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """Split *text* into word-bounded chunks that respect sentence edges.

    Sentences are accumulated until adding the next one would push the
    running word count past ``chunk_size``; the buffered chunk is then
    emitted and the next buffer is seeded with trailing sentences worth
    at least ``overlap`` words of context.

    Args:
        text: Raw document text; may be empty.
        chunk_size: Soft maximum chunk length, counted in whitespace-split words.
        overlap: Minimum word count of trailing context carried into the next chunk.

    Returns:
        List of chunk strings; empty list for empty input.
    """
    if not text:
        return []
    # Sentence split inlined: break at '.', '!' or '?' followed by whitespace,
    # keeping the punctuation and dropping blank fragments.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    chunks: List[str] = []
    buffer: List[str] = []
    buffer_words = 0
    for idx, sentence in enumerate(sentences):
        words = len(sentence.split())  # approximate token count
        if buffer and buffer_words + words > chunk_size:
            # Buffer is full — emit it, then seed the next buffer with
            # trailing sentences (walking backwards) until we have at
            # least `overlap` words of context.
            chunks.append(" ".join(buffer))
            carried: List[str] = []
            carried_words = 0
            j = idx - 1
            while j >= 0 and carried_words < overlap:
                carried.insert(0, sentences[j])
                carried_words += len(sentences[j].split())
                j -= 1
            buffer = carried + [sentence]
            buffer_words = carried_words + words
        else:
            buffer.append(sentence)
            buffer_words += words
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks
def assign_topic(text: str, title: str) -> str:
    """Assign a coarse topic label by keyword lookup; first matching rule wins."""
    haystack = (title + " " + text).lower()
    # Ordered rules: (topic label, keywords that trigger it).
    rules = [
        ("tiêm chủng", ('tiêm', 'vaccine', 'vắc xin', 'lịch')),
        ("lãnh đạo", ('lãnh đạo', 'giám đốc', 'viện trưởng', 'chức năng')),
        ("dịch tễ", ('dịch', 'bệnh', 'sốt xuất huyết', 'tay chân miệng', 'covid')),
        ("hành chính", ('hành chính', 'liên hệ', 'địa chỉ')),
    ]
    for topic, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return topic
    return "khác"
def main():
    """Chunk every processed document into per-chunk JSON files.

    Reads each ``*.json`` under PROCESSED_DIR (each is expected to carry
    ``text``, ``title`` and ``url`` keys), splits the text into overlapping
    chunks, and writes one metadata-bearing JSON file per chunk (named by a
    fresh UUID) into CHUNKS_DIR.
    """
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(CHUNKS_DIR, exist_ok=True)
    files = glob.glob(os.path.join(PROCESSED_DIR, "*.json"))
    print(f"Chunking {len(files)} files...")
    total_chunks = 0
    for filepath in files:
        with open(filepath, 'r', encoding='utf-8') as f:
            doc = json.load(f)
        chunks = chunk_text(doc['text'])
        for chunk_content in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_data = {
                "id": chunk_id,
                "text": chunk_content,
                "title": doc['title'],
                "url": doc['url'],
                "topic": assign_topic(chunk_content, doc['title']),
                "date": "2024-01-30",  # Default for now, as extraction logic for date is complex
                "source": "NIHE",
                "language": "vi",
                # Link each chunk back to its source document file.
                "parent_id": os.path.basename(filepath)
            }
            out_name = f"{chunk_id}.json"
            with open(os.path.join(CHUNKS_DIR, out_name), 'w', encoding='utf-8') as cf:
                json.dump(chunk_data, cf, ensure_ascii=False, indent=2)
            total_chunks += 1
    print(f"Chunking complete. Created {total_chunks} chunks.")


if __name__ == "__main__":
    main()