Spaces:
Sleeping
Sleeping
File size: 4,091 Bytes
f9b0dca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import os
import json
import glob
import uuid
import re
from typing import List
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
CHUNKS_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/chunks"
def split_into_sentences(text: str) -> List[str]:
    """Break *text* into sentences on '.', '!' or '?' boundaries.

    The terminating punctuation mark stays attached to its sentence
    (zero-width lookbehind), and whitespace-only fragments are dropped.
    """
    # Split on the whitespace that follows sentence-ending punctuation.
    fragments = re.split(r'(?<=[.!?])\s+', text)
    cleaned = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in cleaned if fragment]
def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """Split *text* into chunks of roughly *chunk_size* words, on sentence
    boundaries, carrying roughly *overlap* words of trailing context into
    each new chunk.

    Strategy: accumulate whole sentences until the word budget would be
    exceeded, emit the chunk, then seed the next chunk with enough of the
    previous sentences to cover the overlap budget.
    """
    if not text:
        return []
    # Sentence split: '.', '!' or '?' followed by whitespace, punctuation kept.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    chunks: List[str] = []
    buffer: List[str] = []
    buffer_words = 0
    for idx, sentence in enumerate(sentences):
        words = len(sentence.split())  # word count stands in for tokens
        if buffer and buffer_words + words > chunk_size:
            # Budget exhausted: flush the buffer as a finished chunk.
            chunks.append(" ".join(buffer))
            # Walk backwards through earlier sentences until the overlap
            # word budget is covered; they become the new chunk's prefix.
            carried: List[str] = []
            carried_words = 0
            back = idx - 1
            while back >= 0 and carried_words < overlap:
                carried.insert(0, sentences[back])
                carried_words += len(sentences[back].split())
                back -= 1
            buffer = carried + [sentence]
            buffer_words = carried_words + words
        else:
            buffer.append(sentence)
            buffer_words += words
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks
def assign_topic(text: str, title: str) -> str:
    """Label a chunk with a coarse topic via keyword matching.

    Title and text are lowercased and concatenated; the first rule whose
    keyword list hits wins, so rule order is significant.
    """
    haystack = (title + " " + text).lower()
    rules = (
        ("tiêm chủng", ('tiêm', 'vaccine', 'vắc xin', 'lịch')),
        ("lãnh đạo", ('lãnh đạo', 'giám đốc', 'viện trưởng', 'chức năng')),
        ("dịch tễ", ('dịch', 'bệnh', 'sốt xuất huyết', 'tay chân miệng', 'covid')),
        ("hành chính", ('hành chính', 'liên hệ', 'địa chỉ')),
    )
    for topic, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return topic
    return "khác"
def main() -> None:
    """Chunk every processed document and write one JSON file per chunk.

    Reads each ``*.json`` document from PROCESSED_DIR (expects keys
    'text', 'title', 'url' — KeyError if absent, as before), splits its
    text with chunk_text(), and writes each chunk to CHUNKS_DIR under a
    fresh UUID filename, linked back to its source via ``parent_id``.
    """
    # exist_ok=True replaces the racy exists()-then-makedirs() pattern.
    os.makedirs(CHUNKS_DIR, exist_ok=True)
    files = glob.glob(os.path.join(PROCESSED_DIR, "*.json"))
    print(f"Chunking {len(files)} files...")
    total_chunks = 0
    for filepath in files:
        with open(filepath, 'r', encoding='utf-8') as f:
            doc = json.load(f)
        chunks = chunk_text(doc['text'])
        for chunk_content in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_data = {
                "id": chunk_id,
                "text": chunk_content,
                "title": doc['title'],
                "url": doc['url'],
                "topic": assign_topic(chunk_content, doc['title']),
                "date": "2024-01-30",  # Default for now, as extraction logic for date is complex
                "source": "NIHE",
                "language": "vi",
                "parent_id": os.path.basename(filepath),
            }
            out_name = f"{chunk_id}.json"
            with open(os.path.join(CHUNKS_DIR, out_name), 'w', encoding='utf-8') as cf:
                json.dump(chunk_data, cf, ensure_ascii=False, indent=2)
            total_chunks += 1
    print(f"Chunking complete. Created {total_chunks} chunks.")
if __name__ == "__main__":
    main()
|