Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import glob | |
| import uuid | |
| import re | |
| from typing import List | |
# Absolute paths for this pipeline step: cleaned documents come in from
# PROCESSED_DIR, one JSON file per chunk goes out to CHUNKS_DIR.
# NOTE(review): Windows-specific hard-coded paths — consider making these configurable.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
CHUNKS_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/chunks"
def split_into_sentences(text: str) -> List[str]:
    """Break *text* into sentences at '.', '!' or '?' followed by whitespace.

    The terminating punctuation stays attached to its sentence (lookbehind
    split), and blank fragments are discarded.
    """
    result: List[str] = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        fragment = fragment.strip()
        if fragment:
            result.append(fragment)
    return result
def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """Split *text* into word-bounded chunks that respect sentence edges.

    Sentences are accumulated until adding the next one would push the
    running word count past ``chunk_size``; the buffered chunk is then
    emitted and the next buffer is seeded with trailing sentences worth
    at least ``overlap`` words of context.

    Args:
        text: Raw document text; may be empty.
        chunk_size: Soft maximum chunk length, counted in whitespace-split words.
        overlap: Minimum word count of trailing context carried into the next chunk.

    Returns:
        List of chunk strings; empty list for empty input.
    """
    if not text:
        return []
    # Sentence split inlined: break at '.', '!' or '?' followed by whitespace,
    # keeping the punctuation and dropping blank fragments.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    chunks: List[str] = []
    buffer: List[str] = []
    buffer_words = 0
    for idx, sentence in enumerate(sentences):
        words = len(sentence.split())  # approximate token count
        if buffer and buffer_words + words > chunk_size:
            # Buffer is full — emit it, then seed the next buffer with
            # trailing sentences (walking backwards) until we have at
            # least `overlap` words of context.
            chunks.append(" ".join(buffer))
            carried: List[str] = []
            carried_words = 0
            j = idx - 1
            while j >= 0 and carried_words < overlap:
                carried.insert(0, sentences[j])
                carried_words += len(sentences[j].split())
                j -= 1
            buffer = carried + [sentence]
            buffer_words = carried_words + words
        else:
            buffer.append(sentence)
            buffer_words += words
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks
def assign_topic(text: str, title: str) -> str:
    """Assign a coarse topic label by keyword lookup; first matching rule wins."""
    haystack = (title + " " + text).lower()
    # Ordered rules: (topic label, keywords that trigger it).
    rules = [
        ("tiêm chủng", ('tiêm', 'vaccine', 'vắc xin', 'lịch')),
        ("lãnh đạo", ('lãnh đạo', 'giám đốc', 'viện trưởng', 'chức năng')),
        ("dịch tễ", ('dịch', 'bệnh', 'sốt xuất huyết', 'tay chân miệng', 'covid')),
        ("hành chính", ('hành chính', 'liên hệ', 'địa chỉ')),
    ]
    for topic, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return topic
    return "khác"
def main():
    """Chunk every processed document into per-chunk JSON files.

    Reads each ``*.json`` under PROCESSED_DIR (each is expected to carry
    ``text``, ``title`` and ``url`` keys), splits the text into overlapping
    chunks, and writes one metadata-bearing JSON file per chunk (named by a
    fresh UUID) into CHUNKS_DIR.
    """
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(CHUNKS_DIR, exist_ok=True)
    files = glob.glob(os.path.join(PROCESSED_DIR, "*.json"))
    print(f"Chunking {len(files)} files...")
    total_chunks = 0
    for filepath in files:
        with open(filepath, 'r', encoding='utf-8') as f:
            doc = json.load(f)
        chunks = chunk_text(doc['text'])
        for chunk_content in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_data = {
                "id": chunk_id,
                "text": chunk_content,
                "title": doc['title'],
                "url": doc['url'],
                "topic": assign_topic(chunk_content, doc['title']),
                "date": "2024-01-30",  # Default for now, as extraction logic for date is complex
                "source": "NIHE",
                "language": "vi",
                # Link each chunk back to its source document file.
                "parent_id": os.path.basename(filepath)
            }
            out_name = f"{chunk_id}.json"
            with open(os.path.join(CHUNKS_DIR, out_name), 'w', encoding='utf-8') as cf:
                json.dump(chunk_data, cf, ensure_ascii=False, indent=2)
            total_chunks += 1
    print(f"Chunking complete. Created {total_chunks} chunks.")


if __name__ == "__main__":
    main()