Spaces:

hung2903
/

chatbot_nihe

Sleeping

chatbot_nihe / src /preprocessing /refine_chunks.py

Auto Deploy Script

Auto deploy from local machine

f9b0dca 7 days ago

5.64 kB

	import os
	import json
	import glob
	import re
	from collections import Counter

	# Directory that main() scans for chunk ``*.json`` files.
	# The default is the original machine-specific path; set the
	# NIHE_CHUNKS_DIR environment variable to point the script elsewhere
	# without editing the source. An empty value falls back to the default.
	CHUNKS_DIR = (
	    os.environ.get("NIHE_CHUNKS_DIR")
	    or "d:/NLP_KMH/Chatbot_NIHE_v2/data/chunks"
	)

	# Topic label -> lowercase keyword phrases used to score a chunk.
	# Keywords are matched as plain substrings of the lowercased title + text,
	# so every phrase here must itself be lowercase.
	# Order matters: on a score tie the topic listed first wins.
	TOPIC_KEYWORDS = {
	    "Tiêm chủng & Vắc xin": [
	        "tiêm chủng",
	        "vắc xin",
	        "vaccine",
	        "lịch tiêm",
	        "tiêm phòng",
	        "phản ứng sau tiêm",
	        "an toàn tiêm chủng",
	    ],
	    "Bệnh truyền nhiễm": [
	        "sốt xuất huyết",
	        "tay chân miệng",
	        "sởi",
	        "cúm",
	        "bệnh dại",
	        "covid",
	        "hiv",
	        "aids",
	        "viêm gan",
	        "lao",
	        "bạch hầu",
	        "ho gà",
	        "uốn ván",
	        "vi rút",
	        "vi khuẩn",
	        "truyền nhiễm",
	        "dịch bệnh",
	        "h5n1",
	        "ev71",
	    ],
	    "Bệnh không lây nhiễm": [
	        "bệnh không lây nhiễm",
	        "tiểu đường",
	        "đái tháo đường",
	        "ung thư",
	        "tim mạch",
	        "huyết áp",
	        "tâm thần",
	        "trầm cảm",
	        "béo phì",
	        "dinh dưỡng",
	        "rối loạn chuyển hóa",
	    ],
	    "Sức khỏe cộng đồng": [
	        "sức khỏe cộng đồng",
	        "vệ sinh môi trường",
	        "nước sạch",
	        "hút thuốc",
	        "rượu bia",
	        "vận động",
	        "người cao tuổi",
	        "trường học",
	        "y tế công cộng",
	        "phòng chống tai nạn",
	    ],
	    "Xét nghiệm & Chẩn đoán": [
	        "xét nghiệm",
	        "chẩn đoán",
	        "lấy mẫu",
	        "sinh học phân tử",
	        "quy trình xét nghiệm",
	        "kết quả xét nghiệm",
	        "giám sát phòng thí nghiệm",
	    ],
	    "Nghiên cứu & Đào tạo": [
	        "nghiên cứu khoa học",
	        "đào tạo",
	        "tuyển sinh",
	        "luận án",
	        "tiến sĩ",
	        "thạc sĩ",
	        "nghiên cứu sinh",
	        "thử nghiệm lâm sàng",
	        "hội nghị khoa học",
	        "tạp chí",
	    ],
	    "Hợp tác quốc tế": [
	        "hợp tác quốc tế",
	        "dự án",
	        "who",
	        "cdc",
	        "đối tác",
	        "viện trợ",
	        "đoàn công tác",
	    ],
	    "Phòng chống dịch": [
	        "phòng chống dịch",
	        "kiểm soát dịch",
	        "khai báo y tế",
	        "cách ly",
	        "giãn cách",
	        "truy vết",
	        "ổ dịch",
	        "đáp ứng khẩn cấp",
	    ],
	    "Giới thiệu & Tổ chức": [
	        "giới thiệu viện",
	        "lịch sử",
	        "cơ cấu tổ chức",
	        "lãnh đạo viện",
	        "ban giám đốc",
	        "chức năng nhiệm vụ",
	        "đảng bộ",
	        "công đoàn",
	    ],
	    "Tin tức & Sự kiện": [
	        "tin tức",
	        "sự kiện",
	        "hoạt động",
	        "thông báo",
	        "chúc mừng",
	        "gặp mặt",
	        "tiếp đón",
	    ],
	}

	def is_garbage(doc):
	    """Return True if a chunk looks like navigation noise or unwanted content.

	    Args:
	        doc: Chunk record (a dict). Only the "text", "url" and "title" keys
	            are read; a missing or null (None) value is treated as "".

	    Returns:
	        True as soon as one of the rules below fires, otherwise False.
	    """
	    # `or ""` also covers keys that are present but null in the JSON;
	    # `.get(key, "")` alone would hand back None and crash the checks below.
	    text = doc.get("text") or ""
	    url = doc.get("url") or ""
	    title = doc.get("title") or ""

	    # 1. Category/tag URLs are usually list pages, not content.
	    if "Category:" in url or "danh-muc" in url:
	        return True

	    # 2. Repeated menu labels indicate a scraped navigation bar.
	    if text.count("Trang chủ") > 2 or text.count("Giới thiệu") > 3:
	        return True

	    # 3. Procurement/commercial notices and error pages, matched
	    #    case-insensitively against both the body and the title.
	    garbage_keywords = [
	        "mời báo giá", "yêu cầu báo giá", "chào giá", "đấu thầu", "gói thầu", "mua sắm hàng hóa",
	        "kết quả lựa chọn nhà thầu", "catalogue", "sitemap", "404 not found", "access denied",
	        "file đính kèm", "danh sách", "kết quả thẩm định", "thông báo mời",
	    ]
	    # Lowercase once instead of once per keyword.
	    text_lower = text.lower()
	    title_lower = title.lower()
	    if any(k in text_lower or k in title_lower for k in garbage_keywords):
	        return True

	    # Boilerplate greeting repeated more than twice.
	    noise_phrases = ["Chào mừng bạn đến với NIHE"]
	    if any(text.count(phrase) > 2 for phrase in noise_phrases):
	        return True

	    # 4. Too short to carry useful content.
	    return len(text.strip()) < 150

	def assign_topic_advanced(text, title="", topic_keywords=None):
	    """Pick the topic whose keywords occur most often in the title and text.

	    Scoring is a plain, case-insensitive substring count: every occurrence of
	    every keyword adds one point to its topic. Title and body are weighted
	    equally.

	    Args:
	        text: Chunk body. None is treated as an empty string.
	        title: Optional chunk title. None is treated as an empty string.
	        topic_keywords: Optional mapping of topic label -> iterable of
	            lowercase keyword phrases. Defaults to the module-level
	            TOPIC_KEYWORDS table.

	    Returns:
	        The label with the highest score; on a tie the topic that comes
	        first in the mapping wins. Returns "Tin tức chung" when no keyword
	        matches or the mapping is empty.
	    """
	    if topic_keywords is None:
	        topic_keywords = TOPIC_KEYWORDS

	    # `or ""` keeps a null title/text from raising on concatenation.
	    haystack = f"{title or ''} {text or ''}".lower()

	    scores = {
	        topic: sum(haystack.count(kw) for kw in keywords)
	        for topic, keywords in topic_keywords.items()
	    }

	    # max() returns the first maximal key, which gives the tie-break above;
	    # default=None covers an empty keyword table.
	    best_topic = max(scores, key=scores.get, default=None)

	    if best_topic is not None and scores[best_topic] > 0:
	        return best_topic
	    return "Tin tức chung"

	def main():
	    """Clean up the chunk directory in place.

	    For every ``*.json`` file in CHUNKS_DIR:
	      * files that are not valid JSON are deleted;
	      * chunks flagged by is_garbage() are deleted and counted;
	      * remaining chunks get their topic recomputed with
	        assign_topic_advanced(); when it differs from the stored one the
	        file is rewritten with the new topic and ``language`` set to 'vi'.

	    Progress and a final summary are printed to stdout.
	    """
	    print(f"Scanning chunks in {CHUNKS_DIR}...")
	    files = glob.glob(os.path.join(CHUNKS_DIR, "*.json"))

	    deleted_count = 0
	    updated_count = 0

	    for filepath in files:
	        # Keep the read handle scoped to the load only, so the file is
	        # closed before it is removed or reopened for writing. Removing a
	        # file that is still open fails on Windows.
	        try:
	            with open(filepath, 'r', encoding='utf-8') as f:
	                doc = json.load(f)
	        except json.JSONDecodeError:
	            print(f"Error reading {filepath}, deleting.")
	            os.remove(filepath)
	            continue

	        # Valid JSON that is not an object cannot be a chunk record; leave
	        # the file untouched rather than crash on .get() below.
	        if not isinstance(doc, dict):
	            print(f"Skipping {filepath}: expected a JSON object.")
	            continue

	        # 1. Filter Garbage
	        if is_garbage(doc):
	            print(f"Deleting garbage chunk: {doc.get('id')} ({doc.get('title')})")
	            os.remove(filepath)
	            deleted_count += 1
	            continue

	        # 2. Refine Topic
	        # Always recompute; the stored topic is only trusted if it agrees.
	        new_topic = assign_topic_advanced(doc.get('text', ''), doc.get('title', ''))

	        if new_topic != doc.get('topic'):
	            doc['topic'] = new_topic
	            doc['language'] = 'vi'  # Stamped together with the topic rewrite

	            with open(filepath, 'w', encoding='utf-8') as f:
	                json.dump(doc, f, ensure_ascii=False, indent=2)
	            updated_count += 1

	    print("Refinement Complete.")
	    print(f"- Deleted {deleted_count} garbage files.")
	    print(f"- Updated topic for {updated_count} files.")

	# Script entry point: run the refinement only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()