Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import json | |
| import glob | |
# Directory of raw .txt pages scraped with Selenium (pipeline input).
RAW_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/raw/raw_selenium"
# Output directory for the cleaned per-page JSON documents.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
def clean_text(text):
    """Strip footer/navigation noise from scraped page text.

    Truncates the text at the first "related articles" / footer marker,
    then drops empty lines, navigation-only lines, and copyright lines.
    Returns the surviving lines joined with newlines.
    """
    # Patterns that mark the start of footer/related-content sections.
    truncate_patterns = [
        r'CÁC BÀI VIẾT LIÊN QUAN',   # "related articles" block
        r'DANH MỤC\s+TIÊM CHỦNG',    # "vaccination category" footer
    ]
    # Literal stop phrases — re.escape so a regex metacharacter in a
    # phrase can never silently change the split behaviour.
    stop_phrases = ["TIN CÙNG CHUYÊN MỤC", "TIN KHÁC", "BÌNH LUẬN", "Tóm lược bài viết"]
    truncate_patterns.extend(re.escape(phrase) for phrase in stop_phrases)
    for pattern in truncate_patterns:
        # Keep only the content before the first marker occurrence.
        text = re.split(pattern, text, flags=re.IGNORECASE)[0]
    # Drop empty, navigation, and copyright lines.
    cleaned_lines = []
    for line in (raw.strip() for raw in text.split('\n')):
        if not line:
            continue
        if line.lower() in ("trang chủ", "giới thiệu", "tin tức", "liên hệ"):
            continue
        if line.startswith("Copyright") or "All rights reserved" in line:
            continue
        cleaned_lines.append(line)
    return '\n'.join(cleaned_lines)
def is_garbage_content(text):
    """Return True when the text looks like procurement/catalog/error-page
    noise rather than real article content."""
    lowered = text.lower()
    garbage_keywords = (
        "mời báo giá", "yêu cầu báo giá", "chào giá", "đấu thầu", "gói thầu", "mua sắm hàng hóa",
        "kết quả lựa chọn nhà thầu", "danh mục", "catalogue", "sitemap",
        "404 not found", "access denied", "đang cập nhật", "danh sách", "file đính kèm",
    )
    # A single keyword hit is enough to classify the page as garbage.
    return any(keyword in lowered for keyword in garbage_keywords)
def process_file(filepath):
    """Parse one raw scraped .txt file and write it to PROCESSED_DIR as JSON.

    Extracts the "Title:/URL:" header when present, cleans the body, and
    skips files that are too short or look like procurement/catalog noise.
    Errors are reported per-file so a batch run keeps going.
    """
    filename = os.path.basename(filepath)
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        # Parse the "Title: ...\nURL: ...\nLength: ...\n\n====" header.
        # NOTE: no re.DOTALL here — with DOTALL the greedy (.*) groups span
        # newlines and can bind to a later "URL:" deep inside the body.
        header_match = re.search(r'Title: (.*)\nURL: (.*)\nLength: .*\n\n=+', content)
        if header_match:
            title = header_match.group(1).strip()
            url = header_match.group(2).strip()
            # Body is everything after the 80-char '=' separator line.
            body = content.split('=' * 80)[-1].strip()
        else:
            # Fallback for manual files or non-conforming files.
            title = os.path.splitext(filename)[0].replace('_', ' ')
            url = f"file://{filename}"
            body = content
        cleaned_body = clean_text(body)
        if len(cleaned_body) < 150:  # too short to carry useful content
            print(f"Skipping short file: {filename}")
            return
        if is_garbage_content(cleaned_body):
            print(f"Skipping garbage content (procurement/catalog): {filename}")
            return
        data = {
            "source_file": filename,
            "title": title,
            "url": url,
            "text": cleaned_body,
        }
        # splitext swaps only the extension; .replace('.txt', '.json')
        # would also corrupt a '.txt' occurring mid-name.
        out_name = os.path.splitext(filename)[0] + '.json'
        with open(os.path.join(PROCESSED_DIR, out_name), 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    except Exception as e:
        # Batch boundary: report and continue with the remaining files.
        print(f"Error processing {filename}: {e}")
def main():
    """Process every raw .txt file under RAW_DIR into cleaned JSON."""
    # exist_ok avoids the race between an existence check and creation.
    os.makedirs(PROCESSED_DIR, exist_ok=True)
    files = glob.glob(os.path.join(RAW_DIR, "*.txt"))
    print(f"Found {len(files)} files.")
    for filepath in files:
        process_file(filepath)
    print("Cleaning complete.")


if __name__ == "__main__":
    main()