Upload cleanup_project.py with huggingface_hub

912940a verified 8 months ago

8.05 kB

	#!/usr/bin/env python3
	"""
	Project Cleanup Script
	Removes redundant files and directories to reduce project size
	while preserving essential components for production and future development.
	"""

	import os
	import shutil
	import time
	from pathlib import Path

	def create_backup():
	    """Copy the current directory into a timestamped sibling directory.

	    The copy is written to the parent directory under the name
	    ``hallucination_detector_backup_<unix-seconds>``; entries named
	    ``.git`` are left out of the copy.

	    Returns:
	        bool: True when the copy completed, False when it raised.
	    """
	    stamp = int(time.time())
	    backup_path = Path("..") / f"hallucination_detector_backup_{stamp}"
	    skip_git = shutil.ignore_patterns('.git')

	    print(f"Creating backup at: {backup_path}")
	    try:
	        shutil.copytree(".", backup_path, ignore=skip_git)
	        print(f"✅ Backup created successfully at {backup_path}")
	    except Exception as exc:
	        # Any failure is reported to the caller, which decides whether
	        # it is safe to continue without a backup.
	        print(f"❌ Backup failed: {exc}")
	        return False
	    return True

	def safe_remove(path):
	    """Remove a file, symbolic link, or directory tree without raising.

	    Symbolic links are unlinked themselves (their targets are left
	    untouched), including links that point at directories or at nothing.
	    Problems are printed rather than propagated so that one bad entry
	    does not abort a bulk cleanup.

	    Args:
	        path: Path of the entry to delete.

	    Returns:
	        bool: True if the entry was deleted; False if it did not exist
	        or the deletion failed.
	    """
	    try:
	        # Check islink first: shutil.rmtree refuses to operate on a
	        # symlink, and isfile/isdir are both False for a dangling link.
	        if os.path.islink(path) or os.path.isfile(path):
	            os.remove(path)
	            print(f" Removed file: {path}")
	        elif os.path.isdir(path):
	            shutil.rmtree(path)
	            print(f" Removed directory: {path}")
	        else:
	            print(f" Not found: {path}")
	            return False
	    except Exception as e:
	        # Deliberately broad: this helper is best-effort by design.
	        print(f" Error removing {path}: {e}")
	        return False
	    return True

	def cleanup_project():
	    """Remove redundant files and directories from the working directory.

	    Every name below is a relative path, so it is resolved against the
	    current working directory. Entries that do not exist are skipped.
	    After the listed entries, the cleanup helper scripts themselves
	    (``analyze_cleanup.py`` and ``cleanup_project.py``) are removed too.

	    Returns:
	        int: Number of entries that existed before and are gone afterwards.
	        Entries whose removal failed are not counted.
	    """

	    # OLD MODELS TO REMOVE
	    old_models = [
	        "bert_hallucination_model",
	        "bias_fixed_mega_model",
	        "competition_model",
	        "enhanced_contradiction_model",
	        "enhanced_hallucination_model",
	        "enhanced_hallucination_model_v2",
	        "enhanced_model",
	        "enhanced_v2_model",
	        "final_balanced_mega_model",
	        "fine_tuned_hallucination_model",
	        "fixed_enhanced_model",
	        "mega_hallucination_model_20250814_151158",
	        "progressive_model_1",
	        "progressive_model_2",
	        "progressive_model_3",
	        "progressive_stage_1_20250814_155617",
	        "progressive_stage_2_20250814_155850",
	        "progressive_stage_3_20250814_160640",
	        "stable_mega_enhanced_model",
	        "stable_mega_model_20250814_155042",
	        "cv_fold_0",
	        "ensemble_models",
	        "evaluation_results",
	        "model_checkpoints",
	        "training_checkpoints",
	    ]

	    # OLD TRAINING SCRIPTS TO REMOVE
	    # ("train_enhanced_model.py" used to be listed twice; once is enough.)
	    old_training_scripts = [
	        "train_bert_alternative.py",
	        "train_bias_fix.py",
	        "train_competition_model.py",
	        "train_enhanced_contradiction.py",
	        "train_enhanced_model.py",
	        "train_enhanced_performance.py",
	        "train_final_balance.py",
	        "train_fixed_enhanced.py",
	        "train_incremental_improvement.py",
	        "train_mega_model.py",
	        "train_progressive_complete.py",
	        "train_simple_competition.py",
	        "train_simple_model.py",
	        "train_simple_production.py",
	        "train_simple_working_model.py",
	        "train_smart_incremental.py",
	        "train_stable_mega.py",
	        "train_working_enhanced.py",
	    ]

	    # OLD TEST SCRIPTS TO REMOVE
	    old_test_scripts = [
	        "test_all_models.py",
	        "test_bias_fixed.py",
	        "test_complete_mega_model.py",
	        "test_detector_direct.py",
	        "test_device_patterns.py",
	        "test_enhanced_api.py",
	        "test_enhanced_model.py",
	        "test_enhanced_v2_comprehensive.py",
	        "test_final_balance.py",
	        "test_fresh_predictions.py",
	        "test_hybrid_accuracy.py",
	        "test_mega_final.py",
	        "test_mega_model.py",
	        "test_model.py",
	        "test_raw_model.py",
	        "test_regex_fix.py",
	        "test_reliability.py",
	        "test_stable_mega.py",
	    ]

	    # DEBUG/DEVELOPMENT FILES TO REMOVE
	    debug_files = [
	        "debug_prediction.py",
	        "debug_rules.py",
	        "apply_quick_fix.py",
	        "hybrid_detector.py",
	        "standalone_test.py",
	        "simple_test_server.py",
	        "comprehensive_server_test.py",
	        "comprehensive_test.py",
	    ]

	    # OLD DATA FILES TO REMOVE
	    old_data_files = [
	        "advanced_training_data.csv",
	        "bias_fix_training.csv",
	        "combined_training_data.csv",
	        "competition_training_data.csv",
	        "edge_cases_training_data.csv",
	        "halueval_combined_training.csv",
	        "halueval_training_prepared.csv",
	        "mega_training_data.csv",
	        "sample_training_data.csv",
	    ]

	    # CACHE/TEMP FILES TO REMOVE
	    cache_files = [
	        "cache_persistent.pkl",
	        "app.log",
	        "__pycache__",
	        ".pytest_cache",
	        "logs",
	        "tmp",
	        "model_cache",
	        "plots",
	    ]

	    # OLD ANALYSIS SCRIPTS TO REMOVE
	    analysis_scripts = [
	        "analyze_halueval.py",
	        "analyze_training_metrics.py",
	        "complete_data_audit.py",
	        "create_verified_dataset.py",
	        "diagnose_bias.py",
	        "fact_check_report.py",
	        "monitor_training.py",
	        "training_summary.py",
	        "verify_fixes_and_retrain.py",
	    ]

	    # CONFIG FILES TO REMOVE
	    config_files = [
	        "competition_config.json",
	        "monitoring_config.json",
	        "setup_report.json",
	        "requirements_competition.txt",
	    ]

	    # BATCH FILES TO REMOVE (keeping only essential ones)
	    batch_files = [
	        "run_simple_training.bat",
	        "run_training.bat",
	    ]

	    # ALL FILES TO REMOVE
	    all_files_to_remove = (
	        old_models + old_training_scripts + old_test_scripts +
	        debug_files + old_data_files + cache_files +
	        analysis_scripts + config_files + batch_files
	    )

	    print("Starting cleanup...")
	    print("=" * 60)

	    removed_count = _remove_existing(all_files_to_remove, "Removing")

	    # Also remove this cleanup script and analysis script after use
	    cleanup_scripts = ["analyze_cleanup.py", "cleanup_project.py"]
	    removed_count += _remove_existing(cleanup_scripts, "Removing cleanup script")

	    print("=" * 60)
	    print(f"Cleanup completed! Removed {removed_count} items.")
	    print("\nRemaining essential files:")
	    print("- app/ (core application)")
	    print("- complete_halueval_model/ (working model)")
	    print("- comprehensive_training_data.csv (main training data)")
	    print("- training.csv (backup training data)")
	    print("- test_comprehensive_hybrid.py (main test)")
	    print("- test_api.py (API tests)")
	    print("- requirements.txt (dependencies)")
	    print("- config.yaml (configuration)")
	    print("- README.md (documentation)")
	    print("- Dockerfile & docker-compose.yml (deployment)")

	    return removed_count


	def _remove_existing(paths, label):
	    """Delete each existing entry in *paths*; return how many are gone.

	    Args:
	        paths: Iterable of paths to delete.
	        label: Text printed before each path that is about to be removed.
	    """
	    removed = 0
	    for path in paths:
	        if not os.path.exists(path):
	            continue
	        print(f"{label}: {path}")
	        safe_remove(path)
	        # safe_remove prints failures instead of raising, so confirm the
	        # entry is really gone before counting it as removed.
	        if not os.path.lexists(path):
	            removed += 1
	    return removed

	def main():
	    """Run the interactive cleanup: confirm, back up, then delete.

	    Asks for confirmation on stdin and stops unless the answer is
	    ``y`` or ``yes`` (case-insensitive, surrounding whitespace ignored).
	    A closed or exhausted stdin counts as "no". The cleanup is only
	    started after ``create_backup()`` reports success.

	    Returns:
	        None
	    """
	    print("PROJECT CLEANUP TOOL")
	    print("=" * 60)
	    print("This will remove redundant files to reduce project size.")
	    print("Essential files for production will be preserved.")
	    print()

	    # Ask for confirmation
	    try:
	        response = input("Do you want to proceed with cleanup? (y/N): ")
	    except EOFError:
	        # No answer available (e.g. piped or closed stdin): never delete
	        # anything without an explicit "yes".
	        print()
	        print("Cleanup cancelled.")
	        return
	    if response.strip().lower() not in ("y", "yes"):
	        print("Cleanup cancelled.")
	        return

	    # Create backup first
	    print("\nStep 1: Creating backup...")
	    if not create_backup():
	        print("Cannot proceed without backup. Exiting.")
	        return

	    # Perform cleanup
	    print("\nStep 2: Cleaning up project...")
	    removed_count = cleanup_project()

	    print("\n✅ Project cleanup completed!")
	    print(f"📊 Removed {removed_count} redundant items")
	    print("🔧 Essential files preserved for production")
	    print("💾 Backup created for safety")

	# Start the interactive cleanup only when this file is executed as a
	# script; importing the module must not prompt or delete anything.
	if __name__ == "__main__":
	    main()