|
|
| """
|
| Project Cleanup Script
|
| Removes redundant files and directories to reduce project size
|
| while preserving essential components for production and future development.
|
| """
|
|
|
| import os
|
| import shutil
|
| import time
|
| from pathlib import Path
|
|
|
def create_backup():
    """Copy the current working directory into a timestamped sibling folder.

    The destination is ``../hallucination_detector_backup_<unix-seconds>``,
    resolved relative to the current working directory. Entries named
    ``.git`` are skipped at every directory level.

    Returns:
        bool: True when the copy completed, False when any exception was
        raised while copying (the error is printed, not re-raised).
    """
    stamp = int(time.time())
    destination = Path("..") / f"hallucination_detector_backup_{stamp}"
    skip_vcs = shutil.ignore_patterns('.git')

    print(f"Creating backup at: {destination}")
    try:
        shutil.copytree(".", destination, ignore=skip_vcs)
        print(f"✅ Backup created successfully at {destination}")
    except Exception as exc:
        # Best-effort: the caller decides what to do when no backup exists.
        print(f"❌ Backup failed: {exc}")
        return False
    return True
|
|
|
def safe_remove(path):
    """Remove a file, directory tree, or symbolic link without raising on
    filesystem errors.

    Args:
        path: Filesystem path (``str`` or path-like) to delete.

    Returns:
        bool: True if the entry was removed; False if nothing exists at
        ``path`` or the removal failed with an ``OSError`` (the error is
        printed rather than propagated).
    """
    try:
        # Links must be handled before isfile/isdir: those follow the link,
        # so a link to a directory would reach shutil.rmtree (which refuses
        # symlinks) and a dangling link would be reported as "not found".
        if os.path.islink(path):
            os.remove(path)  # removes the link itself, never its target
            print(f" Removed symlink: {path}")
        elif os.path.isfile(path):
            os.remove(path)
            print(f" Removed file: {path}")
        elif os.path.isdir(path):
            shutil.rmtree(path)
            print(f" Removed directory: {path}")
        else:
            print(f" Not found: {path}")
            return False
    except OSError as e:
        # Permission problems, files in use, etc. Cleanup is best-effort, so
        # report and carry on instead of aborting the whole run.
        print(f" Error removing {path}: {e}")
        return False
    return True
|
|
|
def _remove_existing(items, label):
    """Delete every path in ``items`` that exists; return how many are gone.

    ``label`` is the text printed before each path (e.g. ``"Removing"``).
    """
    removed = 0
    for item in items:
        if not os.path.exists(item):
            continue
        print(f"{label}: {item}")
        safe_remove(item)
        # safe_remove reports failures only by printing, so confirm on disk
        # that the entry is really gone before counting it as removed.
        if not os.path.lexists(item):
            removed += 1
    return removed


def cleanup_project():
    """Remove redundant files and directories from the current directory.

    All names below are relative paths, so they are resolved against the
    current working directory; only top-level entries with exactly these
    names are touched. Entries that do not exist are skipped silently.

    Returns:
        int: Number of entries that were actually removed. Entries whose
        removal failed are not counted.
    """
    # Superseded model directories and training/evaluation output folders.
    old_models = [
        "bert_hallucination_model",
        "bias_fixed_mega_model",
        "competition_model",
        "enhanced_contradiction_model",
        "enhanced_hallucination_model",
        "enhanced_hallucination_model_v2",
        "enhanced_model",
        "enhanced_v2_model",
        "final_balanced_mega_model",
        "fine_tuned_hallucination_model",
        "fixed_enhanced_model",
        "mega_hallucination_model_20250814_151158",
        "progressive_model_1",
        "progressive_model_2",
        "progressive_model_3",
        "progressive_stage_1_20250814_155617",
        "progressive_stage_2_20250814_155850",
        "progressive_stage_3_20250814_160640",
        "stable_mega_enhanced_model",
        "stable_mega_model_20250814_155042",
        "cv_fold_0",
        "ensemble_models",
        "evaluation_results",
        "model_checkpoints",
        "training_checkpoints",
    ]

    old_training_scripts = [
        "train_bert_alternative.py",
        "train_bias_fix.py",
        "train_competition_model.py",
        "train_enhanced_contradiction.py",
        "train_enhanced_model.py",
        "train_enhanced_performance.py",
        "train_final_balance.py",
        "train_fixed_enhanced.py",
        "train_incremental_improvement.py",
        "train_mega_model.py",
        "train_progressive_complete.py",
        "train_simple_competition.py",
        "train_simple_model.py",
        "train_simple_production.py",
        "train_simple_working_model.py",
        "train_smart_incremental.py",
        "train_stable_mega.py",
        "train_working_enhanced.py",
    ]

    old_test_scripts = [
        "test_all_models.py",
        "test_bias_fixed.py",
        "test_complete_mega_model.py",
        "test_detector_direct.py",
        "test_device_patterns.py",
        "test_enhanced_api.py",
        "test_enhanced_model.py",
        "test_enhanced_v2_comprehensive.py",
        "test_final_balance.py",
        "test_fresh_predictions.py",
        "test_hybrid_accuracy.py",
        "test_mega_final.py",
        "test_mega_model.py",
        "test_model.py",
        "test_raw_model.py",
        "test_regex_fix.py",
        "test_reliability.py",
        "test_stable_mega.py",
    ]

    debug_files = [
        "debug_prediction.py",
        "debug_rules.py",
        "apply_quick_fix.py",
        "hybrid_detector.py",
        "standalone_test.py",
        "simple_test_server.py",
        "comprehensive_server_test.py",
        "comprehensive_test.py",
    ]

    old_data_files = [
        "advanced_training_data.csv",
        "bias_fix_training.csv",
        "combined_training_data.csv",
        "competition_training_data.csv",
        "edge_cases_training_data.csv",
        "halueval_combined_training.csv",
        "halueval_training_prepared.csv",
        "mega_training_data.csv",
        "sample_training_data.csv",
    ]

    # Caches, logs and other generated artefacts (mix of files and folders).
    cache_files = [
        "cache_persistent.pkl",
        "app.log",
        "__pycache__",
        ".pytest_cache",
        "logs",
        "tmp",
        "model_cache",
        "plots",
    ]

    analysis_scripts = [
        "analyze_halueval.py",
        "analyze_training_metrics.py",
        "complete_data_audit.py",
        "create_verified_dataset.py",
        "diagnose_bias.py",
        "fact_check_report.py",
        "monitor_training.py",
        "training_summary.py",
        "verify_fixes_and_retrain.py",
    ]

    config_files = [
        "competition_config.json",
        "monitoring_config.json",
        "setup_report.json",
        "requirements_competition.txt",
    ]

    batch_files = [
        "run_simple_training.bat",
        "run_training.bat",
    ]

    # dict.fromkeys keeps first-seen order while dropping any name that is
    # listed more than once, so no entry is processed twice.
    all_files_to_remove = list(dict.fromkeys(
        old_models + old_training_scripts + old_test_scripts +
        debug_files + old_data_files + cache_files +
        analysis_scripts + config_files + batch_files
    ))

    print("Starting cleanup...")
    print("=" * 60)

    removed_count = _remove_existing(all_files_to_remove, "Removing")

    # The cleanup tooling itself goes last, after everything else is done.
    cleanup_scripts = ["analyze_cleanup.py", "cleanup_project.py"]
    removed_count += _remove_existing(cleanup_scripts, "Removing cleanup script")

    print("=" * 60)
    print(f"Cleanup completed! Removed {removed_count} items.")
    print("\nRemaining essential files:")
    print("- app/ (core application)")
    print("- complete_halueval_model/ (working model)")
    print("- comprehensive_training_data.csv (main training data)")
    print("- training.csv (backup training data)")
    print("- test_comprehensive_hybrid.py (main test)")
    print("- test_api.py (API tests)")
    print("- requirements.txt (dependencies)")
    print("- config.yaml (configuration)")
    print("- README.md (documentation)")
    print("- Dockerfile & docker-compose.yml (deployment)")

    return removed_count
|
|
|
def main():
    """Interactive entry point: confirm, back up, then clean the project.

    Asks for confirmation on stdin and proceeds only when the answer is
    ``y`` (case-insensitive, surrounding whitespace ignored). The cleanup
    step runs only if ``create_backup()`` reports success.
    """
    print("PROJECT CLEANUP TOOL")
    print("=" * 60)
    print("This will remove redundant files to reduce project size.")
    print("Essential files for production will be preserved.")
    print()

    try:
        response = input("Do you want to proceed with cleanup? (y/N): ")
    except EOFError:
        # stdin is closed or empty (e.g. piped/non-interactive run): fall
        # back to the safe default "N" instead of dying with a traceback.
        print()
        response = ""

    # Strip so that an accidental trailing space ("y ") is not a refusal.
    if response.strip().lower() != 'y':
        print("Cleanup cancelled.")
        return

    print("\nStep 1: Creating backup...")
    if not create_backup():
        # Deleting without a backup would be unrecoverable, so stop here.
        print("Cannot proceed without backup. Exiting.")
        return

    print("\nStep 2: Cleaning up project...")
    removed_count = cleanup_project()

    print("\n✅ Project cleanup completed!")
    print(f"📊 Removed {removed_count} redundant items")
    print("🔧 Essential files preserved for production")
    print("💾 Backup created for safety")
|
|
|
# Script entry point: run the interactive cleanup only when this file is
# executed directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|
|
|