hallucination-detector-project / cleanup_project.py
KShoichi's picture
Upload cleanup_project.py with huggingface_hub
912940a verified
#!/usr/bin/env python3
"""
Project Cleanup Script
Removes redundant files and directories to reduce project size
while preserving essential components for production and future development.
"""
import os
import shutil
import time
from pathlib import Path
def create_backup():
    """Copy the current directory into a timestamped sibling folder.

    The destination is ``../hallucination_detector_backup_<epoch-seconds>``.
    Entries matching ``.git`` are left out of the copy.

    Returns:
        True when the copy completes, False when any exception is raised
        along the way (the error is printed, not propagated).
    """
    stamp = int(time.time())
    destination = Path("..") / f"hallucination_detector_backup_{stamp}"
    print(f"Creating backup at: {destination}")

    skip_git = shutil.ignore_patterns('.git')
    try:
        shutil.copytree(".", destination, ignore=skip_git)
        print(f"✅ Backup created successfully at {destination}")
        return True
    except Exception as err:
        # The caller aborts the cleanup on False, so report and signal failure.
        print(f"❌ Backup failed: {err}")
        return False
def safe_remove(path):
    """Delete a file, symlink, or directory tree without raising.

    Args:
        path: Filesystem path to delete.

    Returns:
        True if the path was deleted, False if nothing existed at the path
        or the deletion failed. Failures are printed rather than raised so
        that one bad entry cannot abort a whole cleanup run.
    """
    try:
        if os.path.isfile(path):
            os.remove(path)
            print(f" Removed file: {path}")
        elif os.path.islink(path):
            # Reached for symlinks to directories and for dangling links.
            # shutil.rmtree refuses to operate on a symlink, so unlink the
            # link itself and leave whatever it points at untouched.
            os.unlink(path)
            print(f" Removed symlink: {path}")
        elif os.path.isdir(path):
            shutil.rmtree(path)
            print(f" Removed directory: {path}")
        else:
            print(f" Not found: {path}")
            return False
    except Exception as e:
        # Deliberately broad: this helper is a best-effort boundary.
        print(f" Error removing {path}: {e}")
        return False
    return True
def _remove_existing(paths, label):
    """Remove every existing path in *paths* and return how many are gone."""
    removed = 0
    for path in paths:
        if not os.path.exists(path):
            continue
        print(f"{label}: {path}")
        safe_remove(path)
        # safe_remove reports failures by printing instead of raising, so
        # confirm on disk that the path is really gone before counting it.
        if not os.path.lexists(path):
            removed += 1
    return removed


def cleanup_project():
    """Remove redundant files and directories from the current directory.

    Every name below is relative to the current working directory. Entries
    that do not exist are skipped silently. After the main pass the cleanup
    helper scripts (including this script) are removed as well.

    Returns:
        The number of items that were actually deleted. Items whose removal
        failed are not counted.
    """
    # OLD MODELS TO REMOVE
    old_models = [
        "bert_hallucination_model",
        "bias_fixed_mega_model",
        "competition_model",
        "enhanced_contradiction_model",
        "enhanced_hallucination_model",
        "enhanced_hallucination_model_v2",
        "enhanced_model",
        "enhanced_v2_model",
        "final_balanced_mega_model",
        "fine_tuned_hallucination_model",
        "fixed_enhanced_model",
        "mega_hallucination_model_20250814_151158",
        "progressive_model_1",
        "progressive_model_2",
        "progressive_model_3",
        "progressive_stage_1_20250814_155617",
        "progressive_stage_2_20250814_155850",
        "progressive_stage_3_20250814_160640",
        "stable_mega_enhanced_model",
        "stable_mega_model_20250814_155042",
        "cv_fold_0",
        "ensemble_models",
        "evaluation_results",
        "model_checkpoints",
        "training_checkpoints",
    ]
    # OLD TRAINING SCRIPTS TO REMOVE
    old_training_scripts = [
        "train_bert_alternative.py",
        "train_bias_fix.py",
        "train_competition_model.py",
        "train_enhanced_contradiction.py",
        "train_enhanced_model.py",
        "train_enhanced_performance.py",
        "train_final_balance.py",
        "train_fixed_enhanced.py",
        "train_incremental_improvement.py",
        "train_mega_model.py",
        "train_progressive_complete.py",
        "train_simple_competition.py",
        "train_simple_model.py",
        "train_simple_production.py",
        "train_simple_working_model.py",
        "train_smart_incremental.py",
        "train_stable_mega.py",
        "train_working_enhanced.py",
    ]
    # OLD TEST SCRIPTS TO REMOVE
    old_test_scripts = [
        "test_all_models.py",
        "test_bias_fixed.py",
        "test_complete_mega_model.py",
        "test_detector_direct.py",
        "test_device_patterns.py",
        "test_enhanced_api.py",
        "test_enhanced_model.py",
        "test_enhanced_v2_comprehensive.py",
        "test_final_balance.py",
        "test_fresh_predictions.py",
        "test_hybrid_accuracy.py",
        "test_mega_final.py",
        "test_mega_model.py",
        "test_model.py",
        "test_raw_model.py",
        "test_regex_fix.py",
        "test_reliability.py",
        "test_stable_mega.py",
    ]
    # DEBUG/DEVELOPMENT FILES TO REMOVE
    debug_files = [
        "debug_prediction.py",
        "debug_rules.py",
        "apply_quick_fix.py",
        "hybrid_detector.py",
        "standalone_test.py",
        "simple_test_server.py",
        "comprehensive_server_test.py",
        "comprehensive_test.py",
    ]
    # OLD DATA FILES TO REMOVE
    old_data_files = [
        "advanced_training_data.csv",
        "bias_fix_training.csv",
        "combined_training_data.csv",
        "competition_training_data.csv",
        "edge_cases_training_data.csv",
        "halueval_combined_training.csv",
        "halueval_training_prepared.csv",
        "mega_training_data.csv",
        "sample_training_data.csv",
    ]
    # CACHE/TEMP FILES TO REMOVE
    cache_files = [
        "cache_persistent.pkl",
        "app.log",
        "__pycache__",
        ".pytest_cache",
        "logs",
        "tmp",
        "model_cache",
        "plots",
    ]
    # OLD ANALYSIS SCRIPTS TO REMOVE
    analysis_scripts = [
        "analyze_halueval.py",
        "analyze_training_metrics.py",
        "complete_data_audit.py",
        "create_verified_dataset.py",
        "diagnose_bias.py",
        "fact_check_report.py",
        "monitor_training.py",
        "training_summary.py",
        "verify_fixes_and_retrain.py",
    ]
    # CONFIG FILES TO REMOVE
    config_files = [
        "competition_config.json",
        "monitoring_config.json",
        "setup_report.json",
        "requirements_competition.txt",
    ]
    # BATCH FILES TO REMOVE (keeping only essential ones)
    batch_files = [
        "run_simple_training.bat",
        "run_training.bat",
    ]
    # ALL FILES TO REMOVE
    all_files_to_remove = (
        old_models + old_training_scripts + old_test_scripts +
        debug_files + old_data_files + cache_files +
        analysis_scripts + config_files + batch_files
    )

    print("Starting cleanup...")
    print("=" * 60)
    removed_count = _remove_existing(all_files_to_remove, "Removing")

    # Also remove this cleanup script and analysis script after use
    cleanup_scripts = ["analyze_cleanup.py", "cleanup_project.py"]
    removed_count += _remove_existing(cleanup_scripts, "Removing cleanup script")

    print("=" * 60)
    print(f"Cleanup completed! Removed {removed_count} items.")
    # Informational only: this list is static text, the files are not checked.
    print("\nRemaining essential files:")
    print("- app/ (core application)")
    print("- complete_halueval_model/ (working model)")
    print("- comprehensive_training_data.csv (main training data)")
    print("- training.csv (backup training data)")
    print("- test_comprehensive_hybrid.py (main test)")
    print("- test_api.py (API tests)")
    print("- requirements.txt (dependencies)")
    print("- config.yaml (configuration)")
    print("- README.md (documentation)")
    print("- Dockerfile & docker-compose.yml (deployment)")
    return removed_count
def main():
    """Interactive entry point: confirm, back up, then clean up.

    Only an answer of exactly ``y`` or ``Y`` proceeds; anything else cancels.
    The cleanup step is skipped when the backup step reports failure.
    """
    divider = "=" * 60
    for line in (
        "PROJECT CLEANUP TOOL",
        divider,
        "This will remove redundant files to reduce project size.",
        "Essential files for production will be preserved.",
    ):
        print(line)
    print()

    # Confirmation gate: default answer is "no".
    answer = input("Do you want to proceed with cleanup? (y/N): ")
    confirmed = answer.lower() == 'y'
    if not confirmed:
        print("Cleanup cancelled.")
        return

    # The backup must succeed before anything is deleted.
    print("\nStep 1: Creating backup...")
    backup_ok = create_backup()
    if not backup_ok:
        print("Cannot proceed without backup. Exiting.")
        return

    print("\nStep 2: Cleaning up project...")
    total_removed = cleanup_project()

    print("\n✅ Project cleanup completed!")
    print(f"📊 Removed {total_removed} redundant items")
    print("🔧 Essential files preserved for production")
    print("💾 Backup created for safety")
# Start the interactive cleanup only when this file is executed as a script;
# importing the module must not prompt the user or delete anything.
if __name__ == "__main__":
    main()