Spaces:

orderlymirror
/

lov2

Sleeping

lov2 / scripts /validate_dataset.py

work-sejal

Deploy LearningOutcomeOS AI Service and dataset to HF Space

3795605 about 1 month ago

2.88 kB

	"""CLI dataset validation script.

	Invokes the DatasetValidator with settings-based configuration and writes
	a validation report to artifacts/reports/data_validation_report.md.

	Exit codes:
	0 — All validation checks passed.
	1 — One or more validation checks failed.
	2 — Setup failure (missing dataset directory, configuration error).
	"""

	import sys
	from pathlib import Path

	# Make the `app` package importable when this file is executed directly
	# (e.g. `python scripts/validate_dataset.py`): prepend the project root
	# (AI_Services_V2, i.e. the parent of this script's directory) to sys.path.
	# This must run before the `app.*` imports below. _PROJECT_ROOT is also the
	# anchor main() uses for relative paths taken from settings.
	_SCRIPT_DIR = Path(__file__).resolve().parent
	_PROJECT_ROOT = _SCRIPT_DIR.parent
	sys.path.insert(0, str(_PROJECT_ROOT))

	from app.core.config import settings
	from app.core.exceptions import DatasetError
	from app.data.loader import DatasetLoader
	from app.data.validator import DatasetValidator


	def _resolve_from_project_root(raw_path) -> Path:
	    """Return *raw_path* as a Path, anchoring relative paths at _PROJECT_ROOT.

	    Absolute paths are returned as given; relative paths are joined onto the
	    project root and resolved, so the result does not depend on the current
	    working directory.
	    """
	    path = Path(raw_path)
	    if path.is_absolute():
	        return path
	    return (_PROJECT_ROOT / path).resolve()


	def main() -> int:
	    """Run dataset validation and return the process exit code.

	    Loads dataset metadata from ``settings.dataset_dir``, runs the validator
	    (``DatasetValidator.run_all``), asks the validator to write the report to
	    ``<settings.reports_dir>/data_validation_report.md`` and prints a summary
	    to stdout.

	    Returns:
	        0 when the report says all checks passed, 1 when the report contains
	        failures, 2 when setup fails (dataset directory missing, loader or
	        metadata error) or when the validation run itself raises.
	    """
	    try:
	        dataset_path = _resolve_from_project_root(settings.dataset_dir)

	        # is_dir() rather than exists(): a regular file at this path is just
	        # as unusable as a missing directory and should get the same clear
	        # message instead of whatever the loader would raise later.
	        if not dataset_path.is_dir():
	            print(f"ERROR: Dataset directory not found: {dataset_path}")
	            return 2

	        loader = DatasetLoader(dataset_path)
	        metadata = loader.load_metadata()
	    except DatasetError as exc:
	        print(f"ERROR: Setup failure — {exc}")
	        return 2
	    except Exception as exc:
	        # Top-level boundary: report anything unexpected as a setup failure
	        # rather than letting a traceback escape with an ambiguous status.
	        print(f"ERROR: Unexpected setup failure — {exc}")
	        return 2

	    try:
	        validator = DatasetValidator(loader, metadata)
	        report = validator.run_all()

	        reports_dir = _resolve_from_project_root(settings.reports_dir)
	        report_path = reports_dir / "data_validation_report.md"
	        validator.write_report(report, report_path)

	        # Print summary to stdout
	        if report.passed:
	            print(f"PASSED: All {report.checks_run} validation checks passed.")
	            print(f"Report written to: {report_path}")
	            return 0

	        failed_checks = report.checks_run - report.checks_passed
	        print(
	            f"FAILED: {failed_checks}/{report.checks_run} "
	            f"checks failed with {len(report.issues)} issue(s)."
	        )
	        for issue in report.issues:
	            col_info = f" ({issue.column})" if issue.column else ""
	            print(f" - [{issue.severity.upper()}] {issue.table}{col_info}: {issue.message}")
	        print(f"\nReport written to: {report_path}")
	        return 1
	    except DatasetError as exc:
	        print(f"ERROR: Validation failure — {exc}")
	        return 2
	    except Exception as exc:
	        # Kept inside the same try as the summary printing so that any
	        # unexpected error still maps to exit code 2, never to 1 (which is
	        # reserved for "validation ran and found failures").
	        print(f"ERROR: Unexpected failure during validation — {exc}")
	        return 2


	# Script entry point: hand main()'s return value to the interpreter as the
	# process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())