Spaces:

Dishaaa25
/

data-cleaning-openenv

Sleeping

App Files Files Community

data-cleaning-openenv / env /quality.py

Dishaaa25

Upload folder using huggingface_hub

c22bf49 verified about 1 month ago

raw

history blame contribute delete

2.23 kB

	from __future__ import annotations

	from typing import Any

	from env.actions import is_missing


	def _is_numeric_value(value: Any, dtype: str) -> bool:
	    """Check that ``value`` parses as the numeric kind named by ``dtype``.

	    Args:
	        value: Cell content to test; it is converted with ``str`` before
	            being handed to the parser.
	        dtype: ``"int"`` or ``"float"``; any other name is never numeric.

	    Returns:
	        ``True`` when ``is_missing`` does not flag the value and the
	        matching built-in constructor accepts its string form, otherwise
	        ``False``.
	    """
	    if is_missing(value):
	        return False

	    try:
	        # Pick the constructor first; unknown dtypes bail out before any
	        # string conversion takes place.
	        if dtype == "int":
	            parse = int
	        elif dtype == "float":
	            parse = float
	        else:
	            return False
	        parse(str(value))
	    except (TypeError, ValueError):
	        return False
	    return True


	def _compute_consistency(dataset: list[dict], column_infos: list) -> float:
	if not dataset or not column_infos:
	return 1.0

	valid_checks = 0
	total_checks = 0

	for info in column_infos:
	values = [row.get(info.name) for row in dataset]
	if info.dtype in {"int", "float"}:
	for value in values:
	total_checks += 1
	if _is_numeric_value(value, info.dtype):
	valid_checks += 1
	else:
	non_missing = [str(value) for value in values if not is_missing(value)]
	if not non_missing:
	continue
	lowered = {}
	for value in non_missing:
	lowered.setdefault(value.lower(), set()).add(value)
	has_inconsistency = any(len(forms) > 1 for forms in lowered.values())
	total_checks += 1
	if not has_inconsistency:
	valid_checks += 1

	return valid_checks / total_checks if total_checks else 1.0


	def compute_quality_score(dataset: list[dict], column_infos: list, original_issues_count: int) -> float:
	    """Combine completeness, uniqueness and consistency into one quality score.

	    The score is ``0.4 * completeness + 0.3 * uniqueness + 0.3 * consistency``,
	    clamped to ``[0.01, 0.99]`` and rounded to four decimals.

	    Args:
	        dataset: Rows as dicts; rows may have different numbers of keys.
	        column_infos: Column descriptors forwarded to ``_compute_consistency``.
	        original_issues_count: When ``0`` the function returns ``0.99``
	            immediately without inspecting the dataset.

	    Returns:
	        The blended score as a float between ``0.01`` and ``0.99``.
	    """
	    if original_issues_count == 0:
	        return 0.99

	    # Count cells row by row rather than assuming every row is as wide as the
	    # first one: ragged rows would skew the ratio, and a dataset made only of
	    # empty rows must not divide by zero.
	    total_cells = sum(len(row) for row in dataset)
	    # NOTE(review): this literal missing-value test is separate from the
	    # ``is_missing`` helper used by the consistency check; confirm the two
	    # are meant to agree.
	    missing_cells = sum(
	        1
	        for row in dataset
	        for value in row.values()
	        if value is None or value == "" or value == "not_available"
	    )
	    completeness = (1.0 - missing_cells / total_cells) if total_cells else 1.0

	    total_rows = len(dataset)
	    # A row's fingerprint is the string form of its key-sorted items, so two
	    # rows with the same content in a different key order count as duplicates.
	    unique_rows = len({str(sorted(row.items())) for row in dataset})
	    uniqueness = unique_rows / total_rows if total_rows else 1.0

	    consistency = _compute_consistency(dataset, column_infos)

	    score = 0.4 * completeness + 0.3 * uniqueness + 0.3 * consistency
	    return round(max(0.01, min(0.99, score)), 4)