TFG-model / tools /analyze_normalization_impact.py

Upload folder using huggingface_hub

04f866d verified 5 days ago

5.45 kB

	#!/usr/bin/env python3
	"""Analyze image quality loss from normalization (without actual processing)."""

	import csv
	from pathlib import Path


	def main():
	print("=" * 70)
	print("ANÁLISIS: IMPACTO DE NORMALIZAR IMÁGENES A 224×224 PARA SPAI")
	print("=" * 70)
	print()

	# Load original metadata with dimensions
	csv_file = Path("output/quality_filtered/metadata/contextual_images_filtered.csv")
	with open(csv_file) as f:
	rows = list(csv.DictReader(f))

	# Analyze upscaling requirements
	print(f"IMÁGENES ANALIZADAS: {len(rows)}\n")

	# Group by upscaling factor
	groups = {
	"downscale_only": [], # Downscale only (good)
	"moderate_upscale": [], # Up to 1.5x (acceptable)
	"significant_upscale": [], # 1.5-2.5x (some quality loss)
	"heavy_upscale": [], # >2.5x (significant loss)
	}

	quality_scores = []

	for row in rows:
	w = int(row.get("detected_width", 0))
	h = int(row.get("detected_height", 0))

	if not w or not h:
	continue

	# For square fit in 224x224, use the smaller dimension
	smaller_dim = min(w, h)
	upscale_ratio = 224 / smaller_dim

	# Quality score (0-1, higher better)
	quality_score = min(1.0 / upscale_ratio, 1.0)
	quality_scores.append(quality_score)

	img_info = {
	"name": Path(row.get("filtered_path", "")).name,
	"size": f"{w}x{h}",
	"mp": round(w * h / 1e6, 3),
	"upscale_ratio": round(upscale_ratio, 2),
	"quality_score": round(quality_score, 2),
	"category": row.get("category", "?"),
	"original_dimensions": (w, h),
	}

	if upscale_ratio <= 1.0:
	groups["downscale_only"].append(img_info)
	elif upscale_ratio <= 1.5:
	groups["moderate_upscale"].append(img_info)
	elif upscale_ratio <= 2.5:
	groups["significant_upscale"].append(img_info)
	else:
	groups["heavy_upscale"].append(img_info)

	# Print grouped analysis
	print("CATEGORÍAS DE IMPACTO AL NORMALIZAR A 224×224:\n")

	labels = {
	"downscale_only": "✅ DOWNSCALE ONLY (No quality loss, mejor)",
	"moderate_upscale": "🟢 MODERATE UPSCALE (1.0-1.5x, aceptable)",
	"significant_upscale": "🟡 SIGNIFICANT UPSCALE (1.5-2.5x, pérdida moderada)",
	"heavy_upscale": "🔴 HEAVY UPSCALE (>2.5x, pérdida significativa)",
	}

	for key, label in labels.items():
	count = len(groups[key])
	pct = 100 * count / len(rows) if rows else 0
	print(f"{label}")
	print(f" {count} imágenes ({pct:.1f}%)")

	if groups[key]:
	# Show some examples
	examples = sorted(groups[key], key=lambda x: x["upscale_ratio"])[:3]
	for ex in examples:
	print(f" • {ex['size']:15} → 224×224 (ratio: {ex['upscale_ratio']}x, score: {ex['quality_score']})")
	print()

	# Summary statistics
	print("=" * 70)
	print("ESTADÍSTICAS GLOBALES:\n")

	avg_ratio = sum(r["upscale_ratio"] for r in sum(groups.values(), []) if "upscale_ratio" in r) / len(rows) if rows else 1
	avg_quality = sum(quality_scores) / len(quality_scores) if quality_scores else 1.0

	print(f"Promedio de upscale ratio: {avg_ratio:.2f}x")
	print(f"Promedio de quality score: {avg_quality:.2f}/1.0")
	print()

	# Recommendation
	print("=" * 70)
	print("RECOMENDACIÓN:\n")

	heavy_count = len(groups["heavy_upscale"])
	if heavy_count == 0:
	print("✅ TODAS LAS IMÁGENES SON ACEPTABLES para SPAI sin pre-procesamiento")
	print(" La mayoría serán downscaled o moderadamente upscaled → mínima pérdida")
	print(" RECOMENDACIÓN: Usar directamente sin normalizar")
	elif heavy_count <= len(rows) * 0.1:
	print(f"✅ CASI TODAS LAS IMÁGENES SON ACEPTABLES ({heavy_count} problemáticas)")
	print(" RECOMENDACIÓN: Considerar filtrar solo las {heavy_count} muy pequeñas")
	else:
	print(f"⚠️ ALGUNAS IMÁGENES REQUIEREN UPSCALING ({heavy_count} muy pequeñas)")
	print(" RECOMENDACIÓN: Pre-procesamiento opcional para mejor calidad")

	print()
	print("=" * 70)

	# Distribution by category
	print("\nDISTRIBUCIÓN POR CATEGORÍA:\n")

	categories = {}
	for row in rows:
	cat = row.get("category", "?")
	w = int(row.get("detected_width", 0))
	h = int(row.get("detected_height", 0))

	smaller_dim = min(w, h) if w and h else 0
	ratio = 224 / smaller_dim if smaller_dim else 1

	if cat not in categories:
	categories[cat] = {"count": 0, "ratios": []}

	categories[cat]["count"] += 1
	categories[cat]["ratios"].append(ratio)

	for cat in sorted(categories.keys()):
	stats = categories[cat]
	avg_cat_ratio = sum(stats["ratios"]) / len(stats["ratios"])
	avg_cat_quality = sum(min(1/r, 1) for r in stats["ratios"]) / len(stats["ratios"])

	print(f" {cat}:")
	print(f" {stats['count']} imágenes")
	print(f" Avg upscale ratio: {avg_cat_ratio:.2f}x")
	print(f" Avg quality score: {avg_cat_quality:.2f}/1.0")
	print()


	if __name__ == "__main__":
	main()