TFG-model / tools /analyze_normalization_impact.py
aminasifar1's picture
Upload folder using huggingface_hub
04f866d verified
#!/usr/bin/env python3
"""Analyze image quality loss from normalization (without actual processing)."""
import csv
from pathlib import Path
def main():
print("=" * 70)
print("ANÁLISIS: IMPACTO DE NORMALIZAR IMÁGENES A 224×224 PARA SPAI")
print("=" * 70)
print()
# Load original metadata with dimensions
csv_file = Path("output/quality_filtered/metadata/contextual_images_filtered.csv")
with open(csv_file) as f:
rows = list(csv.DictReader(f))
# Analyze upscaling requirements
print(f"IMÁGENES ANALIZADAS: {len(rows)}\n")
# Group by upscaling factor
groups = {
"downscale_only": [], # Downscale only (good)
"moderate_upscale": [], # Up to 1.5x (acceptable)
"significant_upscale": [], # 1.5-2.5x (some quality loss)
"heavy_upscale": [], # >2.5x (significant loss)
}
quality_scores = []
for row in rows:
w = int(row.get("detected_width", 0))
h = int(row.get("detected_height", 0))
if not w or not h:
continue
# For square fit in 224x224, use the smaller dimension
smaller_dim = min(w, h)
upscale_ratio = 224 / smaller_dim
# Quality score (0-1, higher better)
quality_score = min(1.0 / upscale_ratio, 1.0)
quality_scores.append(quality_score)
img_info = {
"name": Path(row.get("filtered_path", "")).name,
"size": f"{w}x{h}",
"mp": round(w * h / 1e6, 3),
"upscale_ratio": round(upscale_ratio, 2),
"quality_score": round(quality_score, 2),
"category": row.get("category", "?"),
"original_dimensions": (w, h),
}
if upscale_ratio <= 1.0:
groups["downscale_only"].append(img_info)
elif upscale_ratio <= 1.5:
groups["moderate_upscale"].append(img_info)
elif upscale_ratio <= 2.5:
groups["significant_upscale"].append(img_info)
else:
groups["heavy_upscale"].append(img_info)
# Print grouped analysis
print("CATEGORÍAS DE IMPACTO AL NORMALIZAR A 224×224:\n")
labels = {
"downscale_only": "✅ DOWNSCALE ONLY (No quality loss, mejor)",
"moderate_upscale": "🟢 MODERATE UPSCALE (1.0-1.5x, aceptable)",
"significant_upscale": "🟡 SIGNIFICANT UPSCALE (1.5-2.5x, pérdida moderada)",
"heavy_upscale": "🔴 HEAVY UPSCALE (>2.5x, pérdida significativa)",
}
for key, label in labels.items():
count = len(groups[key])
pct = 100 * count / len(rows) if rows else 0
print(f"{label}")
print(f" {count} imágenes ({pct:.1f}%)")
if groups[key]:
# Show some examples
examples = sorted(groups[key], key=lambda x: x["upscale_ratio"])[:3]
for ex in examples:
print(f" • {ex['size']:15} → 224×224 (ratio: {ex['upscale_ratio']}x, score: {ex['quality_score']})")
print()
# Summary statistics
print("=" * 70)
print("ESTADÍSTICAS GLOBALES:\n")
avg_ratio = sum(r["upscale_ratio"] for r in sum(groups.values(), []) if "upscale_ratio" in r) / len(rows) if rows else 1
avg_quality = sum(quality_scores) / len(quality_scores) if quality_scores else 1.0
print(f"Promedio de upscale ratio: {avg_ratio:.2f}x")
print(f"Promedio de quality score: {avg_quality:.2f}/1.0")
print()
# Recommendation
print("=" * 70)
print("RECOMENDACIÓN:\n")
heavy_count = len(groups["heavy_upscale"])
if heavy_count == 0:
print("✅ TODAS LAS IMÁGENES SON ACEPTABLES para SPAI sin pre-procesamiento")
print(" La mayoría serán downscaled o moderadamente upscaled → mínima pérdida")
print(" RECOMENDACIÓN: Usar directamente sin normalizar")
elif heavy_count <= len(rows) * 0.1:
print(f"✅ CASI TODAS LAS IMÁGENES SON ACEPTABLES ({heavy_count} problemáticas)")
print(" RECOMENDACIÓN: Considerar filtrar solo las {heavy_count} muy pequeñas")
else:
print(f"⚠️ ALGUNAS IMÁGENES REQUIEREN UPSCALING ({heavy_count} muy pequeñas)")
print(" RECOMENDACIÓN: Pre-procesamiento opcional para mejor calidad")
print()
print("=" * 70)
# Distribution by category
print("\nDISTRIBUCIÓN POR CATEGORÍA:\n")
categories = {}
for row in rows:
cat = row.get("category", "?")
w = int(row.get("detected_width", 0))
h = int(row.get("detected_height", 0))
smaller_dim = min(w, h) if w and h else 0
ratio = 224 / smaller_dim if smaller_dim else 1
if cat not in categories:
categories[cat] = {"count": 0, "ratios": []}
categories[cat]["count"] += 1
categories[cat]["ratios"].append(ratio)
for cat in sorted(categories.keys()):
stats = categories[cat]
avg_cat_ratio = sum(stats["ratios"]) / len(stats["ratios"])
avg_cat_quality = sum(min(1/r, 1) for r in stats["ratios"]) / len(stats["ratios"])
print(f" {cat}:")
print(f" {stats['count']} imágenes")
print(f" Avg upscale ratio: {avg_cat_ratio:.2f}x")
print(f" Avg quality score: {avg_cat_quality:.2f}/1.0")
print()
if __name__ == "__main__":
main()