Spaces:

STMicroelectronics
/

stm32-modelzoo-app

Running

App Files Files Community

stm32-modelzoo-app / object_detection /tf /src /datasets /utils /dataset_analyzers.py

FBAGSTM

STM32 AI Experimentation Hub

747451d about 1 month ago

raw

history blame contribute delete

9.49 kB

	# /*---------------------------------------------------------------------------------------------
	# * Copyright (c) 2025 STMicroelectronics.
	# * All rights reserved.
	# * This software is licensed under terms that can be found in the LICENSE file in
	# * the root directory of this software component.
	# * If no LICENSE file comes with this software, it is provided AS-IS.
	# --------------------------------------------------------------------------------------------/

	import os
	import sys
	from pathlib import Path
	from glob import glob
	from tqdm import tqdm
	from collections import Counter
	from statistics import mean
	import matplotlib.pyplot as plt

	def parse_label_file(txt_file_path : str=None) -> list:
	    """
	    Reads a label/detection text file and returns its rows as lists of floats

	    Each non-blank line of the file is split on whitespace and every field
	    is converted with float(). Blank lines (or lines holding only
	    whitespace) are skipped.

	    Args:
	        txt_file_path (str) : Path of the detection file to analyze

	    Returns:
	        List : one list of floats per non-blank line, in file order. The list
	               is empty when the path is None, when it does not point to an
	               existing file, or when the file holds no non-blank line.

	    Raises:
	        ValueError : if a field cannot be converted to float
	    """
	    # The parameter defaults to None, which os.path.isfile() rejects with a
	    # TypeError; treat it the same way as a missing file.
	    if txt_file_path is None or not os.path.isfile(txt_file_path):
	        return []

	    with open(txt_file_path, "r") as f:
	        # split() yields an empty list for blank/whitespace-only lines
	        rows = (line.split() for line in f)
	        return [[float(x) for x in fields] for fields in rows if fields]


	def compute_labels_stats(dataset_path : str=None,
	                         dataset_name : str=None,
	                         histogram_dir: str=None) -> None:
	    """
	    Provides statistics on the dataset labels

	    For every .jpg file found directly under the dataset root, the .txt file
	    with the same stem is parsed (when it exists) and its number of labels is
	    recorded. Images without a .txt file are counted as images but do not
	    contribute to the labels-per-image statistics. The counts are printed and
	    a histogram of the number of labels per image is displayed.

	    Args:
	        dataset_path (str) : Path of the dataset to analyze
	        dataset_name (str) : Name of the dataset used (added to the plot title
	                             and to the histogram file name when provided)
	        histogram_dir (str): location of the histograms storage. When set, the
	                             directory is created if needed and the histogram
	                             is saved there as a .png file.

	    Returns:
	        None

	    Raises:
	        ValueError : if no .jpg file is found in the dataset root, or if none
	                     of the .jpg files has a matching .txt labels file
	    """
	    print("\nCalculating groundtruth labels statistics:")
	    print("-----------------------------------------")
	    print("Dataset root:", dataset_path)

	    # `glob` is imported as a function (from glob import glob): call it directly
	    jpg_file_paths = glob(os.path.join(dataset_path, "*.jpg"))
	    if not jpg_file_paths:
	        raise ValueError("Could not find any .jpg file in dataset root directory")

	    num_jpg_files = len(jpg_file_paths)
	    num_empty_txt = 0

	    # One entry per image that has a labels file (0 for an empty file)
	    label_sizes = []
	    for jpg_path in tqdm(jpg_file_paths):
	        txt_path = str(Path(jpg_path).with_suffix(".txt"))
	        if not os.path.isfile(txt_path):
	            continue
	        num_labels = len(parse_label_file(txt_path))
	        if num_labels == 0:
	            num_empty_txt += 1
	        label_sizes.append(num_labels)
	    num_txt_files = len(label_sizes)

	    print("Image files: ", num_jpg_files)
	    print("Labels files:", num_txt_files)
	    print("Empty labels files:", num_empty_txt)

	    # min()/max()/mean() below need at least one labels file
	    if not label_sizes:
	        raise ValueError("Could not find any .txt labels file in dataset root directory")

	    largest = max(label_sizes)
	    print("Labels per image: min = {}, max = {}, mean = {:.2f}".
	          format(min(label_sizes), largest, mean(label_sizes)))

	    plt.figure(figsize=(8, 8))
	    # The number of bins must be positive, even when all labels files are empty
	    plt.hist(label_sizes, bins=max(1, largest))
	    plot_title = "Number of labels per image"
	    if dataset_name:
	        plot_title += " in dataset " + dataset_name
	    plt.title(plot_title)
	    if histogram_dir:
	        os.makedirs(histogram_dir, exist_ok=True)
	        # dataset_name is optional: leave it out of the file name when absent
	        if dataset_name:
	            histogram_file = "labels_stats_" + dataset_name + ".png"
	        else:
	            histogram_file = "labels_stats.png"
	        plt.savefig(os.path.join(histogram_dir, histogram_file))
	    plt.show()
	    plt.close()


	def compute_class_stats(dataset_path : str=None,
	                        dataset_name : str=None,
	                        histogram_dir: str=None) -> None:
	    """
	    Provides statistics on the dataset classes

	    For every .jpg file found directly under the dataset root, the .txt file
	    with the same stem is parsed (when it exists) and the first field of each
	    label is taken as an integer class id. The number of classes is reported
	    as the largest class id + 1, so ids that never occur are listed with 0
	    occurrences. The counts are printed and shown as a bar chart.

	    Args:
	        dataset_path (str) : Path of the dataset to analyze
	        dataset_name (str) : Name of the dataset used (added to the plot title
	                             and to the histogram file name when provided)
	        histogram_dir (str): location of the histograms storage. When set, the
	                             directory is created if needed and the bar chart
	                             is saved there as a .png file.

	    Returns:
	        None

	    Raises:
	        ValueError : if no .jpg file is found in the dataset root, or if no
	                     label is found in any of the matching .txt files
	    """
	    print("\nCalculating groundtruth class statistics:")
	    print("----------------------------------------")
	    print("Dataset root:", dataset_path)

	    # `glob` is imported as a function (from glob import glob): call it directly
	    jpg_file_paths = glob(os.path.join(dataset_path, "*.jpg"))
	    if not jpg_file_paths:
	        raise ValueError("Could not find any .jpg file in dataset root directory")

	    classes_dict = Counter()
	    for jpg_path in tqdm(jpg_file_paths):
	        txt_path = str(Path(jpg_path).with_suffix(".txt"))
	        if not os.path.isfile(txt_path):
	            continue
	        # Files with no objects simply contribute nothing to the counter
	        classes_dict.update(int(label[0]) for label in parse_label_file(txt_path))

	    if not classes_dict:
	        raise ValueError("Could not find any groundtruth label in dataset root directory")

	    num_classes = max(classes_dict) + 1
	    # Every id in [0, num_classes), so that x positions and bar heights
	    # always have the same length, including for ids that never occur
	    class_ids = list(range(num_classes))

	    print("Number of classes:", num_classes)
	    print("Occurences:")
	    class_occurences = []
	    for class_id in class_ids:
	        n = classes_dict[class_id]  # a Counter returns 0 for a missing key
	        class_occurences.append(n)
	        print(f"Class {class_id}: {n}")

	    plt.figure(figsize=(8, 8))
	    plot_title = "Class occurences"
	    if dataset_name:
	        plot_title += " in dataset " + dataset_name
	    plt.title(plot_title)
	    plt.xticks(class_ids)
	    plt.bar(class_ids, class_occurences, width=0.4)
	    if histogram_dir:
	        os.makedirs(histogram_dir, exist_ok=True)
	        # dataset_name is optional: leave it out of the file name when absent
	        if dataset_name:
	            histogram_file = "classes_stats_" + dataset_name + ".png"
	        else:
	            histogram_file = "classes_stats.png"
	        plt.savefig(os.path.join(histogram_dir, histogram_file))
	    plt.show()
	    plt.close()


	def num_labels_above_cutoff(dataset_path : str=None,
	                            padded_labels_size : int=15) -> float:
	    """
	    Calculates the percentage of labels files that hold more labels than the
	    maximum number of detections kept per image

	    All the .txt files found directly under the dataset root are parsed and
	    those with strictly more than `padded_labels_size` labels are counted.

	    Args:
	        dataset_path (str): Path of the dataset to analyze
	        padded_labels_size (int) : The max number of detection allowed per image

	    Returns:
	        float : The percentage (0 to 100) of labels files with more labels
	                than `padded_labels_size`

	    Raises:
	        ValueError : if no .jpg file or no .txt file is found under the
	                     dataset root
	        SystemExit : if `padded_labels_size` is not greater than 0
	    """
	    print("\nCalculating number of truncated groundtruth labels:")
	    print("--------------------------------------------------")
	    print("Dataset root:", dataset_path)

	    if padded_labels_size <= 0:
	        print("Please make sure that you provided maximum number of detections bigger than 0")
	        print("Exiting the script...")
	        sys.exit()

	    # `glob` is imported as a function (from glob import glob): call it directly
	    jpg_file_paths = glob(os.path.join(dataset_path, "*.jpg"))
	    if not jpg_file_paths:
	        raise ValueError(f"Could not find any .jpg file under dataset root {dataset_path}")

	    txt_file_paths = glob(os.path.join(dataset_path, "*.txt"))
	    num_examples = len(txt_file_paths)
	    # Without labels files the percentage below would divide by zero
	    if num_examples == 0:
	        raise ValueError(f"Could not find any .txt file under dataset root {dataset_path}")

	    above_cutoff = sum(1 for path in txt_file_paths
	                       if len(parse_label_file(path)) > padded_labels_size)

	    cutoff_percentage = 100 * above_cutoff/num_examples
	    print("Padded labels size:", padded_labels_size)
	    print("Examples with a number of labels greater than padding size: {}/{} ({:.2f}%)".
	          format(above_cutoff, num_examples, cutoff_percentage))

	    return cutoff_percentage


	def num_labels_above_percentage(dataset_path : str=None,
	                                target_percentage : float=0.0) -> int:
	    """
	    Calculates the smallest maximum number of detections per image such that
	    the percentage of labels files exceeding it stays within the target

	    All the .txt files found directly under the dataset root are parsed once.
	    Starting from the largest number of labels found in a file, the candidate
	    size is lowered one by one for as long as the percentage of files with
	    strictly more labels than the candidate does not exceed
	    `target_percentage`. The result is never lower than 1.

	    Args:
	        dataset_path (str) : Path of the dataset to analyze
	        target_percentage (float) : The max percentage of the dataset to be filtered by
	                                    removing images with a lot of detections, in [0.0, 100.0[

	    Returns:
	        int : The corresponding maximum number of detections per image filtered.

	    Raises:
	        ValueError : if no .jpg file or no .txt file is found under the
	                     dataset root
	        SystemExit : if `target_percentage` is outside [0.0, 100.0[
	    """
	    print("\nCalculating number of truncated groundtruth labels:")
	    print("--------------------------------------------------")
	    print("Dataset root:", dataset_path)
	    # Out of range means below 0 OR at/above 100 (an `and` can never be true)
	    if target_percentage < 0.0 or target_percentage >= 100.0:
	        print("Please make sure that you provided maximum percentage of images to filter between [0.0 100[")
	        print("Exiting the script...")
	        sys.exit()

	    # `glob` is imported as a function (from glob import glob): call it directly
	    jpg_file_paths = glob(os.path.join(dataset_path, "*.jpg"))
	    if not jpg_file_paths:
	        raise ValueError(f"Could not find any .jpg file under dataset root {dataset_path}")

	    txt_file_paths = glob(os.path.join(dataset_path, "*.txt"))
	    # Parse every labels file once; the search below only needs the sizes
	    label_sizes = [len(parse_label_file(path)) for path in txt_file_paths]
	    num_examples = len(label_sizes)
	    if num_examples == 0:
	        raise ValueError(f"Could not find any .txt file under dataset root {dataset_path}")

	    padded_labels_size = max(label_sizes)
	    # Number of files above the last accepted size; stays 0 when every
	    # labels file is empty and the loop below does not run
	    above_cutoff = 0
	    while padded_labels_size > 0:
	        candidate = sum(1 for size in label_sizes if size > padded_labels_size)
	        if 100 * candidate/num_examples > target_percentage:
	            break
	        above_cutoff = candidate
	        padded_labels_size -= 1

	    # The loop stops one step below the last accepted size
	    padded_labels_size += 1

	    print("Padded labels size:", padded_labels_size)
	    print("Examples with a number of labels greater than padding size: {}/{} ({:.2f}%)".
	          format(above_cutoff, num_examples, 100 * above_cutoff/num_examples))

	    return padded_labels_size