import json
import os

import numpy as np
import scipy.io

DATA_DIR = "data"
OUTPUT_FILE = "Webapp/dataset_metadata.json"


def _largest_array_key(mat, keys):
    """Return the key of the largest ndarray among *keys*, or None.

    Heuristic: in these .mat datasets the data matrix is usually the
    biggest array in the file.
    """
    data_key = None
    max_size = 0
    for k in keys:
        arr = mat[k]
        if isinstance(arr, np.ndarray) and arr.size > max_size:
            max_size = arr.size
            data_key = k
    return data_key


def _label_key_for(mat, keys, data_key, n_samples):
    """Return the first non-data array whose length matches *n_samples*.

    Labels may be stored as (n, 1), (1, n) or (n,) depending on how the
    .mat file was written, so both axes are checked. Returns None when no
    candidate matches.
    """
    for k in keys:
        if k == data_key or not isinstance(mat[k], np.ndarray):
            continue
        shape = mat[k].shape
        if shape[0] == n_samples or (len(shape) > 1 and shape[1] == n_samples):
            return k
    return None


def get_metadata(data_dir=DATA_DIR):
    """Scan *data_dir* for .mat files and collect per-dataset statistics.

    For each file, the largest array is assumed to be the data matrix and
    any other array with a matching sample count is assumed to hold the
    labels (key names vary between datasets, hence the heuristics).

    Args:
        data_dir: directory to scan (defaults to the module-level DATA_DIR).

    Returns:
        dict: dataset name -> {"n_samples", "n_features", "n_classes"};
        empty if *data_dir* does not exist. Counts default to 0 when a
        file's layout cannot be interpreted.
    """
    metadata = {}
    if not os.path.exists(data_dir):
        print(f"Data directory {data_dir} not found.")
        return metadata

    for filename in os.listdir(data_dir):
        if not filename.endswith(".mat"):
            continue
        name = filename[:-4]  # strip the ".mat" extension
        path = os.path.join(data_dir, filename)
        try:
            mat = scipy.io.loadmat(path)
        except Exception as e:
            # BUG FIX: the original message printed the literal "(unknown)"
            # instead of the offending filename, making failures untraceable.
            print(f"Error processing {filename}: {e}")
            continue

        n_samples = n_features = n_classes = 0
        # MATLAB bookkeeping keys all start with "__" — skip them.
        keys = [k for k in mat if not k.startswith("__")]

        data_key = _largest_array_key(mat, keys)
        if data_key is not None:
            data = mat[data_key]
            if data.ndim == 2:
                n_samples, n_features = data.shape

            label_key = _label_key_for(mat, keys, data_key, n_samples)
            if label_key is not None:
                n_classes = len(np.unique(mat[label_key]))

        metadata[name] = {
            "n_samples": int(n_samples),
            "n_features": int(n_features),
            "n_classes": int(n_classes),
        }
        print(f"Processed {name}: {n_samples}x{n_features}, {n_classes} classes")

    return metadata


if __name__ == "__main__":
    meta = get_metadata()
    # BUG FIX: ensure the output directory exists before writing — the
    # original crashed with FileNotFoundError when Webapp/ was absent.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(meta, f, indent=2)
    print(f"Metadata saved to {OUTPUT_FILE}")