import os
import json
import scipy.io
import numpy as np

# Directory scanned for .mat dataset files.
DATA_DIR = "data"
# Destination JSON file consumed by the web app front-end.
OUTPUT_FILE = "Webapp/dataset_metadata.json"
def get_metadata(data_dir=None):
    """Collect basic statistics for every ``.mat`` file in *data_dir*.

    For each file, the largest ``ndarray`` in the .mat payload is assumed to
    be the feature matrix (heuristic — .mat key names vary across datasets),
    and any other array whose length matches the sample count is assumed to
    hold the labels.

    Args:
        data_dir: Directory to scan. Defaults to the module-level ``DATA_DIR``.

    Returns:
        dict mapping dataset name (filename without the ``.mat`` suffix) to
        ``{"n_samples": int, "n_features": int, "n_classes": int}``; each
        value is 0 when it could not be determined. Empty dict when the
        directory does not exist.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    metadata = {}
    if not os.path.exists(data_dir):
        print(f"Data directory {data_dir} not found.")
        return metadata
    for filename in os.listdir(data_dir):
        if not filename.endswith(".mat"):
            continue
        name = filename[:-4]  # strip the ".mat" suffix
        path = os.path.join(data_dir, filename)
        try:
            mat = scipy.io.loadmat(path)
            n_samples = 0
            n_features = 0
            n_classes = 0
            # Skip MATLAB bookkeeping entries (__header__, __version__, ...).
            keys = [k for k in mat if not k.startswith("__")]
            # Heuristic: the largest array is the data matrix.
            data_key = None
            max_size = 0
            for k in keys:
                value = mat[k]
                if isinstance(value, np.ndarray) and value.size > max_size:
                    max_size = value.size
                    data_key = k
            if data_key:
                data = mat[data_key]
                if data.ndim == 2:
                    n_samples, n_features = data.shape
                # Heuristic: labels are another array whose length (in either
                # axis) matches the sample count.
                for k in keys:
                    if k == data_key or not isinstance(mat[k], np.ndarray):
                        continue
                    arr = mat[k]
                    if arr.shape and (
                        arr.shape[0] == n_samples
                        or (arr.ndim > 1 and arr.shape[1] == n_samples)
                    ):
                        n_classes = len(np.unique(arr))
                        break
            metadata[name] = {
                "n_samples": int(n_samples),
                "n_features": int(n_features),
                "n_classes": int(n_classes),
            }
            print(f"Processed {name}: {n_samples}x{n_features}, {n_classes} classes")
        except Exception as e:
            # Bug fix: original printed the literal "(unknown)" instead of
            # identifying which file failed.
            print(f"Error processing {filename}: {e}")
    return metadata
if __name__ == "__main__":
    meta = get_metadata()
    # Bug fix: open() fails with FileNotFoundError when the Webapp/ directory
    # is missing (e.g. a fresh checkout) — create it first.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(meta, f, indent=2)
    print(f"Metadata saved to {OUTPUT_FILE}")