import json
import os

import numpy as np
import scipy.io

DATA_DIR = "data"
OUTPUT_FILE = "Webapp/dataset_metadata.json"


def _largest_array_key(mat, keys):
    """Return the key of the largest ndarray among *keys*, or None.

    Heuristic: in these .mat datasets the data matrix is usually the
    biggest array in the file.
    """
    data_key = None
    max_size = 0
    for k in keys:
        arr = mat[k]
        if isinstance(arr, np.ndarray) and arr.size > max_size:
            max_size = arr.size
            data_key = k
    return data_key


def _label_key_for(mat, keys, data_key, n_samples):
    """Return the first non-data array whose length matches *n_samples*.

    Labels may be stored as (n, 1), (1, n) or (n,) depending on how the
    .mat file was written, so both axes are checked. Returns None when no
    candidate matches.
    """
    for k in keys:
        if k == data_key or not isinstance(mat[k], np.ndarray):
            continue
        shape = mat[k].shape
        if shape[0] == n_samples or (len(shape) > 1 and shape[1] == n_samples):
            return k
    return None


def get_metadata(data_dir=DATA_DIR):
    """Scan *data_dir* for .mat files and collect per-dataset statistics.

    For each file, the largest array is assumed to be the data matrix and
    any other array with a matching sample count is assumed to hold the
    labels (key names vary between datasets, hence the heuristics).

    Args:
        data_dir: directory to scan (defaults to the module-level DATA_DIR).

    Returns:
        dict: dataset name -> {"n_samples", "n_features", "n_classes"};
        empty if *data_dir* does not exist. Counts default to 0 when a
        file's layout cannot be interpreted.
    """
    metadata = {}
    if not os.path.exists(data_dir):
        print(f"Data directory {data_dir} not found.")
        return metadata

    for filename in os.listdir(data_dir):
        if not filename.endswith(".mat"):
            continue
        name = filename[:-4]  # strip the ".mat" extension
        path = os.path.join(data_dir, filename)
        try:
            mat = scipy.io.loadmat(path)
        except Exception as e:
            # BUG FIX: the original message printed the literal "(unknown)"
            # instead of the offending filename, making failures untraceable.
            print(f"Error processing {filename}: {e}")
            continue

        n_samples = n_features = n_classes = 0
        # MATLAB bookkeeping keys all start with "__" — skip them.
        keys = [k for k in mat if not k.startswith("__")]

        data_key = _largest_array_key(mat, keys)
        if data_key is not None:
            data = mat[data_key]
            if data.ndim == 2:
                n_samples, n_features = data.shape

            label_key = _label_key_for(mat, keys, data_key, n_samples)
            if label_key is not None:
                n_classes = len(np.unique(mat[label_key]))

        metadata[name] = {
            "n_samples": int(n_samples),
            "n_features": int(n_features),
            "n_classes": int(n_classes),
        }
        print(f"Processed {name}: {n_samples}x{n_features}, {n_classes} classes")

    return metadata


if __name__ == "__main__":
    meta = get_metadata()
    # BUG FIX: ensure the output directory exists before writing — the
    # original crashed with FileNotFoundError when Webapp/ was absent.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(meta, f, indent=2)
    print(f"Metadata saved to {OUTPUT_FILE}")