# KasaHealth / utils / check_overlap.py
# 78anand's picture
# Upload folder using huggingface_hub
# f317798 verified
import numpy as np
import os
# Locations of the original and augmented embedding folders.
# NOTE(review): machine-specific absolute Windows path; the script only runs
# where this directory exists.
base = r"c:\Users\ASUS\lung_ai_project\data"
dir_orig = os.path.join(base, "hear_embeddings")
dir_aug = os.path.join(base, "hear_embeddings_augmented")


def overlap_report(x_orig, y_orig, x_aug, label="sick", atol=1e-5):
    """Yield the text lines of the overlap report, in print order.

    The report covers three things:

    1. the shapes of both embedding arrays;
    2. whether the first row of ``x_orig`` whose entry in ``y_orig`` equals
       ``label`` also occurs as a row of ``x_aug``;
    3. whether ``x_aug`` contains rows that are exact duplicates of each
       other.

    Args:
        x_orig: Original embeddings; indexed by row, so presumably a 2-D
            array of (samples, features) -- confirm against the producer.
        y_orig: Labels compared element-wise against ``label``.
        x_aug: Augmented embeddings; must be 2-D because rows are reduced
            along ``axis=1`` and de-duplicated along ``axis=0``.
        label: Label value whose first sample is searched for.
        atol: Absolute tolerance passed to ``np.isclose``.

    Yields:
        One ``str`` per report line.
    """
    yield f"Original: {x_orig.shape}"
    yield f"Augmented: {x_aug.shape}"

    label_rows = np.where(y_orig == label)[0]
    if label_rows.size == 0:
        # Previously this case crashed with a bare IndexError on the [0]
        # lookup; report it and carry on with the duplicate check instead.
        yield f"No '{label}' samples found in original data; skipping overlap check."
    else:
        sample = x_orig[label_rows[0]]
        # A row counts as a match when every feature is within tolerance.
        # np.isclose also applies its default relative tolerance, so despite
        # the wording of the message this is a near-equality test.
        matches = np.all(np.isclose(x_aug, sample, atol=atol), axis=1)
        if np.any(matches):
            yield f"Found exact match of original {label} sample in augmented data."
            yield f"Count of matches: {int(np.sum(matches))}"
        else:
            yield (
                f"Original {label} sample NOT found in augmented data "
                "(implies transformation or different subset)."
            )

    # Fewer than two rows cannot contain a duplicate, so skip np.unique
    # there (this also sidesteps empty-array handling).
    has_duplicates = False
    if x_aug.shape[0] > 1:
        _, counts = np.unique(x_aug, axis=0, return_counts=True)
        has_duplicates = bool(np.any(counts > 1))

    if has_duplicates:
        yield "Augmented data contains exact duplicates!"
    else:
        yield "Augmented data has unique samples."


def main():
    """Load the embedding files from disk and print the overlap report."""
    x1 = np.load(os.path.join(dir_orig, "X_hear.npy"))
    y1 = np.load(os.path.join(dir_orig, "y_hear.npy"))
    x2 = np.load(os.path.join(dir_aug, "X_hear_aug.npy"))
    # y2 is not used by any check; it is still loaded, as in the original
    # script, so a missing or unreadable label file fails loudly.
    y2 = np.load(os.path.join(dir_aug, "y_hear_aug.npy"))  # noqa: F841

    # Detailed check
    for line in overlap_report(x1, y1, x2):
        print(line)


if __name__ == "__main__":
    main()