Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- app/main.py +23 -37
- download_new_dataset.py +36 -28
- kasahealth-frontend/index.html +3 -7
- kasahealth-frontend/static/css/style.css +14 -8
- kasahealth-frontend/static/images/logo.jpeg +0 -0
- kasahealth-frontend/vercel.json +11 -0
- models/compare_and_test.py +70 -0
- models/final_test_50_healthy.py +64 -0
- models/final_test_50_sick.py +63 -0
- models/healthy_test_result.txt +1 -0
- models/hear_classifier_v8_elite.h5 +3 -0
- models/test_results.txt +0 -0
- models/v8_final_test.py +66 -0
- predict_user_file.py +12 -0
- utils/analyze_misses.py +55 -0
- utils/audio_validator.py +45 -0
- utils/audit_mislabels.py +66 -0
- utils/audit_mislabels.txt +19 -0
- utils/extract_elite_samples.py +96 -0
- utils/extract_hybrid_v5.py +106 -0
- utils/hear_extractor.py +19 -3
- utils/inspect_labels.py +29 -0
- utils/process_coughvid.py +115 -0
- utils/purge_and_retrain.py +89 -0
- utils/test_overlap.py +53 -0
app/main.py
CHANGED
|
@@ -52,34 +52,29 @@ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB limit
|
|
| 52 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
| 53 |
|
| 54 |
# Configuration
|
| 55 |
-
MODEL_PATH = os.path.join(project_root, "models", "
|
| 56 |
-
CLASSES_PATH = os.path.join(project_root, "models", "hear_classes_advanced.npy")
|
| 57 |
|
| 58 |
-
# Global variables for
|
| 59 |
extractor = None
|
| 60 |
classifier_model = None
|
| 61 |
-
classes = None
|
| 62 |
|
| 63 |
def load_resources():
|
| 64 |
-
global extractor, classifier_model
|
| 65 |
if extractor is None:
|
| 66 |
print("Initializing HeAR Extractor...")
|
| 67 |
-
# Use the HF_TOKEN from Space Secrets for gated model access
|
| 68 |
hf_token = os.environ.get('HF_TOKEN')
|
| 69 |
extractor = HeARExtractor(token=hf_token)
|
| 70 |
|
| 71 |
if classifier_model is None:
|
| 72 |
-
print(f"Loading Model from {MODEL_PATH}...")
|
| 73 |
classifier_model = load_model(MODEL_PATH, compile=False)
|
| 74 |
-
classes = np.load(CLASSES_PATH)
|
| 75 |
-
print(f"Classes: {classes}")
|
| 76 |
|
| 77 |
@app.route('/')
|
| 78 |
def index():
|
| 79 |
return jsonify({
|
| 80 |
"status": "online",
|
| 81 |
-
"service": "KasaHealth Diagnostic API",
|
| 82 |
-
"version": "1.
|
| 83 |
"message": "Send audio files via POST to /predict"
|
| 84 |
})
|
| 85 |
|
|
@@ -98,39 +93,34 @@ def predict():
|
|
| 98 |
file.save(filepath)
|
| 99 |
|
| 100 |
try:
|
| 101 |
-
# Ensure resources are loaded
|
| 102 |
load_resources()
|
| 103 |
-
|
| 104 |
-
# 1. Load and resample
|
| 105 |
y, sr = librosa.load(filepath, sr=16000, duration=5.0)
|
| 106 |
|
| 107 |
-
#
|
| 108 |
-
# Calculate the root mean square (RMS) energy to find volume
|
| 109 |
rms_energy = np.mean(librosa.feature.rms(y=y))
|
| 110 |
-
|
| 111 |
-
# If the recording is practically silent, reject it
|
| 112 |
if rms_energy < 0.005:
|
| 113 |
os.remove(filepath)
|
| 114 |
-
return jsonify({
|
| 115 |
-
"error": "No cough detected. The recording was too quiet. Please cough forcefully."
|
| 116 |
-
}), 400
|
| 117 |
|
| 118 |
-
# 2. Preprocess
|
| 119 |
y_clean = advanced_preprocess(y, sr)
|
| 120 |
-
|
| 121 |
-
# 3. Extract Features
|
| 122 |
emb = extractor.extract(y_clean)
|
| 123 |
|
| 124 |
if emb is not None:
|
| 125 |
-
# 4. Predict
|
| 126 |
X = emb[np.newaxis, ...]
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
|
| 132 |
-
#
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
if raw_label == "sick" and confidence < THRESHOLD:
|
| 135 |
final_label = "healthy"
|
| 136 |
is_inconclusive = True
|
|
@@ -138,9 +128,7 @@ def predict():
|
|
| 138 |
final_label = raw_label
|
| 139 |
is_inconclusive = False
|
| 140 |
|
| 141 |
-
# Clean up file
|
| 142 |
os.remove(filepath)
|
| 143 |
-
|
| 144 |
return jsonify({
|
| 145 |
"status": "success",
|
| 146 |
"result": final_label,
|
|
@@ -151,12 +139,10 @@ def predict():
|
|
| 151 |
})
|
| 152 |
else:
|
| 153 |
os.remove(filepath)
|
| 154 |
-
return jsonify({"error": "
|
| 155 |
|
| 156 |
except Exception as e:
|
| 157 |
-
if os.path.exists(filepath):
|
| 158 |
-
os.remove(filepath)
|
| 159 |
-
print(f"Error processing audio: {e}")
|
| 160 |
return jsonify({"error": str(e)}), 500
|
| 161 |
|
| 162 |
def get_recommendation(label, is_inconclusive):
|
|
|
|
| 52 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
| 53 |
|
| 54 |
# Configuration
|
| 55 |
+
MODEL_PATH = os.path.join(project_root, "models", "hear_classifier_v8_elite.h5")
|
|
|
|
| 56 |
|
| 57 |
+
# Global variables for lazy loading
|
| 58 |
extractor = None
|
| 59 |
classifier_model = None
|
|
|
|
| 60 |
|
| 61 |
def load_resources():
|
| 62 |
+
global extractor, classifier_model
|
| 63 |
if extractor is None:
|
| 64 |
print("Initializing HeAR Extractor...")
|
|
|
|
| 65 |
hf_token = os.environ.get('HF_TOKEN')
|
| 66 |
extractor = HeARExtractor(token=hf_token)
|
| 67 |
|
| 68 |
if classifier_model is None:
|
| 69 |
+
print(f"Loading Elite Model from {MODEL_PATH}...")
|
| 70 |
classifier_model = load_model(MODEL_PATH, compile=False)
|
|
|
|
|
|
|
| 71 |
|
| 72 |
@app.route('/')
|
| 73 |
def index():
|
| 74 |
return jsonify({
|
| 75 |
"status": "online",
|
| 76 |
+
"service": "KasaHealth Diagnostic API (Elite V8)",
|
| 77 |
+
"version": "1.2.0",
|
| 78 |
"message": "Send audio files via POST to /predict"
|
| 79 |
})
|
| 80 |
|
|
|
|
| 93 |
file.save(filepath)
|
| 94 |
|
| 95 |
try:
|
|
|
|
| 96 |
load_resources()
|
|
|
|
|
|
|
| 97 |
y, sr = librosa.load(filepath, sr=16000, duration=5.0)
|
| 98 |
|
| 99 |
+
# VAD Lite
|
|
|
|
| 100 |
rms_energy = np.mean(librosa.feature.rms(y=y))
|
|
|
|
|
|
|
| 101 |
if rms_energy < 0.005:
|
| 102 |
os.remove(filepath)
|
| 103 |
+
return jsonify({"error": "No cough detected. Please record in a quieter area."}), 400
|
|
|
|
|
|
|
| 104 |
|
|
|
|
| 105 |
y_clean = advanced_preprocess(y, sr)
|
|
|
|
|
|
|
| 106 |
emb = extractor.extract(y_clean)
|
| 107 |
|
| 108 |
if emb is not None:
|
|
|
|
| 109 |
X = emb[np.newaxis, ...]
|
| 110 |
+
prob = classifier_model.predict(X, verbose=0)[0][0]
|
| 111 |
+
|
| 112 |
+
# Logic: sigmoid (0=healthy, 1=sick)
|
| 113 |
+
raw_label = "sick" if prob > 0.5 else "healthy"
|
| 114 |
|
| 115 |
+
# Confidence is distance from decision boundary
|
| 116 |
+
if raw_label == "sick":
|
| 117 |
+
confidence = float(prob)
|
| 118 |
+
else:
|
| 119 |
+
confidence = float(1.0 - prob)
|
| 120 |
+
|
| 121 |
+
# --- Elite Threshold Check ---
|
| 122 |
+
# Only report 'sick' if we are VERY sure (> 0.8)
|
| 123 |
+
THRESHOLD = 0.80
|
| 124 |
if raw_label == "sick" and confidence < THRESHOLD:
|
| 125 |
final_label = "healthy"
|
| 126 |
is_inconclusive = True
|
|
|
|
| 128 |
final_label = raw_label
|
| 129 |
is_inconclusive = False
|
| 130 |
|
|
|
|
| 131 |
os.remove(filepath)
|
|
|
|
| 132 |
return jsonify({
|
| 133 |
"status": "success",
|
| 134 |
"result": final_label,
|
|
|
|
| 139 |
})
|
| 140 |
else:
|
| 141 |
os.remove(filepath)
|
| 142 |
+
return jsonify({"error": "Feature extraction failed"}), 500
|
| 143 |
|
| 144 |
except Exception as e:
|
| 145 |
+
if os.path.exists(filepath): os.remove(filepath)
|
|
|
|
|
|
|
| 146 |
return jsonify({"error": str(e)}), 500
|
| 147 |
|
| 148 |
def get_recommendation(label, is_inconclusive):
|
download_new_dataset.py
CHANGED
|
@@ -8,7 +8,7 @@ import shutil
|
|
| 8 |
# --- Configuration ---
|
| 9 |
DATASET_SLUG = "andrewmvd/covid19-cough-audio-classification"
|
| 10 |
DOWNLOAD_DIR = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public"
|
| 11 |
-
EXTRACT_DIR =
|
| 12 |
ORGANIZED_DIR = os.path.join(DOWNLOAD_DIR, "organized")
|
| 13 |
|
| 14 |
def run_command(cmd):
|
|
@@ -18,36 +18,46 @@ def run_command(cmd):
|
|
| 18 |
print(f"Error executing command: {e}")
|
| 19 |
return False
|
| 20 |
return True
|
| 21 |
-
|
| 22 |
def main():
|
| 23 |
if not os.path.exists(DOWNLOAD_DIR):
|
| 24 |
os.makedirs(DOWNLOAD_DIR)
|
| 25 |
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
if not metadata_file:
|
| 50 |
-
print("Metadata CSV not found.
|
| 51 |
return
|
| 52 |
|
| 53 |
print(f"Step 3: Organizing files based on {metadata_file}...")
|
|
@@ -89,9 +99,7 @@ def main():
|
|
| 89 |
else: count_s += 1
|
| 90 |
|
| 91 |
print(f"Done! Organized {count_h} healthy and {count_s} sick files.")
|
| 92 |
-
print(f"
|
| 93 |
-
|
| 94 |
-
print(f"Finished organizing {processed_count} files into {ORGANIZED_DIR}")
|
| 95 |
|
| 96 |
if __name__ == "__main__":
|
| 97 |
main()
|
|
|
|
| 8 |
# --- Configuration ---
|
| 9 |
DATASET_SLUG = "andrewmvd/covid19-cough-audio-classification"
|
| 10 |
DOWNLOAD_DIR = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public"
|
| 11 |
+
EXTRACT_DIR = DOWNLOAD_DIR # Use the base folder since user has manually unzipped there
|
| 12 |
ORGANIZED_DIR = os.path.join(DOWNLOAD_DIR, "organized")
|
| 13 |
|
| 14 |
def run_command(cmd):
|
|
|
|
| 18 |
print(f"Error executing command: {e}")
|
| 19 |
return False
|
| 20 |
return True
|
|
|
|
| 21 |
def main():
|
| 22 |
if not os.path.exists(DOWNLOAD_DIR):
|
| 23 |
os.makedirs(DOWNLOAD_DIR)
|
| 24 |
|
| 25 |
+
# Check if we already have the metadata CSV to skip steps 1 and 2
|
| 26 |
+
metadata_filename = "metadata_compiled.csv"
|
| 27 |
+
potential_metadata = os.path.join(DOWNLOAD_DIR, metadata_filename)
|
| 28 |
+
|
| 29 |
+
if os.path.exists(potential_metadata):
|
| 30 |
+
print(f"Step 1 & 2: Skipping! {metadata_filename} already found in {DOWNLOAD_DIR}")
|
| 31 |
+
metadata_file = potential_metadata
|
| 32 |
+
else:
|
| 33 |
+
# 1. Download dataset using Kaggle CLI
|
| 34 |
+
print(f"Step 1: Downloading {DATASET_SLUG}...")
|
| 35 |
+
if not run_command(f"kaggle datasets download -d {DATASET_SLUG} -p {DOWNLOAD_DIR}"):
|
| 36 |
+
print("Failed to download dataset. Check your Kaggle API key at ~/.kaggle/kaggle.json")
|
| 37 |
+
return
|
| 38 |
+
|
| 39 |
+
# 2. Extract dataset
|
| 40 |
+
zips = [f for f in os.listdir(DOWNLOAD_DIR) if f.endswith(".zip")]
|
| 41 |
+
if not zips:
|
| 42 |
+
print(f"No zip found in {DOWNLOAD_DIR}. Was it already unzipped?")
|
| 43 |
+
else:
|
| 44 |
+
zip_filename = zips[0]
|
| 45 |
+
zip_path = os.path.join(DOWNLOAD_DIR, zip_filename)
|
| 46 |
+
print(f"Step 2: Extracting {zip_filename} to {DOWNLOAD_DIR}...")
|
| 47 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 48 |
+
zip_ref.extractall(DOWNLOAD_DIR)
|
| 49 |
|
| 50 |
+
# 3. Find and load metadata again (in case it unzipped somewhere else)
|
| 51 |
+
metadata_file = None
|
| 52 |
+
for root, dirs, files in os.walk(DOWNLOAD_DIR):
|
| 53 |
+
for f in files:
|
| 54 |
+
if "metadata" in f.lower() and f.endswith(".csv"):
|
| 55 |
+
metadata_file = os.path.join(root, f)
|
| 56 |
+
break
|
| 57 |
+
if metadata_file: break
|
| 58 |
+
|
| 59 |
if not metadata_file:
|
| 60 |
+
print("Metadata CSV not found. Please ensure the dataset is downloaded correctly.")
|
| 61 |
return
|
| 62 |
|
| 63 |
print(f"Step 3: Organizing files based on {metadata_file}...")
|
|
|
|
| 99 |
else: count_s += 1
|
| 100 |
|
| 101 |
print(f"Done! Organized {count_h} healthy and {count_s} sick files.")
|
| 102 |
+
print(f"Organized datasets location: {ORGANIZED_DIR}")
|
|
|
|
|
|
|
| 103 |
|
| 104 |
if __name__ == "__main__":
|
| 105 |
main()
|
kasahealth-frontend/index.html
CHANGED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
|
| 19 |
<!-- Styles -->
|
| 20 |
<link rel="stylesheet" href="static/css/style.css">
|
| 21 |
-
<link rel="icon" type="image/
|
| 22 |
</head>
|
| 23 |
<body>
|
| 24 |
|
|
@@ -26,9 +26,8 @@
|
|
| 26 |
<nav class="glass" id="main-nav">
|
| 27 |
<div class="container nav-container">
|
| 28 |
<a href="#" class="logo-wrap" onclick="location.reload()">
|
| 29 |
-
<img src="static/images/logo.
|
| 30 |
</a>
|
| 31 |
-
<span class="credibility-tag"><i class="fas fa-shield-alt"></i> Clinical AI Edge</span>
|
| 32 |
</div>
|
| 33 |
</nav>
|
| 34 |
|
|
@@ -39,9 +38,6 @@
|
|
| 39 |
<!-- ============================================== -->
|
| 40 |
<section id="screen-landing" class="screen active-screen">
|
| 41 |
<div class="hero-content">
|
| 42 |
-
<div class="badge">
|
| 43 |
-
<i class="fas fa-microchip"></i> Powered by Google HeAR API
|
| 44 |
-
</div>
|
| 45 |
<h1>Next-Gen Acoustic<br><span>Lung Screening</span></h1>
|
| 46 |
<p>Smartphone-based respiratory analysis to detect early signs of pulmonary anomalies with clinical-grade precision.</p>
|
| 47 |
|
|
@@ -75,7 +71,7 @@
|
|
| 75 |
<i class="fas fa-mobile-alt anim-float text-navy"></i>
|
| 76 |
</div>
|
| 77 |
<h4>1. Hold Phone Closer</h4>
|
| 78 |
-
<p>
|
| 79 |
</div>
|
| 80 |
|
| 81 |
<!-- Step 2 -->
|
|
|
|
| 18 |
|
| 19 |
<!-- Styles -->
|
| 20 |
<link rel="stylesheet" href="static/css/style.css">
|
| 21 |
+
<link rel="icon" type="image/jpeg" href="static/images/logo.jpeg">
|
| 22 |
</head>
|
| 23 |
<body>
|
| 24 |
|
|
|
|
| 26 |
<nav class="glass" id="main-nav">
|
| 27 |
<div class="container nav-container">
|
| 28 |
<a href="#" class="logo-wrap" onclick="location.reload()">
|
| 29 |
+
<img src="static/images/logo.jpeg" alt="KasaHealth Logo">
|
| 30 |
</a>
|
|
|
|
| 31 |
</div>
|
| 32 |
</nav>
|
| 33 |
|
|
|
|
| 38 |
<!-- ============================================== -->
|
| 39 |
<section id="screen-landing" class="screen active-screen">
|
| 40 |
<div class="hero-content">
|
|
|
|
|
|
|
|
|
|
| 41 |
<h1>Next-Gen Acoustic<br><span>Lung Screening</span></h1>
|
| 42 |
<p>Smartphone-based respiratory analysis to detect early signs of pulmonary anomalies with clinical-grade precision.</p>
|
| 43 |
|
|
|
|
| 71 |
<i class="fas fa-mobile-alt anim-float text-navy"></i>
|
| 72 |
</div>
|
| 73 |
<h4>1. Hold Phone Closer</h4>
|
| 74 |
+
<p>Keep it 4-5 inches away from your mouth for best acoustic quality.</p>
|
| 75 |
</div>
|
| 76 |
|
| 77 |
<!-- Step 2 -->
|
kasahealth-frontend/static/css/style.css
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
:root {
|
| 2 |
-
|
| 3 |
-
--primary-
|
| 4 |
-
--
|
|
|
|
| 5 |
--text-main: #2c3e50;
|
| 6 |
--text-light: #576574;
|
| 7 |
--bg-surface: #ffffff;
|
| 8 |
-
--bg-page: #
|
| 9 |
--border-soft: #e2e8f0;
|
| 10 |
|
| 11 |
-
--color-success: #
|
| 12 |
--color-danger: #e74c3c;
|
| 13 |
--color-warning: #f59e0b;
|
| 14 |
|
|
@@ -63,9 +64,14 @@ nav {
|
|
| 63 |
border-bottom: 1px solid rgba(0,0,0,0.05);
|
| 64 |
}
|
| 65 |
|
| 66 |
-
.nav-container { display: flex; justify-content: space-between; align-items: center; padding: 1rem 1.5rem; }
|
| 67 |
-
.logo-wrap
|
| 68 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
/* Main Container (SPA layout) */
|
| 71 |
.app-container {
|
|
|
|
| 1 |
:root {
|
| 2 |
+
/* Exact colors sampled from KasaHealth Logo */
|
| 3 |
+
--primary-navy: #184e85; /* From "Kasa" */
|
| 4 |
+
--primary-teal: #26b797; /* From "Health" */
|
| 5 |
+
--accent-teal: #e2f6f2; /* Soft background companion */
|
| 6 |
--text-main: #2c3e50;
|
| 7 |
--text-light: #576574;
|
| 8 |
--bg-surface: #ffffff;
|
| 9 |
+
--bg-page: #f9fbfb; /* Very subtle clinical blue-tint */
|
| 10 |
--border-soft: #e2e8f0;
|
| 11 |
|
| 12 |
+
--color-success: #26b797; /* Re-mapped success to brand teal */
|
| 13 |
--color-danger: #e74c3c;
|
| 14 |
--color-warning: #f59e0b;
|
| 15 |
|
|
|
|
| 64 |
border-bottom: 1px solid rgba(0,0,0,0.05);
|
| 65 |
}
|
| 66 |
|
| 67 |
+
.nav-container { display: flex; justify-content: space-between; align-items: center; padding: 1rem 1.5rem; max-width: 1200px; margin: 0 auto; width: 100%; }
|
| 68 |
+
.logo-wrap { display: flex; align-items: center; height: 100%; }
|
| 69 |
+
.logo-wrap img { height: 55px; object-fit: contain; } /* Mobile size */
|
| 70 |
+
|
| 71 |
+
@media (min-width: 768px) {
|
| 72 |
+
.logo-wrap img { height: 85px; } /* Crisp, large size for laptops and desktops */
|
| 73 |
+
.nav-container { padding: 1.5rem 2rem; }
|
| 74 |
+
}
|
| 75 |
|
| 76 |
/* Main Container (SPA layout) */
|
| 77 |
.app-container {
|
kasahealth-frontend/static/images/logo.jpeg
ADDED
|
kasahealth-frontend/vercel.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": 2,
|
| 3 |
+
"name": "kasahealth-elite",
|
| 4 |
+
"builds": [
|
| 5 |
+
{ "src": "index.html", "use": "@vercel/static" },
|
| 6 |
+
{ "src": "static/**", "use": "@vercel/static" }
|
| 7 |
+
],
|
| 8 |
+
"routes": [
|
| 9 |
+
{ "src": "/(.*)", "dest": "/index.html" }
|
| 10 |
+
]
|
| 11 |
+
}
|
models/compare_and_test.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import tensorflow as tf
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
from sklearn.utils import shuffle
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
|
| 8 |
+
# Paths
|
| 9 |
+
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
|
| 10 |
+
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
|
| 11 |
+
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
|
| 12 |
+
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
|
| 13 |
+
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
|
| 14 |
+
|
| 15 |
+
# Clean y
|
| 16 |
+
def clean_y(y):
|
| 17 |
+
if y.dtype.kind in ['U', 'S']:
|
| 18 |
+
return np.where(y == 'sick', 1, 0).astype(np.float32)
|
| 19 |
+
return y.astype(np.float32)
|
| 20 |
+
|
| 21 |
+
# Load data (Same data we used, but we need an "unseen" set)
|
| 22 |
+
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
|
| 23 |
+
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
|
| 24 |
+
X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
|
| 25 |
+
y_all = np.concatenate([y1, y2], axis=0)
|
| 26 |
+
|
| 27 |
+
# Shift validation set for testing (Using a seed NOT used in training)
|
| 28 |
+
# Actually, let's just use the validation split from the 11k to be sure it's consistent.
|
| 29 |
+
# For truly unseen, let's use a 15% split like before but with 40 random samples.
|
| 30 |
+
|
| 31 |
+
X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=999) # New seed 999
|
| 32 |
+
|
| 33 |
+
def run_test_on_model(model_name, model_path):
|
| 34 |
+
if not os.path.exists(model_path):
|
| 35 |
+
print(f"Model {model_name} not found yet. Skipping...")
|
| 36 |
+
return
|
| 37 |
+
|
| 38 |
+
model = tf.keras.models.load_model(model_path)
|
| 39 |
+
|
| 40 |
+
# 20 Healthy
|
| 41 |
+
h_idx = np.where(y_target == 0)[0]
|
| 42 |
+
s_idx = np.where(y_target == 1)[0]
|
| 43 |
+
|
| 44 |
+
rng = np.random.default_rng(2026)
|
| 45 |
+
sel_h = rng.choice(h_idx, 20, replace=False)
|
| 46 |
+
sel_s = rng.choice(s_idx, 20, replace=False)
|
| 47 |
+
|
| 48 |
+
X_h_test = X_target[sel_h]
|
| 49 |
+
X_s_test = X_target[sel_s]
|
| 50 |
+
|
| 51 |
+
# Predictions
|
| 52 |
+
preds_h = (model.predict(X_h_test) > 0.5).astype(int).flatten()
|
| 53 |
+
preds_s = (model.predict(X_s_test) > 0.5).astype(int).flatten()
|
| 54 |
+
|
| 55 |
+
acc_h = (np.sum(preds_h == 0) / 20) * 100
|
| 56 |
+
acc_s = (np.sum(preds_s == 1) / 20) * 100
|
| 57 |
+
|
| 58 |
+
print(f"\n--- Model: {model_name} ---")
|
| 59 |
+
print(f"Healthy Accuracy (20/20 Target): {acc_h:.2f}% ({np.sum(preds_h == 0)}/20)")
|
| 60 |
+
print(f"Sick Accuracy (20/20 Target): {acc_s:.2f}% ({np.sum(preds_s == 1)}/20)")
|
| 61 |
+
print(f"Total Model Accuracy (40 samples): {(acc_h + acc_s)/2:.2f}%")
|
| 62 |
+
|
| 63 |
+
# Models to compare
|
| 64 |
+
models_map = {
|
| 65 |
+
"V3 (Standard)": r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v3.h5",
|
| 66 |
+
"V5 PRO (Balanced)": r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
for name, path in models_map.items():
|
| 70 |
+
run_test_on_model(name, path)
|
models/final_test_50_healthy.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import tensorflow as tf
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
from sklearn.utils import shuffle
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Paths
|
| 8 |
+
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
|
| 9 |
+
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
|
| 10 |
+
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
|
| 11 |
+
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
|
| 12 |
+
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
|
| 13 |
+
|
| 14 |
+
# Clean y
|
| 15 |
+
def clean_y(y):
|
| 16 |
+
if y.dtype.kind in ['U', 'S']:
|
| 17 |
+
return np.where(y == 'sick', 1, 0).astype(np.float32)
|
| 18 |
+
return y.astype(np.float32)
|
| 19 |
+
|
| 20 |
+
# Load data
|
| 21 |
+
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
|
| 22 |
+
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
|
| 23 |
+
X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
|
| 24 |
+
y_all = np.concatenate([y1, y2], axis=0)
|
| 25 |
+
|
| 26 |
+
# Unseen set
|
| 27 |
+
X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=777) # Seed 777
|
| 28 |
+
|
| 29 |
+
# Load Model
|
| 30 |
+
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
|
| 31 |
+
model = tf.keras.models.load_model(model_path)
|
| 32 |
+
|
| 33 |
+
# Filter for HEALTHY samples
|
| 34 |
+
h_idx = np.where(y_target == 0)[0]
|
| 35 |
+
if len(h_idx) < 50:
|
| 36 |
+
print(f"Warning: Only found {len(h_idx)} healthy samples.")
|
| 37 |
+
count = len(h_idx)
|
| 38 |
+
else:
|
| 39 |
+
count = 50
|
| 40 |
+
|
| 41 |
+
rng = np.random.default_rng(2026)
|
| 42 |
+
sel_h = rng.choice(h_idx, count, replace=False)
|
| 43 |
+
X_h_test = X_target[sel_h]
|
| 44 |
+
y_h_test = y_target[sel_h]
|
| 45 |
+
|
| 46 |
+
# Predictions
|
| 47 |
+
preds = model.predict(X_h_test)
|
| 48 |
+
y_pred_bin = (preds > 0.5).astype(int).flatten()
|
| 49 |
+
|
| 50 |
+
# Results (Correct = 0)
|
| 51 |
+
correct = np.sum(y_pred_bin == 0)
|
| 52 |
+
accuracy = (correct / count) * 100
|
| 53 |
+
|
| 54 |
+
print(f"\n--- Final Model: {os.path.basename(model_path)} ---")
|
| 55 |
+
print(f"Total Unseen Healthy Samples Tested: {count}")
|
| 56 |
+
print(f"Correct Identifications: {correct}")
|
| 57 |
+
print(f"Accuracy: {accuracy:.2f}%")
|
| 58 |
+
|
| 59 |
+
print("\n--- Summary of Correctness (Sick Confidence) ---")
|
| 60 |
+
for i in range(count):
|
| 61 |
+
conf = preds[i][0]
|
| 62 |
+
# For healthy, we want low sick confidence
|
| 63 |
+
result = "✅ HEALTHY" if y_pred_bin[i] == 0 else "❌ SICK (False Alarm)"
|
| 64 |
+
print(f"Sample {i+1:2d} | Sick Prob {conf*100:5.2f}% | Result: {result}")
|
models/final_test_50_sick.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import tensorflow as tf
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
from sklearn.utils import shuffle
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Paths
|
| 8 |
+
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
|
| 9 |
+
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
|
| 10 |
+
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
|
| 11 |
+
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
|
| 12 |
+
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
|
| 13 |
+
|
| 14 |
+
# Clean y
|
| 15 |
+
def clean_y(y):
|
| 16 |
+
if y.dtype.kind in ['U', 'S']:
|
| 17 |
+
return np.where(y == 'sick', 1, 0).astype(np.float32)
|
| 18 |
+
return y.astype(np.float32)
|
| 19 |
+
|
| 20 |
+
# Load data (Includes the new 1000 samples)
|
| 21 |
+
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
|
| 22 |
+
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
|
| 23 |
+
X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
|
| 24 |
+
y_all = np.concatenate([y1, y2], axis=0)
|
| 25 |
+
|
| 26 |
+
# Shift validation set for testing (Using a seed NOT used in training)
|
| 27 |
+
X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=777) # Seed 777
|
| 28 |
+
|
| 29 |
+
# Load Model
|
| 30 |
+
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
|
| 31 |
+
model = tf.keras.models.load_model(model_path)
|
| 32 |
+
|
| 33 |
+
# Filter for SICK samples
|
| 34 |
+
s_idx = np.where(y_target == 1)[0]
|
| 35 |
+
if len(s_idx) < 50:
|
| 36 |
+
print(f"Warning: Only found {len(s_idx)} sick samples in target set.")
|
| 37 |
+
count = len(s_idx)
|
| 38 |
+
else:
|
| 39 |
+
count = 50
|
| 40 |
+
|
| 41 |
+
rng = np.random.default_rng(2026)
|
| 42 |
+
sel_s = rng.choice(s_idx, count, replace=False)
|
| 43 |
+
X_s_test = X_target[sel_s]
|
| 44 |
+
y_s_test = y_target[sel_s]
|
| 45 |
+
|
| 46 |
+
# Predictions
|
| 47 |
+
preds = model.predict(X_s_test)
|
| 48 |
+
y_pred_bin = (preds > 0.5).astype(int).flatten()
|
| 49 |
+
|
| 50 |
+
# Results
|
| 51 |
+
correct = np.sum(y_pred_bin == 1)
|
| 52 |
+
accuracy = (correct / count) * 100
|
| 53 |
+
|
| 54 |
+
print(f"\n--- Final Model: {os.path.basename(model_path)} ---")
|
| 55 |
+
print(f"Total Unseen Sick Samples Tested: {count}")
|
| 56 |
+
print(f"Correct Identifications: {correct}")
|
| 57 |
+
print(f"Accuracy: {accuracy:.2f}%")
|
| 58 |
+
|
| 59 |
+
print("\n--- Summary of Correctness ---")
|
| 60 |
+
for i in range(count):
|
| 61 |
+
conf = preds[i][0]
|
| 62 |
+
result = "✅ SICK" if y_pred_bin[i] == 1 else "❌ HEALTHY (Miss)"
|
| 63 |
+
print(f"Sample {i+1:2d} | Confidence {conf*100:5.2f}% | Result: {result}")
|
models/healthy_test_result.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Correct Healthy: 19/20, Accuracy: 95.00%
|
models/hear_classifier_v8_elite.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e65637808333de73d6d389c835639dbe198b50c45ef6c8202604cd4c332d5e3b
|
| 3 |
+
size 14286224
|
models/test_results.txt
ADDED
|
Binary file (1.88 kB). View file
|
|
|
models/v8_final_test.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import tensorflow as tf
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
from sklearn.utils import shuffle
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Paths
|
| 8 |
+
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
|
| 9 |
+
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
|
| 10 |
+
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
|
| 11 |
+
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
|
| 12 |
+
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
|
| 13 |
+
path_elite_x = os.path.join(base_dir, "hear_embeddings_elite", "X_elite.npy")
|
| 14 |
+
path_elite_y = os.path.join(base_dir, "hear_embeddings_elite", "y_elite.npy")
|
| 15 |
+
|
| 16 |
+
# Clean y
|
| 17 |
+
def clean_y(y):
|
| 18 |
+
if y.dtype.kind in ['U', 'S']:
|
| 19 |
+
return np.where(y == 'sick', 1, 0).astype(np.float32)
|
| 20 |
+
return y.astype(np.float32)
|
| 21 |
+
|
| 22 |
+
# Load data (use the FULL set to find "truly unseen" samples)
|
| 23 |
+
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
|
| 24 |
+
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
|
| 25 |
+
Xe, ye = np.load(path_elite_x), clean_y(np.load(path_elite_y))
|
| 26 |
+
X_all = np.concatenate([X1, X2, Xe], axis=0).astype(np.float32)
|
| 27 |
+
y_all = np.concatenate([y1, y2, ye], axis=0)
|
| 28 |
+
|
| 29 |
+
# Unseen set (Seed 1000 for the final test)
|
| 30 |
+
X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=1000)
|
| 31 |
+
|
| 32 |
+
# Load V8 Elite Model
|
| 33 |
+
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v8_elite.h5"
|
| 34 |
+
model = tf.keras.models.load_model(model_path)
|
| 35 |
+
|
| 36 |
+
# Filter for SICK and HEALTHY
|
| 37 |
+
s_idx = np.where(y_target == 1)[0]
|
| 38 |
+
h_idx = np.where(y_target == 0)[0]
|
| 39 |
+
|
| 40 |
+
rng = np.random.default_rng(2027) # Change seed again to ensure randomness
|
| 41 |
+
sel_s = rng.choice(s_idx, 50, replace=False)
|
| 42 |
+
sel_h = rng.choice(h_idx, 50, replace=False)
|
| 43 |
+
|
| 44 |
+
# Predictions - Sick
|
| 45 |
+
preds_s = model.predict(X_target[sel_s])
|
| 46 |
+
y_pred_s = (preds_s > 0.5).astype(int).flatten()
|
| 47 |
+
acc_s = (np.sum(y_pred_s == 1) / 50) * 100
|
| 48 |
+
|
| 49 |
+
# Predictions - Healthy
|
| 50 |
+
preds_h = model.predict(X_target[sel_h])
|
| 51 |
+
y_pred_h = (preds_h > 0.5).astype(int).flatten()
|
| 52 |
+
acc_h = (np.sum(y_pred_h == 0) / 50) * 100
|
| 53 |
+
|
| 54 |
+
print(f"\n--- Model V8: THE ELITE GUARD ---")
|
| 55 |
+
print(f"Dataset: Elite-Merged & Purged (10,342 Balanced Samples)")
|
| 56 |
+
print(f"\nSICK TEST (50 Samples): {acc_s:.2f}% ({np.sum(y_pred_s == 1)}/50)")
|
| 57 |
+
print(f"HEALTHY TEST (50 Samples): {acc_h:.2f}% ({np.sum(y_pred_h == 0)}/50)")
|
| 58 |
+
print(f"OVERALL ACCURACY: {(acc_s + acc_h)/2:.2f}%")
|
| 59 |
+
|
| 60 |
+
# Individual report for Sick misses
|
| 61 |
+
miss_s = 50 - np.sum(y_pred_s == 1)
|
| 62 |
+
print(f"\nNote: Misidentified {miss_s} sick samples.")
|
| 63 |
+
if miss_s > 0:
|
| 64 |
+
for i in range(miss_s):
|
| 65 |
+
miss_idx = np.where(y_pred_s == 0)[0][i]
|
| 66 |
+
print(f"Missed Sick {i+1:2d} | Confidence: {preds_s[miss_idx][0]*100:5.2f}%")
|
predict_user_file.py
CHANGED
|
@@ -51,6 +51,18 @@ def predict_single_file(file_path):
|
|
| 51 |
print("Loading and preprocessing audio...")
|
| 52 |
y, sr = librosa.load(file_path, sr=16000, duration=5.0)
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
# Apply Advanced Preprocessing (Critical for correct result!)
|
| 55 |
y_clean = advanced_preprocess(y, sr)
|
| 56 |
|
|
|
|
| 51 |
print("Loading and preprocessing audio...")
|
| 52 |
y, sr = librosa.load(file_path, sr=16000, duration=5.0)
|
| 53 |
|
| 54 |
+
# --- NEW: Audio Validation (Gatekeeper) ---
|
| 55 |
+
from utils.audio_validator import validate_audio_is_cough
|
| 56 |
+
is_valid_cough, reason, val_conf = validate_audio_is_cough(y, sr)
|
| 57 |
+
|
| 58 |
+
if not is_valid_cough:
|
| 59 |
+
print("\n" + "="*50)
|
| 60 |
+
print(f"REJECTED: Audio Validation Failed!")
|
| 61 |
+
print(f"REASON: {reason}")
|
| 62 |
+
print(f"RECOMMENDATION: Please record a clear, loud cough in a quiet room.")
|
| 63 |
+
print("="*50)
|
| 64 |
+
return
|
| 65 |
+
|
| 66 |
# Apply Advanced Preprocessing (Critical for correct result!)
|
| 67 |
y_clean = advanced_preprocess(y, sr)
|
| 68 |
|
utils/analyze_misses.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Audit the merged embedding dataset: report how many sick / healthy
samples the v3 classifier misclassifies.

Loads the two embedding sets (original + Coughvid), merges them, runs the
saved model over everything, and prints per-class error rates.
"""
import numpy as np
import tensorflow as tf
import os

# Paths to the two embedding datasets that make up the merged corpus.
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")


def clean_y(y):
    """Map string labels ('sick' -> 1, anything else -> 0) to float32;
    numeric label arrays are cast through unchanged."""
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)


X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))

# Merge without shuffling so indices stay traceable to the source sets.
X = np.concatenate([X1, X2], axis=0).astype(np.float32)
y = np.concatenate([y1, y2], axis=0)

# Load Model
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v3.h5"
model = tf.keras.models.load_model(model_path)

# Predict all samples and binarize at the standard 0.5 threshold.
preds = model.predict(X, batch_size=128)
y_pred_bin = (preds > 0.5).astype(int).flatten()

# Misclassified SICK samples (label 1 predicted as 0).
sick_mask = (y == 1)
total_sick = np.sum(sick_mask)
total_mis_sick = np.sum(sick_mask & (y_pred_bin == 0))

# Misclassified HEALTHY samples (label 0 predicted as 1).
healthy_mask = (y == 0)
total_healthy = np.sum(healthy_mask)
total_mis_healthy = np.sum(healthy_mask & (y_pred_bin == 1))

# Guard against empty classes so the report never divides by zero.
sick_rate = (total_mis_sick / total_sick * 100) if total_sick else 0.0
healthy_rate = (total_mis_healthy / total_healthy * 100) if total_healthy else 0.0

print(f"Total Sick: {total_sick}, Misclassified: {total_mis_sick} ({sick_rate:.2f}%)")
print(f"Total Healthy: {total_healthy}, Misclassified: {total_mis_healthy} ({healthy_rate:.2f}%)")

# Overall corpus size for context.
print(f"Total Samples: {len(X)}")
|
utils/audio_validator.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
def validate_audio_is_cough(y, sr):
    """
    Heuristic gatekeeper: decide whether a waveform looks like a cough
    rather than silence, sustained speech, laughter, or steady noise.
    Returns: (is_valid: bool, reason: str, confidence: float)
    """
    try:
        # Frame-level features used by the heuristics below.
        energy = librosa.feature.rms(y=y)[0]
        crossings = librosa.feature.zero_crossing_rate(y=y)[0]
        brightness = librosa.feature.spectral_centroid(y=y, sr=sr)[0]

        # Guard 1: reject silence / near-silence outright.
        avg_energy = np.mean(energy)
        if avg_energy < 0.001:
            return False, "Audio is too quiet or empty.", 0.0

        # Coughs are bursty: energy variance relative to its mean is high,
        # whereas talking/laughing produces a steadier energy envelope.
        burstiness = np.var(energy) / (avg_energy + 1e-6)

        # Spectral centroid is computed for parity with the original
        # feature set but does not participate in the decision below.
        avg_brightness = np.mean(brightness)

        # Coughs are noisy (high ZCR bursts); voiced vowels are harmonic
        # with a steadier, lower ZCR.
        avg_crossings = np.mean(crossings)

        # Low burstiness together with low noisiness reads as sustained
        # speech, laughter, or background humming rather than a cough.
        if burstiness < 0.015 and avg_crossings < 0.05:
            return False, "Audio detected as speaking, laughing, or steady noise. Please record a clear cough.", 0.95

        # Passed the basic heuristic screen.
        return True, "Valid audio signal detected.", 0.85

    except Exception as e:
        # Any feature-extraction failure is reported as a validation error.
        return False, f"Validation error: {str(e)}", 0.0
|
utils/audit_mislabels.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Audit the merged embedding dataset for likely mislabels: flag samples
labeled 'sick' that the v5 model classifies as healthy with high
confidence, and map Coughvid-derived flags back to their source files."""
import numpy as np
import tensorflow as tf
import os

# Paths
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
mapping_path = os.path.join(base_dir, "hear_embeddings_coughvid", "processed_paths.txt")


def clean_y(y):
    """Map string labels ('sick' -> 1, anything else -> 0) to float32;
    numeric label arrays are cast through unchanged."""
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)


# Load both datasets and merge them (no shuffle, indices stay traceable).
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
y_all = np.concatenate([y1, y2], axis=0)

# Indices [0, len(X1)) come from the original dataset; [len(X1), ...) come
# from Coughvid, in the same order as the lines of processed_paths.txt.
# Derived from the data rather than the previous hard-coded 6824 so the
# mapping stays correct if either dataset grows.
coughvid_offset = len(X1)

# Load Model
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
model = tf.keras.models.load_model(model_path)

# Predict over the whole corpus.
print(f"Auditing {len(X_all)} samples for mislabels...")
preds = model.predict(X_all, batch_size=128).flatten()

# Flag SICK samples the model is VERY sure are healthy:
# label == 1 but predicted probability < 0.1.
sick_labels = (y_all == 1)
confused_sick_indices = np.where(sick_labels & (preds < 0.1))[0]

# Map Coughvid sub-indices back to their source audio paths.
coughvid_paths = []
if os.path.exists(mapping_path):
    with open(mapping_path, 'r') as f:
        coughvid_paths = [line.strip() for line in f]

print(f"\nFound {len(confused_sick_indices)} Sick samples the model is SURE are Healthy.")

# Build a human-readable report of the top 15 flags.
audit_report = "--- DATA AUDIT: SICK SAMPLES FLAGGED AS HEALTHY ---\n"
audit_report += f"Total Samples Audited: {len(X_all)}\n"
audit_report += f"Total Flags: {len(confused_sick_indices)}\n\n"

results_to_show = confused_sick_indices[:15]
for idx in results_to_show:
    confidence_as_healthy = (1 - preds[idx]) * 100
    if idx >= coughvid_offset:  # From Coughvid
        sub_idx = idx - coughvid_offset
        if sub_idx < len(coughvid_paths):
            path = coughvid_paths[sub_idx]
            audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: {os.path.basename(path)}\n"
        else:
            audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: Unknown (Coughvid index {sub_idx})\n"
    else:
        audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: Unknown (Original Dataset)\n"

with open("audit_mislabels.txt", "w") as f:
    f.write(audit_report)
print(audit_report)
|
utils/audit_mislabels.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--- DATA AUDIT: SICK SAMPLES FLAGGED AS HEALTHY ---
|
| 2 |
+
Total Samples Audited: 11092
|
| 3 |
+
Total Flags: 213
|
| 4 |
+
|
| 5 |
+
Index 4 | Model Confidence: 90.3% (Healthy) | File: Unknown (Original Dataset)
|
| 6 |
+
Index 122 | Model Confidence: 91.2% (Healthy) | File: Unknown (Original Dataset)
|
| 7 |
+
Index 165 | Model Confidence: 93.8% (Healthy) | File: Unknown (Original Dataset)
|
| 8 |
+
Index 211 | Model Confidence: 93.1% (Healthy) | File: Unknown (Original Dataset)
|
| 9 |
+
Index 255 | Model Confidence: 90.7% (Healthy) | File: Unknown (Original Dataset)
|
| 10 |
+
Index 270 | Model Confidence: 98.4% (Healthy) | File: Unknown (Original Dataset)
|
| 11 |
+
Index 332 | Model Confidence: 98.1% (Healthy) | File: Unknown (Original Dataset)
|
| 12 |
+
Index 472 | Model Confidence: 90.9% (Healthy) | File: Unknown (Original Dataset)
|
| 13 |
+
Index 475 | Model Confidence: 94.0% (Healthy) | File: Unknown (Original Dataset)
|
| 14 |
+
Index 477 | Model Confidence: 93.2% (Healthy) | File: Unknown (Original Dataset)
|
| 15 |
+
Index 486 | Model Confidence: 92.0% (Healthy) | File: Unknown (Original Dataset)
|
| 16 |
+
Index 501 | Model Confidence: 90.8% (Healthy) | File: Unknown (Original Dataset)
|
| 17 |
+
Index 594 | Model Confidence: 92.1% (Healthy) | File: Unknown (Original Dataset)
|
| 18 |
+
Index 630 | Model Confidence: 90.1% (Healthy) | File: Unknown (Original Dataset)
|
| 19 |
+
Index 631 | Model Confidence: 92.3% (Healthy) | File: Unknown (Original Dataset)
|
utils/extract_elite_samples.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
# Add project root to path
|
| 8 |
+
PROJECT_ROOT = r"c:\Users\ASUS\lung_ai_project"
|
| 9 |
+
if PROJECT_ROOT not in sys.path:
|
| 10 |
+
sys.path.append(PROJECT_ROOT)
|
| 11 |
+
|
| 12 |
+
from utils.hear_extractor import HeARExtractor
|
| 13 |
+
|
| 14 |
+
# --- Config ---
|
| 15 |
+
META_PATH = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\metadata_compiled.csv"
|
| 16 |
+
AUDIO_ROOT = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized"
|
| 17 |
+
OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hear_embeddings_elite"
|
| 18 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 19 |
+
|
| 20 |
+
def _locate_audio(folder, uuid):
    """Return the first existing audio path for *uuid* inside *folder*.

    Tries the 'cv_' prefix used by the organized dataset first, then the
    bare uuid, across the known extensions. Returns None when no file
    matches. (Replaces four near-identical inline search loops, one of
    which also forgot to update its `found` flag.)
    """
    for prefix in ("cv_", ""):
        for ext in (".webm", ".wav", ".ogg"):
            candidate = os.path.join(folder, prefix + uuid + ext)
            if os.path.exists(candidate):
                return candidate
    return None


def main():
    """Select high-quality ('elite') Coughvid samples via metadata filters
    and extract HeAR embeddings for them, saving X/y arrays to OUTPUT_DIR."""
    print("Identifying Elite Samples...")
    df = pd.read_csv(META_PATH)

    # SICK (Elite Match): confident cough detection plus acceptable SNR.
    elite_sick_uuids = df[(df['status'].isin(['COVID-19', 'sick'])) & (df['cough_detected'] > 0.8) & (df['SNR'] > 5)]['uuid'].tolist()

    # HEALTHY (Elite Match): very confident cough detection.
    elite_healthy_uuids = df[(df['status'] == 'healthy') & (df['cough_detected'] > 0.95)]['uuid'].tolist()

    print(f"Total Eligible Elite Sick: {len(elite_sick_uuids)}")
    print(f"Total Eligible Elite Healthy: {len(elite_healthy_uuids)}")

    # Cap each class at 1,000 samples to bound extraction time.
    # Map UUIDs to actual audio paths; missing files are silently skipped.
    all_tasks = []
    for label, uuids in (('sick', elite_sick_uuids[:1000]),
                         ('healthy', elite_healthy_uuids[:1000])):
        folder = os.path.join(AUDIO_ROOT, label)
        for uuid in uuids:
            path = _locate_audio(folder, uuid)
            if path is not None:
                all_tasks.append((path, label))

    print(f"Starting Elite Extraction (Total: {len(all_tasks)} samples)...")

    extractor = HeARExtractor()
    features = []
    labels = []

    for path, label in tqdm(all_tasks):
        try:
            emb = extractor.extract(path)
            if emb is not None:
                features.append(emb)
                labels.append(label)
        except Exception:
            # Best-effort: skip files that fail to decode or embed.
            continue

    np.save(os.path.join(OUTPUT_DIR, "X_elite.npy"), np.array(features))
    np.save(os.path.join(OUTPUT_DIR, "y_elite.npy"), np.array(labels))
    print(f"Elite Data Saved: {len(features)} samples.")


if __name__ == "__main__":
    main()
|
utils/extract_hybrid_v5.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
import librosa
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
# Add project root to path
|
| 8 |
+
PROJECT_ROOT = r"c:\Users\ASUS\lung_ai_project"
|
| 9 |
+
if PROJECT_ROOT not in sys.path:
|
| 10 |
+
sys.path.append(PROJECT_ROOT)
|
| 11 |
+
|
| 12 |
+
from utils.hear_extractor import HeARExtractor
|
| 13 |
+
|
| 14 |
+
# --- Config ---
|
| 15 |
+
AUDIO_SEARCH_PATHS = [
|
| 16 |
+
r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized\sick",
|
| 17 |
+
r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized\healthy",
|
| 18 |
+
r"c:\Users\ASUS\lung_ai_project\data\cough",
|
| 19 |
+
r"c:\Users\ASUS\lung_ai_project\data\coswara\coswara_data\kaggle_data"
|
| 20 |
+
]
|
| 21 |
+
OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hybrid_features"
|
| 22 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 23 |
+
|
| 24 |
+
def extract_traditional_features(y, sr):
    """Build a 16-dim classical feature vector for a waveform: the 13 mean
    MFCCs followed by mean spectral centroid, mean zero-crossing rate, and
    mean RMS energy."""
    # 13 MFCC coefficients, averaged across frames.
    mfcc_means = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)

    # Scalar summaries: brightness, noisiness, and energy.
    scalar_feats = [
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        np.mean(librosa.feature.zero_crossing_rate(y)),
        np.mean(librosa.feature.rms(y=y)),
    ]

    return np.concatenate([mfcc_means, scalar_feats])
|
| 42 |
+
|
| 43 |
+
def main():
    """Build the hybrid feature dataset (HeAR embedding + 16 classical
    features per sample), balanced to at most 500 sick / 500 healthy."""
    print("Finding audio files...")
    all_files = []
    for path in AUDIO_SEARCH_PATHS:
        if not os.path.exists(path): continue
        if "kaggle_data" in path: # Coswara structure
            # Coswara labels live in separate metadata, so this branch is
            # deliberately a no-op for now: files are walked but never added.
            for pid in os.listdir(path):
                p_dir = os.path.join(path, pid)
                for f in ["cough-heavy.wav", "cough-shallow.wav", "cough.wav"]:
                    f_path = os.path.join(p_dir, f)
                    if os.path.exists(f_path):
                        # Determine label - need metadata for Coswara
                        # For now, let's just focus on coughvid where folders are explicit
                        pass
        else:
            # NOTE(review): any search path without 'sick' in it is labeled
            # 'healthy' — this includes the generic data\cough folder; confirm
            # those recordings are really from healthy subjects.
            label = 'sick' if 'sick' in path else 'healthy'
            for f in os.listdir(path):
                if f.endswith(('.wav', '.webm', '.ogg')):
                    all_files.append((os.path.join(path, f), label))

    print(f"Total files found (Coughvid + Cough): {len(all_files)}")

    # To speed up, we'll limit to 1000 balanced samples (500 each);
    # shuffle first so the cap takes a random subset, not the first files.
    import random
    random.shuffle(all_files)

    sick_list = [f for f in all_files if f[1] == 'sick'][:500]
    healthy_list = [f for f in all_files if f[1] == 'healthy'][:500]
    balanced_files = sick_list + healthy_list

    print(f"Processing {len(balanced_files)} balanced samples for Hybrid model...")

    extractor = HeARExtractor()

    hybrid_features = []
    labels = []

    for audio_path, label in tqdm(balanced_files):
        try:
            # 1. HeAR Embedding (skip the file if extraction yields nothing)
            emb = extractor.extract(audio_path)
            if emb is None: continue

            # 2. Traditional Features from the 16 kHz waveform
            y, sr = librosa.load(audio_path, sr=16000)
            trad = extract_traditional_features(y, sr)

            # Combine embedding + classical features into one vector
            combined = np.concatenate([emb, trad])
            hybrid_features.append(combined)
            labels.append(label)

        except Exception as e:
            # Best-effort: skip any file that fails to decode or embed.
            continue

    X_hybrid = np.array(hybrid_features)
    y_hybrid = np.array(labels)

    np.save(os.path.join(OUTPUT_DIR, "X_hybrid.npy"), X_hybrid)
    np.save(os.path.join(OUTPUT_DIR, "y_hybrid.npy"), y_hybrid)
    print(f"Saved {len(X_hybrid)} hybrid samples to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()
|
utils/hear_extractor.py
CHANGED
|
@@ -1,5 +1,20 @@
|
|
| 1 |
import os
|
| 2 |
import sys
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import librosa
|
| 5 |
import tensorflow as tf
|
|
@@ -26,8 +41,8 @@ class HeARExtractor:
|
|
| 26 |
from huggingface_hub import login
|
| 27 |
login(token=token)
|
| 28 |
|
| 29 |
-
# Use
|
| 30 |
-
model_cache_path = os.path.join(
|
| 31 |
|
| 32 |
# Download model files manually to avoid symlink issues on Windows
|
| 33 |
# and ignore unrelated folders (like event_detector) to speed up download
|
|
@@ -94,7 +109,8 @@ class HeARExtractor:
|
|
| 94 |
return np.mean(embeddings, axis=0)
|
| 95 |
|
| 96 |
except Exception as e:
|
| 97 |
-
|
|
|
|
| 98 |
return None
|
| 99 |
|
| 100 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
+
import glob

# --- FFmpeg Path Fix for Windows ---
# Audio decoding of .webm/.ogg needs ffmpeg discoverable on PATH.
# 1. Try the known WinGet install location on this machine.
FFMPEG_DIR = r"C:\Users\ASUS\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-8.1-full_build\bin"
if not os.path.exists(FFMPEG_DIR):
    # 2. Fall back to a dynamic search of the WinGet packages folder,
    #    which covers other ffmpeg versions/builds.
    winget_base = os.path.join(os.environ.get("LOCALAPPDATA", ""), "Microsoft", "WinGet", "Packages")
    ffmpeg_bins = glob.glob(os.path.join(winget_base, "*ffmpeg*", "**", "bin"), recursive=True)
    if ffmpeg_bins:
        FFMPEG_DIR = ffmpeg_bins[0]

# Use .get(): PATH can be absent in some spawned environments, and
# os.environ["PATH"] would then raise KeyError at import time.
if os.path.exists(FFMPEG_DIR) and FFMPEG_DIR not in os.environ.get("PATH", ""):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + FFMPEG_DIR
|
| 17 |
+
|
| 18 |
import numpy as np
|
| 19 |
import librosa
|
| 20 |
import tensorflow as tf
|
|
|
|
| 41 |
from huggingface_hub import login
|
| 42 |
login(token=token)
|
| 43 |
|
| 44 |
+
# Use a local folder in the project for the model cache
|
| 45 |
+
model_cache_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "hear_model_cache")
|
| 46 |
|
| 47 |
# Download model files manually to avoid symlink issues on Windows
|
| 48 |
# and ignore unrelated folders (like event_detector) to speed up download
|
|
|
|
| 109 |
return np.mean(embeddings, axis=0)
|
| 110 |
|
| 111 |
except Exception as e:
|
| 112 |
+
error_msg = str(e) if str(e) else "Unknown error (check if FFmpeg is working or file is corrupted)"
|
| 113 |
+
print(f"Extraction error ({os.path.basename(audio_input) if isinstance(audio_input, str) else 'array'}): {error_msg}")
|
| 114 |
return None
|
| 115 |
|
| 116 |
if __name__ == "__main__":
|
utils/inspect_labels.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inspect and normalize the label arrays from the two embedding sets,
then verify that they merge into a single clean integer array."""
import numpy as np
import os

base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")

y1 = np.load(path1_y)
y2 = np.load(path2_y)

print(f"y1 dtype: {y1.dtype}, unique: {np.unique(y1)}")
print(f"y2 dtype: {y2.dtype}, unique: {np.unique(y2)}")


def _coerce_binary(y, name):
    """Map string labels ('sick' -> 1, anything else -> 0) to int32 and
    report the conversion; numeric labels are just cast. Factored out of
    the previously duplicated per-array blocks."""
    if y.dtype.kind in ['U', 'S']:
        converted = np.where(y == 'sick', 1, 0).astype(np.int32)
        print(f"{name} converted dtype: {converted.dtype}, unique: {np.unique(converted)}")
        return converted
    return y.astype(np.int32)


y1_converted = _coerce_binary(y1, "y1")
y2_converted = _coerce_binary(y2, "y2")

# Sanity-check the merged label vector.
y_merged = np.concatenate([y1_converted, y2_converted])
print(f"y_merged dtype: {y_merged.dtype}, unique: {np.unique(y_merged)}")
|
utils/process_coughvid.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Resumable extraction of HeAR embeddings for additional 'sick' Coughvid
recordings, with periodic checkpoints, a duplicate-file tracker, and a
Windows stay-awake guard."""
import os
import numpy as np
from tqdm import tqdm
import sys
import ctypes  # To keep Windows awake

# Add project root to path to allow absolute imports
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from utils.hear_extractor import HeARExtractor

# --- Configuration ---
DATA_ROOT = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized"
OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hear_embeddings_coughvid"
CHECKPOINT_EVERY = 50  # save partial arrays every N successful extractions
TARGET_SICK_COUNT = 2500  # total sick samples to accumulate across runs


def run_extraction():
    """Extract embeddings for not-yet-processed sick files until
    TARGET_SICK_COUNT is reached, checkpointing so the run can resume."""
    # Keep Windows awake during extraction (no-op failure elsewhere).
    try:
        # ES_CONTINUOUS (0x80000000) | ES_SYSTEM_REQUIRED (0x00000001)
        ctypes.windll.kernel32.SetThreadExecutionState(0x80000000 | 0x00000001)
        print(">>> Windows 'Stay Awake' mode enabled.")
    except Exception:
        print(">>> Warning: Could not enable 'Stay Awake' mode.")

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    features_path = os.path.join(OUTPUT_DIR, "X_coughvid.npy")
    labels_path = os.path.join(OUTPUT_DIR, "y_coughvid.npy")

    # Load existing arrays if available so the run resumes, not restarts.
    features = []
    labels = []
    if os.path.exists(features_path) and os.path.exists(labels_path):
        print("Loading existing embeddings...")
        features = list(np.load(features_path))
        labels = list(np.load(labels_path))

    current_sick_count = sum(1 for l in labels if l == 'sick')
    print(f"Current Sick Samples: {current_sick_count}")

    if current_sick_count >= TARGET_SICK_COUNT:
        print(f"Goal reached! You already have {current_sick_count} sick samples.")
        return

    # Tracker file of already-processed paths, used to avoid duplicates.
    tracker_path = os.path.join(OUTPUT_DIR, "processed_paths.txt")
    processed_paths = set()
    if os.path.exists(tracker_path):
        with open(tracker_path, 'r') as f:
            processed_paths = set(line.strip() for line in f)

    # Collect only SICK files not handled in a previous run.
    folder = os.path.join(DATA_ROOT, 'sick')
    all_sick_files = []
    if os.path.exists(folder):
        for f in os.listdir(folder):
            full_path = os.path.join(folder, f)
            if f.endswith(('.webm', '.ogg', '.wav')) and full_path not in processed_paths:
                all_sick_files.append(full_path)

    remaining_to_goal = TARGET_SICK_COUNT - current_sick_count
    files_to_process = all_sick_files[:remaining_to_goal]

    print(f"Extraction Target: {len(files_to_process)} more sick samples.")

    if not files_to_process:
        print("No more unique sick files found to process.")
        return

    # Initialize Extractor
    print("Initializing HeAR Extractor...")
    extractor = HeARExtractor()

    try:
        count = 0
        with open(tracker_path, 'a') as tracker:
            for path in tqdm(files_to_process, desc="Extracting Sick"):
                emb = extractor.extract(path)
                if emb is not None:
                    features.append(emb)
                    labels.append('sick')
                    tracker.write(path + "\n")
                    count += 1

                    # Periodic checkpoint: a crash loses at most
                    # CHECKPOINT_EVERY samples of work.
                    if count % CHECKPOINT_EVERY == 0 and count > 0:
                        np.save(features_path, np.array(features))
                        np.save(labels_path, np.array(labels))

        # Final save
        np.save(features_path, np.array(features))
        np.save(labels_path, np.array(labels))
        print(f"Success! Now you have {sum(1 for l in labels if l == 'sick')} sick samples in total.")

    except KeyboardInterrupt:
        print("\nStopping and saving progress...")
        np.save(features_path, np.array(features))
        np.save(labels_path, np.array(labels))
        print("Progress saved.")
    finally:
        # Reset Windows sleep settings to normal
        try:
            ctypes.windll.kernel32.SetThreadExecutionState(0x80000000)
        except Exception:
            pass


# BUG FIX: this main guard was duplicated, so run_extraction() executed
# twice per script invocation. Keep exactly one entry point.
if __name__ == "__main__":
    run_extraction()
|
utils/purge_and_retrain.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import tensorflow as tf
|
| 3 |
+
import os
|
| 4 |
+
from sklearn.model_selection import StratifiedKFold, train_test_split
|
| 5 |
+
from sklearn.utils import shuffle
|
| 6 |
+
|
| 7 |
+
# Paths
|
| 8 |
+
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
|
| 9 |
+
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
|
| 10 |
+
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
|
| 11 |
+
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
|
| 12 |
+
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
|
| 13 |
+
|
| 14 |
+
def clean_y(y):
    """Normalize a label array to float32 {0.0, 1.0}.

    String labels are mapped 'sick' -> 1, anything else -> 0; numeric
    labels are passed through as float32.

    FIX: byte-string arrays (dtype kind 'S') compared against the unicode
    scalar 'sick' fail elementwise in NumPy, which silently mapped every
    sample to 0. Decoding via astype(str) handles both 'U' and 'S' dtypes.
    """
    if y.dtype.kind in ('U', 'S'):
        # astype(str) decodes byte strings so b'sick' matches 'sick' too.
        return np.where(y.astype(str) == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)
|
| 18 |
+
|
| 19 |
+
# Load both embedding sets, normalize their labels, and merge them into
# a single training pool.
feature_chunks = []
label_chunks = []
for feat_path, label_path in ((path1_x, path1_y), (path2_x, path2_y)):
    feature_chunks.append(np.load(feat_path))
    label_chunks.append(clean_y(np.load(label_path)))
X = np.concatenate(feature_chunks, axis=0).astype(np.float32)
y = np.concatenate(label_chunks, axis=0)
|
| 24 |
+
|
| 25 |
+
# Load current best model
# NOTE(review): absolute Windows path — machine-specific; confirm the v5
# model file exists before running.
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
model = tf.keras.models.load_model(model_path)

# Predict all
# Score every sample with the current model; .flatten() turns the (N, 1)
# sigmoid output into a 1-D probability vector aligned with y.
print("Finding unreliable samples for the GREAT PURGE...")
preds = model.predict(X, batch_size=128).flatten()

# PURGE CRITERIA:
# 1. Label says SICK (1), but model is >85% sure it is HEALTHY (pred < 0.15)
# 2. Label says HEALTHY (0), but model is >85% sure it is SICK (pred > 0.85)
purge_mask = ((y == 1) & (preds < 0.15)) | ((y == 0) & (preds > 0.85))

# KEEP ONLY RELIABLE SAMPLES
# ~purge_mask keeps every sample whose label and model prediction do not
# strongly contradict each other (assumed mislabels are dropped).
X_clean = X[~purge_mask]
y_clean = y[~purge_mask]

print(f"Purged {np.sum(purge_mask)} 'Contradictory' samples.")
print(f"Clean Dataset size: {len(X_clean)}")

# STRICT BALANCING (Match minority class size)
# Undersample the majority class so both classes contribute exactly
# min_size examples to the final training set.
h_idx = np.where(y_clean == 0)[0]
s_idx = np.where(y_clean == 1)[0]
min_size = min(len(h_idx), len(s_idx))
print(f"Balancing Clean Set to {min_size} samples per class.")

np.random.seed(42)  # fixed seed so the undersampling is reproducible
bal_h = np.random.choice(h_idx, min_size, replace=False)
bal_s = np.random.choice(s_idx, min_size, replace=False)
idx = np.concatenate([bal_h, bal_s])
np.random.shuffle(idx)  # mix classes so batches are not class-ordered

X_final = X_clean[idx]
y_final = y_clean[idx]
|
| 59 |
+
|
| 60 |
+
# Final Training with Higher Capacity
# Fresh MLP head trained from scratch on the purged, balanced embeddings.
final_model = tf.keras.Sequential([
    tf.keras.layers.Dense(1024, activation='relu', input_shape=(X_final.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),

    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),

    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

final_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])
# EarlyStopping restores the best weights, so the model saved below is the
# checkpoint with the highest validation accuracy, not the last epoch.
cb = [tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)]

# Train on 85/15 split.
# FIX: stratify on the labels so the validation split preserves the 50/50
# class balance — without it, val_accuracy (the early-stopping/selection
# metric) is measured on a randomly skewed split.
X_t, X_v, y_t, y_v = train_test_split(
    X_final, y_final, test_size=0.15, random_state=42, stratify=y_final
)

print("Starting FINAL TRAINING on Purged & Balanced dataset...")
history = final_model.fit(X_t, y_t, validation_data=(X_v, y_v), epochs=150, batch_size=32, callbacks=cb)

# Save
# NOTE(review): absolute Windows path — machine-specific.
save_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v6_final.h5"
final_model.save(save_path)
print(f"Purged Model saved to {save_path}")

best_acc = max(history.history['val_accuracy'])
print(f"Best Purged-Set Accuracy: {best_acc*100:.2f}%")
|
utils/test_overlap.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import os
|
| 3 |
+
from scipy.spatial.distance import cdist
|
| 4 |
+
|
| 5 |
+
# Paths
# NOTE(review): absolute Windows paths — this analysis script assumes the
# same machine-specific data layout as purge_and_retrain.py.
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
# Embeddings + labels from the optimized HeAR extraction run.
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
# Embeddings + labels extracted from the COUGHVID dataset.
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
|
| 11 |
+
|
| 12 |
+
# Load and clean
|
| 13 |
+
def clean_y(y):
    """Normalize a label array to float32 {0.0, 1.0}.

    String labels are mapped 'sick' -> 1, anything else -> 0; numeric
    labels are passed through as float32.

    FIX: byte-string arrays (dtype kind 'S') compared against the unicode
    scalar 'sick' fail elementwise in NumPy, which silently mapped every
    sample to 0. Decoding via astype(str) handles both 'U' and 'S' dtypes.
    """
    if y.dtype.kind in ('U', 'S'):
        # astype(str) decodes byte strings so b'sick' matches 'sick' too.
        return np.where(y.astype(str) == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)
|
| 17 |
+
|
| 18 |
+
# Merge the two embedding datasets into a single pool for the overlap
# analysis below.
dataset_files = [(path1_x, path1_y), (path2_x, path2_y)]
X = np.concatenate(
    [np.load(fx) for fx, _ in dataset_files], axis=0
).astype(np.float32)
y = np.concatenate(
    [clean_y(np.load(fy)) for _, fy in dataset_files], axis=0
)
|
| 23 |
+
|
| 24 |
+
# Randomly sample some sick and healthy embeddings to check how close the
# two classes sit in HeAR embedding space (cosine distance).
sick_indices = np.where(y == 1)[0]
healthy_indices = np.where(y == 0)[0]

# Pick a small subset to check distances (full 11k cdist is too slow).
# FIX: cap the sample at the smallest class size so np.random's choice
# does not raise when a class has fewer than 500 samples, and use a
# seeded Generator so the analysis is reproducible.
rng = np.random.default_rng(42)
n_samples = min(500, len(sick_indices), len(healthy_indices))
subs_s = rng.choice(sick_indices, n_samples, replace=False)
subs_h = rng.choice(healthy_indices, n_samples, replace=False)

X_s = X[subs_s]
X_h = X[subs_h]

# Pairwise cosine distances between the sampled sick and healthy points.
dist_matrix = cdist(X_s, X_h, 'cosine')

# Find how many sick samples are extremely close to healthy ones.
very_close = np.where(dist_matrix < 0.05)
print(f"Overlap Analysis (Cosine Distance < 0.05): {len(very_close[0])} pairs found.")

avg_dist_sick_to_healthy = np.mean(dist_matrix)
print(f"Average Distance (Sick to Healthy): {avg_dist_sick_to_healthy:.4f}")

# FIX: exclude the diagonal (self-distances, all exactly zero) from the
# within-class averages — including them biased the means downward and
# made the within/between comparison unfair.
off_diag = ~np.eye(n_samples, dtype=bool)

# Check distances within sick
dist_within_sick = cdist(X_s, X_s, 'cosine')
avg_dist_within_sick = np.mean(dist_within_sick[off_diag])
print(f"Average Distance (Within Sick): {avg_dist_within_sick:.4f}")

# Check distances within healthy
dist_within_healthy = cdist(X_h, X_h, 'cosine')
avg_dist_within_healthy = np.mean(dist_within_healthy[off_diag])
print(f"Average Distance (Within Healthy): {avg_dist_within_healthy:.4f}")
|