MeysamSh committed on
Commit
78e0f7a
·
1 Parent(s): 40a5641

cleaning up, adding recording verification, and cutting samples into 2-second segments

Browse files
Files changed (1) hide show
  1. app.py +151 -82
app.py CHANGED
@@ -7,6 +7,10 @@ import tempfile
7
  from datetime import datetime
8
  from huggingface_hub import HfApi, hf_hub_download,CommitOperationDelete
9
  from pathlib import Path
 
 
 
 
10
 
11
 
12
  label_codes = {
@@ -19,7 +23,7 @@ label_decoder = {v: k for k, v in label_codes.items()}
19
  # --- CONFIGURATION ---
20
  DATASET_REPO_ID = "MeysamSh/ENSIMSoundDataCollection"
21
  HF_TOKEN = os.environ.get("HF_TOKEN")
22
- COUPON_SALT = os.environ.get("COUPON_SALT")#"ENSIM_2026_SECRET"
23
 
24
  # Admin Credentials
25
  ADMIN_USERNAME = "admin"
@@ -47,32 +51,92 @@ def verify_user(email):
47
  return gr.update(visible=True), f"βœ… Access Granted: {clean_email}"
48
  return gr.update(visible=False), "🚫 Not authorized."
49
 
 
50
  def upload_data(email, label, audio_path):
51
- if not audio_path: return "⚠️ No audio.", ""
 
 
 
 
 
 
 
52
  try:
53
- clean_email = email.strip().lower()
54
- email_index = AUTHORIZED_USERS.index(clean_email) if clean_email in AUTHORIZED_USERS else "unknown"
55
-
56
- # Determine if this is file #1, #2, etc. for this user to assign Training/Test
57
- all_files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
58
- user_submissions = [f for f in all_files if f.startswith(f"data/{email_index}_")]
59
- submission_number = len(user_submissions) + 1
60
 
61
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
62
- unique_filename = f"{email_index}_{timestamp}.wav"
 
 
63
 
64
- # Upload Audio
65
- api.upload_file(path_or_fileobj=audio_path, path_in_repo=f"data/{unique_filename}",
66
- repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
67
- coupon = generate_coupon(unique_filename)
68
- # Upload Metadata
69
- meta_content = f"user_id,label,file_name,time,order\n{clean_email},{label},{unique_filename},{timestamp},{submission_number}"
70
- api.upload_file(path_or_fileobj=meta_content.encode(), path_in_repo=f"metadata/meta_{email_index}_{timestamp}.csv",
71
- repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
 
 
 
 
 
 
 
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
- return f"πŸŽ‰ Success! Submission #{submission_number} saved.", coupon
75
- except Exception as e: return f"❌ Error: {str(e)}", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  # --- ADMIN LOGIC ---
78
 
@@ -180,72 +244,72 @@ def delete_selected_file(file_path):
180
  except Exception as e: return f"❌ Error: {str(e)}", gr.update()
181
 
182
  def access_dataset_zip(email, coupons_str):
183
- """Verifies coupons and creates a ZIP of accessible data."""
184
  if not email or not coupons_str:
185
  return None, "⚠️ Please provide your email and coupons."
186
 
187
- # Clean coupons input
188
  coupons_list = [c.strip().upper() for c in coupons_str.split(",") if c.strip()]
189
  num_coupons = len(coupons_list)
190
 
191
  if num_coupons == 0:
192
  return None, "⚠️ No valid coupons provided."
193
 
194
- # try:
195
- all_files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
196
- meta_files = [f for f in all_files if f.startswith("metadata/")]
197
-
198
- # Create a temporary directory for the ZIP
199
- tmp_dir = tempfile.mkdtemp()
200
- zip_path = os.path.join(tmp_dir, f"ENSIM_Data_{num_coupons}_coupons.zip")
201
- print(f"Creating ZIP at: {zip_path}")
202
-
203
- training_metadata = []
204
-
205
- with zipfile.ZipFile(zip_path, 'w') as zipf:
206
- for m_file in meta_files:
207
- # Get metadata for each entry
208
- print(f"Processing metadata file: {m_file}")
209
- local_meta = hf_hub_download(repo_id=DATASET_REPO_ID, filename=m_file, repo_type="dataset", token=HF_TOKEN)
210
- print(f"Downloaded metadata to: {local_meta}")
211
- df = pd.read_csv(local_meta)
212
- row = df.iloc[0]
213
-
214
- order = int(row['order'])
215
- audio_filename = row['file_name']
216
- audio_repo_path = f"data/{audio_filename}"
217
- print(f"Processing {audio_filename} with order {order} for coupon access...")
218
-
219
- # ACCESS RULES:
220
- # 1. Training Set (Odd): Only if student has enough coupons
221
- # 2. Test Set (Even): Always included if they have at least 1 coupon
222
-
223
- is_training = order % 2 != 0
224
-
225
- if is_training and order <= num_coupons:
226
- # Download and add to Training folder
227
- audio_local = hf_hub_download(repo_id=DATASET_REPO_ID, filename=audio_repo_path, repo_type="dataset", token=HF_TOKEN)
228
- print(f"Adding to ZIP: {audio_filename} (Label: {row['label']})")
229
- zipf.write(audio_local, arcname=f"training_set/{audio_filename}")
230
- print(f"Added {audio_filename} to ZIP under training_set/")
231
- training_metadata.append({"file": audio_filename, "label": row['label']})
232
-
233
- elif not is_training:
234
- # Download and add to Test folder (No labels added to zip)
235
- audio_local = hf_hub_download(repo_id=DATASET_REPO_ID, filename=audio_repo_path, repo_type="dataset", token=HF_TOKEN)
236
- print(f"Adding to ZIP: {audio_filename} (Test Set)")
237
- zipf.write(audio_local, arcname=f"test_set/{audio_filename}")
238
- print(f"Added {audio_filename} to ZIP under test_set/")
239
-
240
- # Add Training Metadata CSV to the ZIP if data exists
241
- if training_metadata:
242
- print(f"Adding {len(training_metadata)} training metadata entries to ZIP...")
243
- train_df = pd.DataFrame(training_metadata)
244
- csv_path = os.path.join(tmp_dir, "training_labels.csv")
245
- train_df.to_csv(csv_path, index=False)
246
- zipf.write(csv_path, arcname="training_set/labels.csv")
247
-
248
- return zip_path, f"βœ… Successfully bundled data for {num_coupons} coupons."
 
 
249
 
250
  # except Exception as e:
251
  # return None, f"❌ Error creating ZIP: {str(e)}"
@@ -255,11 +319,11 @@ def access_dataset_zip(email, coupons_str):
255
  # --- UI ---
256
 
257
  with gr.Blocks() as demo:
258
- gr.Markdown("# πŸŽ™οΈ ENSIM Sound Data Platform")
259
 
260
  with gr.Tabs():
261
  # STUDENT TAB
262
- with gr.TabItem("Student Recording"):
263
  with gr.Row():
264
  email_input = gr.Textbox(label="Email", placeholder="test")
265
  login_btn = gr.Button("Verify", variant="primary")
@@ -273,7 +337,7 @@ with gr.Blocks() as demo:
273
  coupon_display = gr.Textbox(label="🎟️ YOUR COUPON (Save this!)", interactive=False)
274
 
275
  # 2. DATASET ACCESS TAB
276
- with gr.TabItem("2. Dataset Access"):
277
  gr.Markdown("""
278
  ### πŸ”“ Unlock Your Data Partition
279
  - **Training Data:** You receive Training samples (Audio + Label) proportional to your coupons.
@@ -311,7 +375,12 @@ with gr.Blocks() as demo:
311
 
312
  # --- EVENT HANDLERS ---
313
  login_btn.click(verify_user, [email_input], [recording_zone, login_status])
314
- submit_btn.click(upload_data, [email_input, label_input, audio_input], [res_msg, coupon_display])
 
 
 
 
 
315
 
316
  admin_login_btn.click(
317
  admin_login,
 
7
  from datetime import datetime
8
  from huggingface_hub import HfApi, hf_hub_download,CommitOperationDelete
9
  from pathlib import Path
10
+ import librosa
11
+ import soundfile as sf
12
+ import tempfile
13
+ import numpy as np
14
 
15
 
16
  label_codes = {
 
23
  # --- CONFIGURATION ---
24
  DATASET_REPO_ID = "MeysamSh/ENSIMSoundDataCollection"
25
  HF_TOKEN = os.environ.get("HF_TOKEN")
26
+ COUPON_SALT = os.environ.get("COUPON_SALT")
27
 
28
  # Admin Credentials
29
  ADMIN_USERNAME = "admin"
 
51
  return gr.update(visible=True), f"βœ… Access Granted: {clean_email}"
52
  return gr.update(visible=False), "🚫 Not authorized."
53
 
54
+
55
  def upload_data(email, label, audio_path):
56
+ # --- Energy Threshold Setting ---
57
+ ENERGY_THRESHOLD = 0.02 # Adjust this: 0.01 is very sensitive, 0.05 is strict
58
+
59
+ if audio_path is None:
60
+ return "⚠️ Please record or upload a sound file.", None, gr.update(), ""
61
+ if not label:
62
+ return "⚠️ Please select a category label.", gr.update(), gr.update(), ""
63
+
64
  try:
65
+ y, sr = librosa.load(audio_path, sr=None)
66
+ duration = librosa.get_duration(y=y, sr=sr)
 
 
 
 
 
67
 
68
+ if duration < 2.0:
69
+ return f"⚠️ Sound too short ({duration:.1f}s).", gr.update(), gr.update(), ""
70
+
71
+ raw_segments = []
72
 
73
+ # --- SPLITTING LOGIC ---
74
+ if duration < 5.0:
75
+ raw_segments.append(y[:int(2 * sr)])
76
+ elif duration >= 7.0:
77
+ start_sample = int(3 * sr)
78
+ remaining_audio = y[start_sample:]
79
+ window_size = int(2 * sr)
80
+ for i in range(0, len(remaining_audio) - window_size + 1, window_size):
81
+ raw_segments.append(remaining_audio[i : i + window_size])
82
+ else:
83
+ raw_segments.append(y[:int(2 * sr)])
84
+
85
+ # --- ENERGY CALCULATION & FILTERING ---
86
+ valid_segments = []
87
+ rejected_count = 0
88
 
89
+ for seg in raw_segments:
90
+ # Calculate RMS energy: sqrt(mean(x^2))
91
+ rms = np.sqrt(np.mean(seg**2))
92
+
93
+ if rms >= ENERGY_THRESHOLD:
94
+ valid_segments.append(seg)
95
+ else:
96
+ rejected_count += 1
97
+
98
+ if not valid_segments:
99
+ return f"❌ Rejected: {rejected_count} segments were too quiet. Please record closer to the source.", None, gr.update(), ""
100
+
101
+ # --- UPLOAD PROCESS ---
102
+ clean_email = email.strip().lower()
103
+ email_index = AUTHORIZED_USERS.index(clean_email) if clean_email in AUTHORIZED_USERS else "unknown"
104
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
105
 
106
+ coupons = []
107
+ for idx, seg in enumerate(valid_segments):
108
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_seg:
109
+ sf.write(tmp_seg.name, seg, sr)
110
+ seg_filename = f"{email_index}_{timestamp}_seg{idx}.wav"
111
+ coupon = generate_coupon(seg_filename)
112
+ coupons.append(coupon)
113
+
114
+ api.upload_file(
115
+ path_or_fileobj=tmp_seg.name,
116
+ path_in_repo=f"data/{seg_filename}",
117
+ repo_id=DATASET_REPO_ID,
118
+ repo_type="dataset",
119
+ token=HF_TOKEN
120
+ )
121
+
122
+ meta_content = f"user_id,label,file_name,time,order\n{clean_email},{label},{seg_filename},{timestamp},{idx+1}"
123
+ api.upload_file(
124
+ path_or_fileobj=meta_content.encode(),
125
+ path_in_repo=f"metadata/meta_{email_index}_{timestamp}_seg{idx}.csv",
126
+ repo_id=DATASET_REPO_ID,
127
+ repo_type="dataset",
128
+ token=HF_TOKEN
129
+ )
130
+ os.unlink(tmp_seg.name)
131
+
132
+ status_msg = f"πŸŽ‰ Success! {len(valid_segments)} samples accepted."
133
+ if rejected_count > 0:
134
+ status_msg += f" ({rejected_count} quiet segments discarded)."
135
+
136
+ return status_msg, None, gr.update(value=None), ", ".join(coupons)
137
+
138
+ except Exception as e:
139
+ return f"❌ Error: {str(e)}", gr.update(), gr.update(), ""
140
 
141
  # --- ADMIN LOGIC ---
142
 
 
244
  except Exception as e: return f"❌ Error: {str(e)}", gr.update()
245
 
246
  def access_dataset_zip(email, coupons_str):
 
247
  if not email or not coupons_str:
248
  return None, "⚠️ Please provide your email and coupons."
249
 
 
250
  coupons_list = [c.strip().upper() for c in coupons_str.split(",") if c.strip()]
251
  num_coupons = len(coupons_list)
252
 
253
  if num_coupons == 0:
254
  return None, "⚠️ No valid coupons provided."
255
 
256
+ try:
257
+ all_files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
258
+ meta_files = [f for f in all_files if f.startswith("metadata/")]
259
+
260
+ tmp_dir = tempfile.mkdtemp()
261
+ zip_path = os.path.join(tmp_dir, f"ENSIM_Data_Collection.zip")
262
+
263
+ # This list will hold rows for our single combined CSV
264
+ compiled_metadata = []
265
+
266
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
267
+ for m_file in meta_files:
268
+ local_meta = hf_hub_download(repo_id=DATASET_REPO_ID, filename=m_file, repo_type="dataset", token=HF_TOKEN)
269
+ df = pd.read_csv(local_meta)
270
+ row = df.iloc[0]
271
+
272
+ order = int(row['order'])
273
+ audio_filename = row['file_name']
274
+ audio_repo_path = f"data/{audio_filename}"
275
+ is_training = order % 2 != 0
276
+
277
+ # --- ACCESS LOGIC ---
278
+ # 1. Training files (Odd): include only if within coupon count
279
+ if is_training and order <= num_coupons:
280
+ audio_local = hf_hub_download(repo_id=DATASET_REPO_ID, filename=audio_repo_path, repo_type="dataset", token=HF_TOKEN)
281
+ zipf.write(audio_local, arcname=f"training_set/{audio_filename}")
282
+
283
+ # Add to the compiled metadata list
284
+ compiled_metadata.append({
285
+ "wav_filename": audio_filename,
286
+ "label": row['label']
287
+ })
288
+
289
+ # 2. Test files (Even): Always included (Labels omitted from compiled CSV)
290
+ elif not is_training:
291
+ audio_local = hf_hub_download(repo_id=DATASET_REPO_ID, filename=audio_repo_path, repo_type="dataset", token=HF_TOKEN)
292
+ zipf.write(audio_local, arcname=f"test_set/{audio_filename}")
293
+
294
+ # Add to compiled metadata but set label to HIDDEN or empty
295
+ compiled_metadata.append({
296
+ "wav_filename": audio_filename,
297
+ "label": "HIDDEN"
298
+ })
299
+
300
+ # --- CREATE THE SINGLE CONSOLIDATED CSV ---
301
+ if compiled_metadata:
302
+ master_df = pd.DataFrame(compiled_metadata)
303
+ master_csv_path = os.path.join(tmp_dir, "metadata_summary.csv")
304
+ # Save only the columns requested
305
+ master_df.to_csv(master_csv_path, index=False, columns=["wav_filename", "label"])
306
+ # Place it at the root of the ZIP for easy access
307
+ zipf.write(master_csv_path, arcname="metadata_summary.csv")
308
+
309
+ return zip_path, f"βœ… ZIP created with {len(compiled_metadata)} total references."
310
+
311
+ except Exception as e:
312
+ return None, f"❌ Error: {str(e)}"
313
 
314
  # except Exception as e:
315
  # return None, f"❌ Error creating ZIP: {str(e)}"
 
319
  # --- UI ---
320
 
321
  with gr.Blocks() as demo:
322
+ gr.Markdown("# πŸŽ™οΈ Sound Data Platform")
323
 
324
  with gr.Tabs():
325
  # STUDENT TAB
326
+ with gr.TabItem("Dataset Collection"):
327
  with gr.Row():
328
  email_input = gr.Textbox(label="Email", placeholder="test")
329
  login_btn = gr.Button("Verify", variant="primary")
 
337
  coupon_display = gr.Textbox(label="🎟️ YOUR COUPON (Save this!)", interactive=False)
338
 
339
  # 2. DATASET ACCESS TAB
340
+ with gr.TabItem("Dataset Access"):
341
  gr.Markdown("""
342
  ### πŸ”“ Unlock Your Data Partition
343
  - **Training Data:** You receive Training samples (Audio + Label) proportional to your coupons.
 
375
 
376
  # --- EVENT HANDLERS ---
377
  login_btn.click(verify_user, [email_input], [recording_zone, login_status])
378
+
379
+ submit_btn.click(
380
+ fn=upload_data,
381
+ inputs=[email_input, label_input, audio_input],
382
+ outputs=[res_msg, audio_input, label_input, coupon_display]
383
+ )
384
 
385
  admin_login_btn.click(
386
  admin_login,