MeysamSh committed on
Commit
40a5641
·
1 Parent(s): ec12e16

update 3 tabs

Browse files
Files changed (1) hide show
  1. app.py +215 -18
app.py CHANGED
@@ -1,13 +1,25 @@
1
  import os
2
  import gradio as gr
3
  import hashlib
 
 
 
4
  from datetime import datetime
5
- from huggingface_hub import HfApi
6
  from pathlib import Path
7
 
 
 
 
 
 
 
 
 
8
  # --- CONFIGURATION ---
9
  DATASET_REPO_ID = "MeysamSh/ENSIMSoundDataCollection"
10
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
11
 
12
  # Admin Credentials
13
  ADMIN_USERNAME = "admin"
@@ -22,6 +34,11 @@ api = HfApi()
22
 
23
  # --- LOGIC FUNCTIONS ---
24
 
 
 
 
 
 
25
  def verify_user(email):
26
  if not email: return gr.update(visible=False), "⚠️ Enter email."
27
  clean_email = email.strip().lower()
@@ -31,42 +48,116 @@ def verify_user(email):
31
  return gr.update(visible=False), "🚫 Not authorized."
32
 
33
  def upload_data(email, label, audio_path):
34
- if not audio_path: return "⚠️ No audio recorded."
35
  try:
36
  clean_email = email.strip().lower()
37
  email_index = AUTHORIZED_USERS.index(clean_email) if clean_email in AUTHORIZED_USERS else "unknown"
 
 
 
 
 
 
38
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
39
  unique_filename = f"{email_index}_{timestamp}.wav"
40
 
41
  # Upload Audio
42
  api.upload_file(path_or_fileobj=audio_path, path_in_repo=f"data/{unique_filename}",
43
  repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
 
44
  # Upload Metadata
45
- meta_content = f"user_id,label,file_name,timestamp\n{clean_email},{label},{unique_filename},{timestamp}"
46
  api.upload_file(path_or_fileobj=meta_content.encode(), path_in_repo=f"metadata/meta_{email_index}_{timestamp}.csv",
47
  repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
48
- return f"πŸŽ‰ Success! Audio saved as {label}."
49
- except Exception as e: return f"❌ Error: {str(e)}"
 
 
50
 
51
  # --- ADMIN LOGIC ---
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def get_stats():
54
- """Helper to calculate stats from repository files"""
55
  try:
56
- files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
57
- audio_files = [f for f in files if f.startswith("data/") and f.endswith(".wav")]
58
 
59
- # Extract user indices from filenames like 'data/0_20260511.wav'
 
 
 
 
60
  user_indices = set()
61
  for f in audio_files:
62
- filename = f.split("/")[-1] # get '0_20260511.wav'
63
  user_id = filename.split("_")[0]
64
  user_indices.add(user_id)
65
 
66
- stats_text = f"πŸ“Š **Stats:** {len(audio_files)} recordings from {len(user_indices)} unique contributors."
67
- return audio_files, stats_text
68
- except:
69
- return [], "⚠️ Could not retrieve stats."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  def admin_login(user, pwd):
72
  pwd_hash = hashlib.sha256(pwd.encode()).hexdigest()
@@ -88,6 +179,79 @@ def delete_selected_file(file_path):
88
  return f"πŸ—‘οΈ Deleted {file_path}. {stats_text}", gr.update(choices=audio_files, value=None)
89
  except Exception as e: return f"❌ Error: {str(e)}", gr.update()
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  # --- UI ---
92
 
93
  with gr.Blocks() as demo:
@@ -102,11 +266,26 @@ with gr.Blocks() as demo:
102
  login_status = gr.Markdown("Waiting for login...")
103
 
104
  with gr.Column(visible=False) as recording_zone:
105
- label_input = gr.Radio(choices=["Engine", "Environmental", "Mechanical"], label="Category")
106
  audio_input = gr.Audio(label="Record (40s)", sources=["microphone"], type="filepath")
107
  submit_btn = gr.Button("πŸš€ Submit", variant="primary")
108
- final_status = gr.Textbox(label="Status", interactive=False)
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  # ADMIN TAB
111
  with gr.TabItem("Administration"):
112
  with gr.Row():
@@ -119,14 +298,20 @@ with gr.Blocks() as demo:
119
  admin_stats_display = gr.Markdown("")
120
 
121
  with gr.Column(visible=False) as admin_panel:
122
- gr.Separator()
123
  file_dropdown = gr.Dropdown(label="Select File to Remove", choices=[])
124
  delete_btn = gr.Button("πŸ—‘οΈ Delete Selected File", variant="stop")
125
  delete_status = gr.Textbox(label="Delete Progress")
126
 
 
 
 
 
 
 
 
127
  # --- EVENT HANDLERS ---
128
  login_btn.click(verify_user, [email_input], [recording_zone, login_status])
129
- submit_btn.click(upload_data, [email_input, label_input, audio_input], final_status)
130
 
131
  admin_login_btn.click(
132
  admin_login,
@@ -140,5 +325,17 @@ with gr.Blocks() as demo:
140
  [delete_status, file_dropdown]
141
  )
142
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  if __name__ == "__main__":
144
  demo.launch(theme=gr.themes.Soft())
 
1
  import os
2
  import gradio as gr
3
  import hashlib
4
+ import pandas as pd
5
+ import zipfile
6
+ import tempfile
7
  from datetime import datetime
8
+ from huggingface_hub import HfApi, hf_hub_download,CommitOperationDelete
9
  from pathlib import Path
10
 
11
+
# Category codes (kept as strings) mapped to the labels shown in the UI.
label_codes = {"1": "Engine", "2": "Environmental", "3": "Mechanical"}
# Reverse lookup: display label -> category code.
label_decoder = {name: code for code, name in label_codes.items()}
18
+
# --- CONFIGURATION ---
# Hugging Face dataset repository that receives the recordings and metadata.
DATASET_REPO_ID = "MeysamSh/ENSIMSoundDataCollection"
# Hub access token, read from the environment (None when the variable is unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Salt mixed into coupon hashes. It is supplied only through the environment so
# that its value never appears in the source; do not hard-code it here.
COUPON_SALT = os.environ.get("COUPON_SALT")
23
 
24
  # Admin Credentials
25
  ADMIN_USERNAME = "admin"
 
34
 
35
  # --- LOGIC FUNCTIONS ---
36
 
def generate_coupon(filename):
    """Return the coupon code a student keeps for one uploaded file.

    The code is the first ten hexadecimal digits, upper-cased, of the SHA-1
    digest of ``filename`` immediately followed by ``COUPON_SALT``.
    """
    salted_name = f"{filename}{COUPON_SALT}"
    digest = hashlib.sha1(salted_name.encode()).hexdigest()
    return digest[:10].upper()
40
+
41
+
42
  def verify_user(email):
43
  if not email: return gr.update(visible=False), "⚠️ Enter email."
44
  clean_email = email.strip().lower()
 
48
  return gr.update(visible=False), "🚫 Not authorized."
49
 
def upload_data(email, label, audio_path):
    """Upload one recording and its one-row metadata CSV to the dataset repo.

    Args:
        email: Contributor e-mail; stripped and lower-cased before use.
        label: Category selected in the UI, written verbatim to the metadata.
        audio_path: Local path of the recorded audio file.

    Returns:
        A ``(status_message, coupon)`` pair. ``coupon`` is an empty string
        whenever the upload did not complete.
    """
    if not audio_path:
        return "⚠️ No audio.", ""
    if not label:
        # Without a category the metadata row would contain the literal text
        # "None" in its label column, so refuse before anything is uploaded.
        return "⚠️ No category selected.", ""
    try:
        clean_email = email.strip().lower()
        email_index = AUTHORIZED_USERS.index(clean_email) if clean_email in AUTHORIZED_USERS else "unknown"

        # Number this submission by counting the user's existing recordings;
        # the value is stored in the metadata "order" column.
        # The token is passed explicitly, as every other Hub call here does.
        all_files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        user_submissions = [f for f in all_files if f.startswith(f"data/{email_index}_")]
        submission_number = len(user_submissions) + 1

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        unique_filename = f"{email_index}_{timestamp}.wav"

        # Upload Audio
        api.upload_file(path_or_fileobj=audio_path, path_in_repo=f"data/{unique_filename}",
                        repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        coupon = generate_coupon(unique_filename)

        # Upload Metadata (header line + a single data row)
        meta_content = (
            "user_id,label,file_name,time,order\n"
            f"{clean_email},{label},{unique_filename},{timestamp},{submission_number}"
        )
        api.upload_file(path_or_fileobj=meta_content.encode(), path_in_repo=f"metadata/meta_{email_index}_{timestamp}.csv",
                        repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)

        return f"πŸŽ‰ Success! Submission #{submission_number} saved.", coupon
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return f"❌ Error: {str(e)}", ""
76
 
77
  # --- ADMIN LOGIC ---
78
 
def delete_all_files(confirm):
    """Delete every file under ``data/`` and ``metadata/`` in a single commit.

    Args:
        confirm: Truthy once the admin has ticked the confirmation checkbox.

    Returns:
        A ``(status_message, dropdown_update)`` pair for the admin panel.
    """
    if not confirm:
        return "⚠️ You must check the 'Confirm' box to delete everything.", gr.update()

    try:
        repo_files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")

        # Only the two folders this app manages are touched.
        managed_prefixes = ("data/", "metadata/")
        doomed = [path for path in repo_files if path.startswith(managed_prefixes)]
        if not doomed:
            return "ℹ️ The dataset is already empty.", gr.update(choices=[])

        # One commit carrying all deletions instead of one API call per file.
        count = len(doomed)
        api.create_commit(
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            operations=[CommitOperationDelete(path_in_repo=path) for path in doomed],
            commit_message=f"Admin: Bulk delete of {count} files",
            token=HF_TOKEN,
        )
        return f"πŸ’₯ Success! Deleted {count} files. Dataset is now clean.", gr.update(choices=[], value=None)
    except Exception as e:
        return f"❌ Bulk delete failed: {e!s}", gr.update()
110
+
def get_stats():
    """Summarise the dataset repository for the admin panel.

    Returns:
        A ``(audio_files, stats_markdown)`` pair: the repository paths of all
        ``.wav`` files under ``data/`` and a Markdown report with the total
        number of recordings, the number of distinct contributor prefixes and
        a per-category count. On failure the list is empty and the text
        carries the error message.
    """
    try:
        # A single listing serves both the audio and the metadata filters.
        repo_files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")

        audio_files = [p for p in repo_files if p.startswith("data/") and p.endswith(".wav")]
        metadata_files = [p for p in repo_files if p.startswith("metadata/") and p.endswith(".csv")]
        print(f"Found {len(audio_files)} audio files and {len(metadata_files)} metadata files in the repository.")

        # Contributors are identified by the file-name part before the first "_".
        contributors = {p.split("/")[-1].split("_")[0] for p in audio_files}

        # Known categories start at zero; unexpected labels are added as met.
        category_counts = {label_codes[code]: 0 for code in ("1", "2", "3")}

        for meta_path in metadata_files:
            try:
                local_copy = hf_hub_download(repo_id=DATASET_REPO_ID, filename=meta_path, repo_type="dataset", token=HF_TOKEN)
                with open(local_copy, 'r') as handle:
                    rows = handle.readlines()
                if len(rows) <= 1:
                    continue  # header only, nothing to count
                # The label is the second comma-separated field of the data row.
                label = rows[1].split(",")[1].strip()
                category_counts[label] = category_counts.get(label, 0) + 1
            except Exception:
                # Best effort: a file that fails to download or parse is skipped.
                print(f"⚠️ Failed to process metadata file: {meta_path}")
                continue

        report_parts = [
            "### πŸ“Š Dataset Statistics\n",
            f"**Total Recordings:** {len(audio_files)} \n",
            f"**Unique Contributors:** {len(contributors)} \n\n",
            "**Category Breakdown:**\n",
        ]
        report_parts.extend(f"- **{cat}:** {count} files\n" for cat, count in category_counts.items())
        stats_md = "".join(report_parts)

        return audio_files, stats_md
    except Exception as e:
        return [], f"⚠️ Error retrieving stats: {e!s}"
160
+
161
 
162
  def admin_login(user, pwd):
163
  pwd_hash = hashlib.sha256(pwd.encode()).hexdigest()
 
179
  return f"πŸ—‘οΈ Deleted {file_path}. {stats_text}", gr.update(choices=audio_files, value=None)
180
  except Exception as e: return f"❌ Error: {str(e)}", gr.update()
181
 
def access_dataset_zip(email, coupons_str):
    """Verify a student's coupons and bundle the data they unlock into a ZIP.

    A coupon is accepted only when it equals ``generate_coupon(file_name)``
    for a metadata row whose ``user_id`` matches ``email``. Repeated coupons
    count once. The number of accepted coupons then drives the access rules:

    * odd ``order``  -> training sample (audio + label), included only when
      ``order`` does not exceed the number of accepted coupons;
    * even ``order`` -> test sample (audio only), always included.

    Args:
        email: E-mail address of the requesting student.
        coupons_str: Comma-separated coupon codes; case and spaces are ignored.

    Returns:
        A ``(zip_path, status_message)`` pair. ``zip_path`` is ``None``
        whenever no archive was produced.
    """
    if not email or not coupons_str:
        return None, "⚠️ Please provide your email and coupons."

    clean_email = email.strip().lower()
    # A set, so pasting the same coupon several times cannot inflate the count.
    submitted = {c.strip().upper() for c in coupons_str.split(",") if c.strip()}
    if not submitted:
        return None, "⚠️ No valid coupons provided."

    tmp_dir = None
    try:
        all_files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        meta_files = [f for f in all_files if f.startswith("metadata/")]

        # Pass 1: read each metadata row once, collecting the bundle candidates
        # and the coupons that genuinely belong to this e-mail address.
        entries = []
        earned = set()
        for m_file in meta_files:
            local_meta = hf_hub_download(repo_id=DATASET_REPO_ID, filename=m_file, repo_type="dataset", token=HF_TOKEN)
            try:
                row = pd.read_csv(local_meta).iloc[0]
                order = int(row['order'])
                audio_filename = str(row['file_name'])
                label = row['label']
                owner = str(row['user_id']).strip().lower()
            except (KeyError, IndexError, ValueError):
                # Empty files or rows lacking the expected columns are skipped
                # rather than allowed to abort the whole request.
                print(f"Skipping unreadable metadata file: {m_file}")
                continue
            entries.append((order, audio_filename, label))
            if owner == clean_email:
                earned.add(generate_coupon(audio_filename))

        num_coupons = len(submitted & earned)
        if num_coupons == 0:
            return None, "⚠️ No valid coupons provided."

        # Pass 2: build the archive in a fresh temporary directory.
        tmp_dir = tempfile.mkdtemp()
        zip_path = os.path.join(tmp_dir, f"ENSIM_Data_{num_coupons}_coupons.zip")
        print(f"Creating ZIP at: {zip_path}")

        training_metadata = []
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for order, audio_filename, label in entries:
                is_training = order % 2 != 0
                if is_training and order > num_coupons:
                    continue  # training sample not unlocked by this coupon count

                folder = "training_set" if is_training else "test_set"
                audio_local = hf_hub_download(repo_id=DATASET_REPO_ID, filename=f"data/{audio_filename}", repo_type="dataset", token=HF_TOKEN)
                zipf.write(audio_local, arcname=f"{folder}/{audio_filename}")
                if is_training:
                    # Labels are shipped for training samples only.
                    training_metadata.append({"file": audio_filename, "label": label})

            if training_metadata:
                # Written straight into the archive; no intermediate CSV file.
                labels_csv = pd.DataFrame(training_metadata).to_csv(index=False)
                zipf.writestr("training_set/labels.csv", labels_csv)

        return zip_path, f"βœ… Successfully bundled data for {num_coupons} coupons."
    except Exception as e:
        # UI boundary: report the failure in the status box and drop any
        # partially written archive instead of leaking the temp directory.
        if tmp_dir is not None:
            import shutil
            shutil.rmtree(tmp_dir, ignore_errors=True)
        return None, f"❌ Error creating ZIP: {str(e)}"
254
+
255
  # --- UI ---
256
 
257
  with gr.Blocks() as demo:
 
266
  login_status = gr.Markdown("Waiting for login...")
267
 
268
  with gr.Column(visible=False) as recording_zone:
269
+ label_input = gr.Radio(choices=[label_codes["1"], label_codes["2"], label_codes["3"]], label="Category")
270
  audio_input = gr.Audio(label="Record (40s)", sources=["microphone"], type="filepath")
271
  submit_btn = gr.Button("πŸš€ Submit", variant="primary")
272
+ res_msg = gr.Textbox(label="Status", interactive=False)
273
+ coupon_display = gr.Textbox(label="🎟️ YOUR COUPON (Save this!)", interactive=False)
274
 
275
+ # 2. DATASET ACCESS TAB
276
+ with gr.TabItem("2. Dataset Access"):
277
+ gr.Markdown("""
278
+ ### πŸ”“ Unlock Your Data Partition
279
+ - **Training Data:** You receive Training samples (Audio + Label) proportional to your coupons.
280
+ - **Test Data:** You receive the full global Test set (Audio Only) to evaluate your models.
281
+ """)
282
+ acc_email = gr.Textbox(label="Email")
283
+ coupons_input = gr.Textbox(label="Coupons List (comma separated)", placeholder="C1, C2, C3...")
284
+ download_btn = gr.Button("πŸ“¦ Generate Data ZIP", variant="primary")
285
+
286
+ status_out = gr.Textbox(label="Status")
287
+ file_out = gr.File(label="Download Your Data")
288
+
289
  # ADMIN TAB
290
  with gr.TabItem("Administration"):
291
  with gr.Row():
 
298
  admin_stats_display = gr.Markdown("")
299
 
300
  with gr.Column(visible=False) as admin_panel:
 
301
  file_dropdown = gr.Dropdown(label="Select File to Remove", choices=[])
302
  delete_btn = gr.Button("πŸ—‘οΈ Delete Selected File", variant="stop")
303
  delete_status = gr.Textbox(label="Delete Progress")
304
 
305
+ gr.Markdown("### 🧨 Danger Zone")
306
+ confirm_check = gr.Checkbox(label="I understand this will permanently delete ALL recordings and metadata.")
307
+ delete_all_btn = gr.Button("πŸ”₯ DELETE ALL DATASET FILES", variant="stop")
308
+
309
+ delete_status = gr.Textbox(label="Status Log")
310
+
311
+
312
  # --- EVENT HANDLERS ---
313
  login_btn.click(verify_user, [email_input], [recording_zone, login_status])
314
+ submit_btn.click(upload_data, [email_input, label_input, audio_input], [res_msg, coupon_display])
315
 
316
  admin_login_btn.click(
317
  admin_login,
 
325
  [delete_status, file_dropdown]
326
  )
327
 
328
+ download_btn.click(
329
+ fn=access_dataset_zip,
330
+ inputs=[acc_email, coupons_input],
331
+ outputs=[file_out, status_out]
332
+ )
333
+
334
+ delete_all_btn.click(
335
+ fn=delete_all_files,
336
+ inputs=[confirm_check],
337
+ outputs=[delete_status, file_dropdown]
338
+ )
339
+
340
  if __name__ == "__main__":
341
  demo.launch(theme=gr.themes.Soft())