ashokpoudel committed on
Commit
ce26fdd
·
verified ·
1 Parent(s): 0ff5b4f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +444 -407
app.py CHANGED
@@ -6,40 +6,29 @@ import random
6
  import datetime
7
  import uuid
8
  import json
9
- from huggingface_hub import HfApi, create_repo
10
- from datasets import Dataset
11
- import soundfile as sf # Added for explicit use in save_recording
12
- import shutil # Added for explicit use in save_recording
13
-
14
- # Configuration
 
 
15
  SAMPLE_PROMPTS = [
16
- "नमस्ते, मेरो नाम __ हो। म नेपाली बोल्छु।",
17
- "आज मौसम धेरै राम्रो छ।",
18
- "नेपाल एक सुन्दर देश हो जहाँ हिमाल, पहाड तराई छन्।",
19
- "काठमाडौं नेपालको राजधानी हो।",
20
- " आज बिहान स्कूल जाँदैछु।",
21
- "नेपाली भाषा बोल्ने मानिसहरू विश्वभर छन्।",
22
- "हिमालमा हिउँ परिरहेको छ।",
23
- "मलाई नेपाली खाना धेरै मन पर्छ।",
24
- "बुद्ध नेपालमा जन्मिएका थिए।",
25
- "सगरमाथा विश्वको सबैभन्दा अग्लो हिमाल हो।"
26
  ]
27
-
28
  EMOTIONS = ["सामान्य (Neutral)", "खुसी (Happy)", "दुःखी (Sad)", "रिसाएको (Angry)", "अचम्मित (Surprised)"]
29
  GENDERS = ["पुरुष (Male)", "महिला (Female)", "अन्य (Other)", "भन्न चाहन्न (Prefer not to say)"]
30
  AGE_GROUPS = ["18 भन्दा कम", "18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
31
-
32
- # Nepal-specific data
33
  REGIONS = [
34
- "प्रदेश १ (Province 1)",
35
- "मधेश प्रदेश (Madhesh Province)",
36
- "बागमती प्रदेश (Bagmati Province)",
37
- "गण्डकी प्रदेश (Gandaki Province)",
38
- "लुम्बिनी प्रदेश (Lumbini Province)",
39
- "कर्णाली प्रदेश (Karnali Province)",
40
- "सुदूरपश्चिम प्रदेश (Sudurpashchim Province)"
41
  ]
42
-
43
  COMMON_LAST_NAMES = {
44
  "पहाडी (Pahadi)": ["शर्मा (Sharma)", "पौडेल (Poudel)", "खनाल (Khanal)", "अधिकारी (Adhikari)", "भट्टराई (Bhattarai)", "अन्य पहाडी (Other Pahadi)"],
45
  "नेवार (Newar)": ["श्रेष्ठ (Shrestha)", "प्रधान (Pradhan)", "महर्जन (Maharjan)", "बज्राचार्य (Bajracharya)", "अन्य नेवार (Other Newar)"],
@@ -54,488 +43,536 @@ COMMON_LAST_NAMES = {
54
  "अन्य (Other)": ["अन्य (Other)"]
55
  }
56
 
57
- # --- Directory and File Paths ---
58
- # These paths are relative to where app.py is run.
59
- # In a Hugging Face Space, this means they are within the Space's file system.
60
- RECORDINGS_DIR = "recordings"
61
- METADATA_DIR = "metadata"
62
- RATINGS_DIR = "ratings"
63
- METADATA_FILE = os.path.join(METADATA_DIR, "metadata.csv")
64
- RATINGS_FILE = os.path.join(RATINGS_DIR, "ratings.json")
 
 
 
 
 
 
65
 
66
  # --- Initialization ---
67
  def initialize_data_storage():
68
- """Creates directories and initial files if they don't exist."""
69
  os.makedirs(RECORDINGS_DIR, exist_ok=True)
70
  os.makedirs(METADATA_DIR, exist_ok=True)
71
  os.makedirs(RATINGS_DIR, exist_ok=True)
72
 
73
- if not os.path.exists(METADATA_FILE):
74
  pd.DataFrame(columns=[
75
- "id", "text", "audio_path", "gender", "age_group", "ethnicity",
76
- "last_name", "region", "emotion", "timestamp", "recording_type"
77
- ]).to_csv(METADATA_FILE, index=False)
 
78
 
79
- if not os.path.exists(RATINGS_FILE):
80
- with open(RATINGS_FILE, 'w') as f:
81
  json.dump({}, f)
82
 
83
- initialize_data_storage() # Call initialization at script start
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  # --- Core Functions ---
86
- def save_recording(audio, text, gender, age_group, ethnicity, last_name, region, emotion, recording_type):
87
- """Save the recording and metadata"""
88
- if audio is None:
89
  return "कृपया पहिले रेकर्डिङ गर्नुहोस्। (Please record audio first)", None
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  recording_id = str(uuid.uuid4())
92
  timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
93
  audio_filename_relative = f"{recording_id}.wav"
94
- audio_filepath_in_space = os.path.join(RECORDINGS_DIR, audio_filename_relative)
95
 
96
  try:
97
- if isinstance(audio, tuple): # If it's a tuple (sr, data) from gr.Audio(type="numpy")
98
- sr, data = audio
99
- sf.write(audio_filepath_in_space, data, sr)
100
- elif isinstance(audio, str) and os.path.exists(audio): # If it's a path from gr.Audio(type="filepath")
101
- shutil.copy(audio, audio_filepath_in_space)
102
- # Gradio might place temp files elsewhere, so we ensure it's in our recordings dir
103
  else:
104
- return "अडियो फाइल बचत गर्न सकिएन। (Could not save audio file. Invalid audio format.)", None
105
  except Exception as e:
106
- return f"अडियो फाइल बचत गर्दा त्रुटि भयो: {e} (Error saving audio file: {e})", None
107
-
108
 
109
- metadata_df = pd.read_csv(METADATA_FILE)
110
- new_row = pd.DataFrame([{
111
- "id": recording_id,
112
- "text": text,
113
- "audio_path": audio_filepath_in_space, # Store path relative to space root
114
- "gender": gender,
115
- "age_group": age_group,
116
- "ethnicity": ethnicity,
117
- "last_name": last_name,
118
- "region": region,
119
- "emotion": emotion,
120
- "timestamp": timestamp,
121
- "recording_type": recording_type
122
- }])
123
-
124
- updated_metadata = pd.concat([metadata_df, new_row], ignore_index=True)
125
- updated_metadata.to_csv(METADATA_FILE, index=False)
126
-
127
- with open(RATINGS_FILE, 'r+') as f:
128
- ratings = json.load(f)
129
- ratings[recording_id] = {
130
- "upvotes": 0, "downvotes": 0,
131
- "quality_score": 0, "quality_votes": 0,
132
- "correctness_score": 0, "correctness_votes": 0
133
- }
134
- f.seek(0)
135
- json.dump(ratings, f, indent=2)
136
- f.truncate()
137
 
138
- return f"रेकर्डिङ सफलतापूर्वक सुरक्षित गरियो! ID: {recording_id} (Recording saved successfully!)", None # Return None to clear audio input
 
 
 
 
 
 
 
 
 
 
139
 
140
- def get_random_prompt():
141
- return random.choice(SAMPLE_PROMPTS)
142
 
143
- def get_ethnicity_based_last_names(ethnicity):
144
- return gr.Dropdown.update(choices=COMMON_LAST_NAMES.get(ethnicity, COMMON_LAST_NAMES["अन्य (Other)"]))
 
 
 
 
 
 
 
 
 
 
 
145
 
146
- def vote_recording(recording_id, vote_type, vote_value_str): # vote_value comes as string from slider
147
- if not recording_id:
148
- return "कृपया पहिले समीक्षा गर्न रेकर्डिङ चयन गर्नुहोस्। (Please select a recording to review first.)"
149
- if not os.path.exists(RATINGS_FILE):
150
- return "रेटिङ फाइल भेटिएन। (Rating file not found.)"
151
 
152
- try:
153
- vote_value = int(vote_value_str) # Convert to int for quality/correctness
154
- except ValueError:
155
- if vote_type in ["quality", "correctness"]:
156
- return "अमान्य मत मान। (Invalid vote value.)"
157
- vote_value = 0 # For upvote/downvote
158
 
159
  try:
160
- with open(RATINGS_FILE, 'r+') as f:
161
  ratings = json.load(f)
162
- if recording_id not in ratings:
163
- return "रेकर्डिङ आईडी भेटिएन। (Recording ID not found.)"
164
-
165
  rec_ratings = ratings[recording_id]
166
- if vote_type == "upvote":
167
- rec_ratings["upvotes"] += 1
168
- elif vote_type == "downvote":
169
- rec_ratings["downvotes"] += 1
170
  elif vote_type == "quality":
171
- current_score = rec_ratings["quality_score"]
172
- current_votes = rec_ratings["quality_votes"]
173
- new_votes = current_votes + 1
174
- new_score = ((current_score * current_votes) + vote_value) / new_votes
175
- rec_ratings["quality_score"] = new_score
176
  rec_ratings["quality_votes"] = new_votes
177
  elif vote_type == "correctness":
178
- current_score = rec_ratings["correctness_score"]
179
- current_votes = rec_ratings["correctness_votes"]
180
- new_votes = current_votes + 1
181
- new_score = ((current_score * current_votes) + vote_value) / new_votes
182
- rec_ratings["correctness_score"] = new_score
183
  rec_ratings["correctness_votes"] = new_votes
184
- else:
185
- return "अमान्य मतदान प्रकार। (Invalid vote type.)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- f.seek(0)
188
- json.dump(ratings, f, indent=2)
189
- f.truncate()
190
- return "मतदान सफलतापूर्वक दर्ता गरियो! (Vote registered successfully!)"
191
- except Exception as e:
192
- return f"मतदान दर्ता गर्दा त्रुटि: {str(e)} (Error registering vote: {str(e)})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- def upload_to_huggingface(dataset_name, admin_password_attempt):
195
- """Upload the collected data to Hugging Face"""
196
- # --- Admin Password Check ---
 
197
  expected_admin_password = os.environ.get("ADMIN_UPLOAD_PASSWORD")
198
- hf_token_from_secret = os.environ.get("HF_TOKEN")
199
 
200
- if not expected_admin_password:
201
- return "त्रुटि: प्रशासक पासवर्ड स्पेस गोप्यमा कन्फिगर गरिएको छैन। (Error: Admin password not configured in Space secrets.)"
202
- if admin_password_attempt != expected_admin_password:
203
- return "त्रुटि: अपलोडका लागि अमान्य प्रशासक पासवर्ड। (Error: Invalid admin password for upload.)"
204
- if not hf_token_from_secret:
205
- return "त्रुटि: HF_TOKEN गोप्य स्पेस कन्फिगरेसनमा फेला परेन। अपलोड गर्न सकिँदैन। (Error: HF_TOKEN secret not found in Space configuration. Cannot upload.)"
206
- if not dataset_name or len(dataset_name.split('/')) != 2:
207
- return "त्रुटि: कृपया मान्य डेटासेट नाम 'username/repo_name' ढाँचामा प्रदान गर्नुहोस्। (Error: Please provide a valid dataset name in 'username/repo_name' format.)"
208
 
209
- if not os.path.exists(METADATA_FILE):
210
- return "कुनै मेटाडाटा फाइल भेटिएन। (No metadata file found.)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- try:
213
- api = HfApi(token=hf_token_from_secret)
214
- # Ensure repo exists, create if not. private=False for public dataset
215
- create_repo(repo_id=dataset_name, token=hf_token_from_secret, repo_type="dataset", exist_ok=True, private=False)
216
-
217
- metadata = pd.read_csv(METADATA_FILE)
218
- if len(metadata) == 0:
219
- return "कुनै डाटा भेटिएन। (No data to upload.)"
220
-
221
- with open(RATINGS_FILE, 'r') as f:
222
- ratings_data = json.load(f)
223
-
224
- metadata["upvotes"] = metadata["id"].apply(lambda x: ratings_data.get(x, {}).get("upvotes", 0))
225
- metadata["downvotes"] = metadata["id"].apply(lambda x: ratings_data.get(x, {}).get("downvotes", 0))
226
- metadata["quality_score"] = metadata["id"].apply(lambda x: ratings_data.get(x, {}).get("quality_score", 0))
227
- metadata["quality_votes"] = metadata["id"].apply(lambda x: ratings_data.get(x, {}).get("quality_votes", 0))
228
- metadata["correctness_score"] = metadata["id"].apply(lambda x: ratings_data.get(x, {}).get("correctness_score", 0))
229
- metadata["correctness_votes"] = metadata["id"].apply(lambda x: ratings_data.get(x, {}).get("correctness_votes", 0))
230
-
231
- # Prepare audio column for datasets library
232
- # The 'audio' column should contain dictionaries with 'path' and optionally 'bytes'
233
- # Here, we'll tell datasets to load from the paths we upload.
234
- audio_files_for_dataset = []
235
- for audio_path_in_space in metadata["audio_path"]:
236
- audio_files_for_dataset.append(
237
- {"path": os.path.join("audio", os.path.basename(audio_path_in_space))}
238
- )
239
-
240
- dataset_dict = metadata.to_dict(orient='list')
241
- dataset_dict['audio'] = audio_files_for_dataset # Add the audio column
242
-
243
- # Remove the local audio_path column as we now have the 'audio' dict column
244
- if 'audio_path' in dataset_dict:
245
- del dataset_dict['audio_path']
246
-
247
- hf_dataset = Dataset.from_dict(dataset_dict)
248
-
249
- # Push dataset metadata (e.g., data.jsonl or data.arrow/parquet files in the repo)
250
- hf_dataset.push_to_hub(repo_id=dataset_name) # token is implicitly used if HfApi was init with it or HF_TOKEN env var is set
251
-
252
- # Upload individual audio files
253
- # Create the audio folder in the dataset repo if it doesn't exist
254
  try:
255
- api.create_folder(
256
- repo_id=dataset_name,
257
- folder_path="audio", # Target folder in the dataset repo
258
- repo_type="dataset",
259
- exist_ok=True
260
- )
261
- except Exception as e:
262
- # Log this, but it's not critical if the folder already exists
263
- print(f"सूचना: अडियो फोल्डर सिर्जना गर्न सकिएन (यो पहिले नै अवस्थित हुन सक्छ): {e} (Info: Could not create audio folder (it might already exist): {e})")
264
-
265
- upload_count = 0
266
- for _, row in metadata.iterrows():
267
- local_audio_file = row["audio_path"] # This is like "recordings/uuid.wav"
268
- if os.path.exists(local_audio_file):
269
- # The path_in_repo should match what you put in the 'audio' column for datasets
270
- target_path_in_repo = os.path.join("audio", os.path.basename(local_audio_file))
 
 
 
 
271
  api.upload_file(
272
- path_or_fileobj=local_audio_file,
273
- path_in_repo=target_path_in_repo,
274
- repo_id=dataset_name,
275
- repo_type="dataset"
276
  )
277
- upload_count +=1
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
- return (f"डाटा हगिङफेसमा सफलतापूर्वक अपलोड गरियो! {upload_count} अडियो फाइलहरू अपलोड गरियो। "
280
- f"(Data successfully uploaded to Hugging Face at {dataset_name}. {upload_count} audio files uploaded.)")
281
 
282
- except Exception as e:
283
- import traceback
284
- tb_str = traceback.format_exc()
285
- return f"अपलोडको क्रममा त्रुटि (Error during upload):\n{str(e)}\n{tb_str}"
286
 
287
- def update_count():
288
- if os.path.exists(METADATA_FILE):
289
- try:
290
- metadata = pd.read_csv(METADATA_FILE)
291
- return f"हालसम्म {len(metadata)} रेकर्डिङहरू संकलन गरिएको छ। (Total recordings collected: {len(metadata)})"
292
- except pd.errors.EmptyDataError:
293
- return "हालसम्म ० रेकर्डिङहरू संकलन गरिएको छ। (Total recordings collected: 0)"
294
- return "कुनै रेकर्डिङ भेटिएन। (No recordings found.)"
295
-
296
- def list_recordings(num_items=10):
297
- if not os.path.exists(METADATA_FILE):
298
- return pd.DataFrame(columns=['id', 'text', 'ethnicity', 'region', 'timestamp'])
299
- try:
300
- metadata = pd.read_csv(METADATA_FILE)
301
- except pd.errors.EmptyDataError:
302
- return pd.DataFrame(columns=['id', 'text', 'ethnicity', 'region', 'timestamp'])
303
 
304
- if len(metadata) == 0:
305
- return pd.DataFrame(columns=['id', 'text', 'ethnicity', 'region', 'timestamp'])
306
 
307
- metadata['timestamp'] = pd.to_datetime(metadata['timestamp'], errors='coerce')
308
- sorted_metadata = metadata.sort_values('timestamp', ascending=False).head(int(num_items))
309
- display_df = sorted_metadata[['id', 'text', 'ethnicity', 'region', 'timestamp']].copy()
310
- display_df['timestamp'] = display_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M').fillna('N/A')
311
- return display_df.reset_index(drop=True)
312
 
313
- def get_recording_audio(recording_id):
314
- if not recording_id: return None, "कुनै रेकर्डिङ आईडी प्रदान गरिएको छैन। (No recording ID provided.)"
315
- if not os.path.exists(METADATA_FILE): return None, "मेटाडाटा फाइल भेटिएन। (Metadata file not found.)"
316
- try:
317
- metadata = pd.read_csv(METADATA_FILE)
318
- except pd.errors.EmptyDataError:
319
- return None, "मेटाडाटा खाली छ। (Metadata is empty.)"
320
 
321
- recording = metadata[metadata['id'] == recording_id]
322
- if len(recording) == 0: return None, "रेकर्डिङ भेटिएन। (Recording not found.)"
323
-
324
- audio_path = recording['audio_path'].iloc[0]
325
- text = recording['text'].iloc[0]
326
- if not os.path.exists(audio_path): return None, f"अडियो फाइल भेटिएन: {audio_path} (Audio file not found: {audio_path})"
327
- return audio_path, text
328
 
329
- def get_recording_ratings(recording_id):
330
- if not recording_id: return "रेकर्डिङ आईडी चयन गर्नुहोस्। (Select a Recording ID.)"
331
- if not os.path.exists(RATINGS_FILE): return "डाटा भेटिएन। (No ratings data found.)"
 
332
 
333
- with open(RATINGS_FILE, 'r') as f:
334
- ratings = json.load(f)
335
- if recording_id not in ratings: return "यस रेकर्डिङको लागि कुनै मूल्याङ्कन भेटिएन। (No ratings found for this recording.)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
- r = ratings[recording_id]
338
- upvotes = r.get("upvotes", 0)
339
- downvotes = r.get("downvotes", 0)
340
- quality = round(r.get("quality_score",0), 1) if r.get("quality_votes",0) > 0 else 0
341
- quality_votes = r.get("quality_votes",0)
342
- correctness = round(r.get("correctness_score",0), 1) if r.get("correctness_votes",0) > 0 else 0
343
- correctness_votes = r.get("correctness_votes",0)
344
 
345
- return f"""👍 Upvotes: {upvotes} | 👎 Downvotes: {downvotes}
346
- गुणस्तर (Quality): {quality}/5 ({quality_votes} मत/votes)
347
- शुद्धता (Correctness): {correctness}/5 ({correctness_votes} मत/votes)"""
348
 
349
  # --- Gradio UI Build ---
350
  def build_ui():
351
- with gr.Blocks(title="नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)") as app:
352
  gr.Markdown("# नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)")
353
- gr.Markdown(
354
- "यस प्लेटफर्मले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवाज डाटा संकलन गर्दछ। "
355
- "कृपया आफ्नो आवाज रेकर्ड गरेर योगदान दिनुहोस्।\n"
356
- "*This platform collects voice data for the development of Nepali Automatic Speech Recognition (ASR) technology. "
357
- "Please contribute by recording your voice.*"
358
- )
359
 
360
- # --- Data Collection Tabs ---
361
  with gr.Tabs():
362
  with gr.TabItem("१. आवाज रेकर्ड गर्नुहोस् (Record Voice)"):
363
  with gr.Tabs():
364
  with gr.TabItem("स्वतन्त्र पाठ (Free Text)"):
365
- with gr.Row():
366
- with gr.Column(scale=2):
367
- free_text_input = gr.Textbox(label="तपाईंले बोल्न चाहनुभएको पाठ (Text you want to speak)", placeholder="यहाँ लेख्नुहोस्...", lines=3)
368
- free_audio_input = gr.Audio(label="आवाज रेकर्ड गर्नुहोस् (Record your voice)", type="filepath", source="microphone")
369
- with gr.Column(scale=3):
370
- with gr.Row():
371
- free_gender_dd = gr.Dropdown(label="लिङ्ग (Gender)", choices=GENDERS, value=GENDERS[0])
372
- free_age_dd = gr.Dropdown(label="उमेर समूह (Age Group)", choices=AGE_GROUPS, value=AGE_GROUPS[1])
373
- with gr.Row():
374
- free_ethnicity_dd = gr.Dropdown(label="जातीयता (Ethnicity)", choices=list(COMMON_LAST_NAMES.keys()), value=list(COMMON_LAST_NAMES.keys())[0])
375
- free_lastname_dd = gr.Dropdown(label="थर (Last Name)", choices=COMMON_LAST_NAMES[list(COMMON_LAST_NAMES.keys())[0]])
376
- free_ethnicity_dd.change(fn=get_ethnicity_based_last_names, inputs=free_ethnicity_dd, outputs=free_lastname_dd)
377
- with gr.Row():
378
- free_region_dd = gr.Dropdown(label="क्षेत्र (Region)", choices=REGIONS, value=REGIONS[2])
379
- free_emotion_dd = gr.Dropdown(label="भावना (Emotion)", choices=EMOTIONS, value=EMOTIONS[0])
380
- free_submit_btn = gr.Button("सुरक्षित गर्नुहोस् (Save Free Text Recording)")
381
- free_status_output = gr.Textbox(label="स्थिति (Status)", interactive=False)
382
  free_submit_btn.click(
383
- save_recording,
384
  inputs=[free_audio_input, free_text_input, free_gender_dd, free_age_dd, free_ethnicity_dd, free_lastname_dd, free_region_dd, free_emotion_dd, gr.Textbox(value="free_text", visible=False)],
385
- outputs=[free_status_output, free_audio_input] # Clear audio on success
386
  )
387
 
388
  with gr.TabItem("निर्देशित पाठ (Prompted Text)"):
389
- with gr.Row():
390
- with gr.Column(scale=2):
391
- prompt_text_display = gr.Textbox(label="कृपया यो पाठ पढ्नुहोस् (Please read this text)", value=get_random_prompt(), lines=3, interactive=False)
392
- new_prompt_btn = gr.Button("नयाँ पाठ (New Prompt)")
393
- prompt_audio_input = gr.Audio(label="आवाज रेकर्ड गर्नुहोस् (Record your voice)", type="filepath", source="microphone")
394
- with gr.Column(scale=3):
395
- with gr.Row():
396
- prompt_gender_dd = gr.Dropdown(label="लिङ्ग (Gender)", choices=GENDERS, value=GENDERS[0])
397
- prompt_age_dd = gr.Dropdown(label="उमेर समूह (Age Group)", choices=AGE_GROUPS, value=AGE_GROUPS[1])
398
- with gr.Row():
399
- prompt_ethnicity_dd = gr.Dropdown(label="जातीयता (Ethnicity)", choices=list(COMMON_LAST_NAMES.keys()), value=list(COMMON_LAST_NAMES.keys())[0])
400
- prompt_lastname_dd = gr.Dropdown(label="थर (Last Name)", choices=COMMON_LAST_NAMES[list(COMMON_LAST_NAMES.keys())[0]])
401
- prompt_ethnicity_dd.change(fn=get_ethnicity_based_last_names, inputs=prompt_ethnicity_dd, outputs=prompt_lastname_dd)
402
- with gr.Row():
403
- prompt_region_dd = gr.Dropdown(label="क्षेत्र (Region)", choices=REGIONS, value=REGIONS[2])
404
- prompt_emotion_dd = gr.Dropdown(label="भावना (Emotion)", choices=EMOTIONS, value=EMOTIONS[0])
405
  new_prompt_btn.click(get_random_prompt, outputs=prompt_text_display)
406
- prompt_submit_btn = gr.Button("सुरक्षित गर्नुहोस् (Save Prompted Recording)")
407
- prompt_status_output = gr.Textbox(label="स्थिति (Status)", interactive=False)
408
  prompt_submit_btn.click(
409
- save_recording,
410
  inputs=[prompt_audio_input, prompt_text_display, prompt_gender_dd, prompt_age_dd, prompt_ethnicity_dd, prompt_lastname_dd, prompt_region_dd, prompt_emotion_dd, gr.Textbox(value="prompted_text", visible=False)],
411
  outputs=[prompt_status_output, prompt_audio_input]
412
  )
413
 
414
- with gr.TabItem("२. रेकर्डिङ समीक्षा गर्नुहोस् (Review Recordings)"):
415
- gr.Markdown("हालसालैका रेकर्डिङहरू हेर्नुहोस् र मत दिनुहोस्। (View and vote on recent recordings.)")
416
- num_review_items = gr.Number(value=10, label="देखाउने वस्तुहरूको संख्या (Number of items to show)", minimum=1, maximum=50, step=1)
417
- refresh_review_list_btn = gr.Button("सूची ताजा गर्नुहोस् (Refresh List)")
418
- review_list_df = gr.DataFrame(headers=['id', 'text', 'ethnicity', 'region', 'timestamp'], label="हालका रेकर्डिङहरू (Recent Recordings)", interactive=False, datatype=['str', 'str', 'str', 'str', 'str'])
419
-
420
- with gr.Row():
421
- selected_review_id = gr.Textbox(label="चयन गरिएको आईडी (Selected ID)", interactive=False)
422
- selected_review_text = gr.Textbox(label="रेकर्डिङ पाठ (Recording Text)", interactive=False, lines=2)
423
- review_audio_player = gr.Audio(label="रेकर्डिङ सुन्नुहोस् (Listen to Recording)", type="filepath")
424
- current_ratings_display = gr.Textbox(label="वर्तमान मूल्याङ्कन (Current Ratings)", interactive=False, lines=3)
 
 
 
425
 
426
  def select_for_review(evt: gr.SelectData, df_data: pd.DataFrame):
427
  if evt.index is None or df_data is None or len(df_data) == 0 or evt.index[0] >= len(df_data):
428
- return "", "", None, "कुनै रेकर्डिङ चयन गरिएको छैन (No recording selected)"
429
  selected_id_val = df_data.iloc[evt.index[0]]['id']
430
- audio_p, text_val = get_recording_audio(selected_id_val)
431
- ratings_text_val = get_recording_ratings(selected_id_val)
432
  return selected_id_val, text_val, audio_p, ratings_text_val
433
 
434
  review_list_df.select(select_for_review, inputs=[review_list_df], outputs=[selected_review_id, selected_review_text, review_audio_player, current_ratings_display])
435
- refresh_review_list_btn.click(list_recordings, inputs=[num_review_items], outputs=review_list_df)
436
-
437
- gr.Markdown("### मतदान गर्नुहोस् (Cast Your Vote)")
438
- with gr.Row():
439
- upvote_btn = gr.Button("👍 मन पर्यो (Upvote)")
440
- downvote_btn = gr.Button("👎 मन परेन (Downvote)")
441
- with gr.Row():
442
- quality_rating_slider = gr.Slider(minimum=1, maximum=5, step=1, label="गुणस्तर मूल्याङ्कन (Quality Rating 1-5)", value=3)
443
- submit_quality_btn = gr.Button("गुणस्तर मत दिनुहोस् (Submit Quality)")
444
- with gr.Row():
445
- correctness_rating_slider = gr.Slider(minimum=1, maximum=5, step=1, label="शुद्धता मूल्याङ्कन (Correctness Rating 1-5)", value=3)
446
- submit_correctness_btn = gr.Button("शुद्धता मत दिनुहोस् (Submit Correctness)")
447
- vote_status_output = gr.Textbox(label="मतदान स्थिति (Voting Status)", interactive=False)
448
-
449
- def vote_and_refresh(rec_id, vote_t, vote_val_str):
450
- status = vote_recording(rec_id, vote_t, str(vote_val_str)) # Ensure vote_val is str
451
- new_ratings = get_recording_ratings(rec_id) if rec_id else "रेकर्डिङ चयन गर्नुहोस् (Select a recording)"
452
- # Also refresh the main list to reflect potential score changes indirectly
453
- # latest_list = list_recordings(num_review_items.value) # This needs to be handled carefully to avoid component errors
454
  return status, new_ratings
455
 
456
- upvote_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="upvote", visible=False), gr.Number(value=0, visible=False)], outputs=[vote_status_output, current_ratings_display])
457
- downvote_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="downvote", visible=False), gr.Number(value=0, visible=False)], outputs=[vote_status_output, current_ratings_display])
458
- submit_quality_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="quality", visible=False), quality_rating_slider], outputs=[vote_status_output, current_ratings_display])
459
- submit_correctness_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="correctness", visible=False), correctness_rating_slider], outputs=[vote_status_output, current_ratings_display])
460
 
461
- with gr.TabItem("३. प्रगति र अपलोड (Progress & Upload)"):
462
- gr.Markdown("## संकलन प्रगति (Collection Progress)")
463
- total_count_display = gr.Textbox(label="कुल संकलित रेकर्डिङ (Total Recordings Collected)", interactive=False)
464
- refresh_total_count_btn = gr.Button("गणना ताजा गर्नुहोस् (Refresh Count)")
465
- refresh_total_count_btn.click(update_count, outputs=total_count_display)
466
 
 
 
 
 
 
467
  gr.Markdown("---")
468
- gr.Markdown("## हगिङफेसमा अपलोड गर्नुहोस् (Upload to Hugging Face)")
469
- gr.Markdown(
470
- "**महत्वपूर्ण:** यो कार्यले स्पेसमा संकलित सबै डाटालाई हगिङ फेस डेटासेटमा पुश गर्नेछ। "
471
- "स्पेसको स्टोरेज अस्थायी हुन सक्छ, त्यसैले नियमित रूपमा अपलोड गर्न सिफारिस गरिन्छ।\n"
472
- "यो कार्य गर्नको लागि, तपाईंले स्पेस सेटिङहरूमा `HF_TOKEN` (लेख्ने पहुँच सहितको हगिङ फेस टोकन) "
473
- "र `ADMIN_UPLOAD_PASSWORD` गोप्य रूपमा थप्नुपर्छ।\n\n"
474
- "**IMPORTANT:** This action will push all data collected in this Space to the Hugging Face Dataset. "
475
- "Space storage can be ephemeral, so regular uploads are recommended. "
476
- "To perform this action, you must have added `HF_TOKEN` (a Hugging Face token with write access) "
477
- "and `ADMIN_UPLOAD_PASSWORD` as secrets in the Space settings."
478
- )
479
- hf_dataset_name_input = gr.Textbox(label="Dataset Name (e.g., your_username/nepali-asr-data)", placeholder="your_hf_username/dataset_repo_name")
480
- admin_password_input = gr.Textbox(label="Admin Upload Password", type="password", placeholder="Enter admin password")
481
- upload_to_hf_btn = gr.Button("हगिङफेसमा अपलोड गर्नुहोस् (Upload to Hugging Face)")
482
- upload_status_output = gr.Textbox(label="अपलोड स्थिति (Upload Status)", interactive=False, lines=5)
483
- upload_to_hf_btn.click(upload_to_huggingface, inputs=[hf_dataset_name_input, admin_password_input], outputs=upload_status_output)
484
 
485
- with gr.TabItem("४. जानकारी (Information)"):
486
- gr.Markdown(render_info_page()) # Using a helper for cleaner code
 
 
487
 
488
- # Initial loads
489
- app.load(fn=update_count, inputs=None, outputs=total_count_display)
490
- app.load(fn=lambda n: list_recordings(n), inputs=[num_review_items], outputs=review_list_df) # Load initial review list
 
 
491
 
 
 
492
  return app
493
 
494
  def render_info_page():
 
495
  return """
496
  ## नेपाली ASR डाटा संकलन प्रोजेक्टको बारेमा (About the Nepali ASR Data Collection Project)
497
-
498
  यो प्रोजेक्टले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवश्यक डाटा संकलन गर्दछ।
499
- तपाईंको योगदानले नेपाली भाषा प्रविधिको विकासमा ठूलो मद्दत पुर्‍याउनेछ।
500
-
501
  ### कसरी योगदान दिने (How to Contribute):
502
- 1. **आवाज रेकर्ड गर्नुहोस् (Record Voice)** ट्याबमा जानुहोस्।
503
- * **स्वतन्त्र पाठ (Free Text)** अन्तर्गत, तपाईं आफ्नो इच्छा अनुसारको पाठ लेख्नुहोस्, आवश्यक विवरणहरू (लिङ्ग, उमेर, आदि) छान्नुहोस्,आफ्नो आवाज रेकर्ड गर्नुहोस्।
504
- * **निर्देशित पाठ (Prompted Text)** अन्तर्गत, दिइएको नेपाली वाक्य पढ्नुहोस्, विवरणहरू छान्नुहोस्, र आफ्नो आवाज रेकर्ड गर्नुहोस्। "नयाँ पाठ" बटनले तपाईंलाई फरक वाक्य दिनेछ।
505
- 2. **रेकर्डिङ समीक्षा गर्नुहोस् (Review Recordings)** ट्याबमा गएर अरूले गरेका रेकर्डिङहरू सुन्नुहोस् र तिनीहरूको गुणस्तर र शुद्धताको लागि मतदान गर्नुहोस्। यसले डाटाको गुणस्तर सुधार गर्न मद्दत गर्दछ।
506
- 3. रेकर्डिङ पछि, "सुरक्षित गर्नुहोस् (Save)" बटनमा क्लिक गर्नुहोस्।
507
-
508
  ### गोपनीयता नीति (Privacy Policy):
509
- - तपाईंको आवाज रेकर्डिङ र सम्बन्धित मेटाडाटा (जस्तै उमेर समूह, लिङ्ग, क्षेत्र) सार्वजनिक अनुसन्धान उद्देश्यका लागि प्रयोग गरिनेछ।
510
- - हामी तपाईंको नाम वा सम्पर्क जानकारी जस्ता प्रत्यक्ष व्यक्तिगत पहिचान योग्य जानकारी सङ्कलन गर्दैनौं। तपाईंले प्रदान गर्नुभएको जातीयता/थरको जानकारी उच्चारण र विविधता अध्ययनको लागि हो।
511
- - यो डाटासेट खुला स्रोत हुनेछ हगिङ फेस जस्ता प्लेटफर्महरूमा अनुसन्धान समुदायको लागि उपलब्ध गराइनेछ।
512
- - कृपया रेकर्डिङको क्रममा कुनै पनि संवेदनशील व्यक्तिगत जानकारी नबोल्नुहोस्।
513
-
514
- ---
515
-
516
- ## About Nepali ASR Data Collection Project
517
-
518
- This project collects voice data essential for developing Automatic Speech Recognition (ASR) technology for the Nepali language.
519
- Your contribution will significantly aid in the advancement of Nepali language technology.
520
-
521
- ### How to Contribute:
522
- 1. Go to the **Record Voice (आवाज रेकर्ड गर्नुहोस्)** tab.
523
- * Under **Free Text (स्वतन्त्र पाठ)**, type any Nepali text you wish, select the required demographic details (gender, age, etc.), and record your voice.
524
- * Under **Prompted Text (निर्देशित पाठ)**, read the provided Nepali sentence, select demographic details, and record your voice. The "New Text (नयाँ पाठ)" button will give you a different sentence.
525
- 2. Go to the **Review Recordings (रेकर्डिङ समीक्षा गर्नुहोस्)** tab to listen to recordings made by others and vote on their quality and correctness. This helps improve the overall quality of the dataset.
526
- 3. After recording, click the "Save (सुरक्षित गर्नुहोस्)" button.
527
-
528
- ### Privacy Policy:
529
- - Your voice recordings and associated metadata (like age group, gender, region) will be used for public research purposes.
530
- - We do not collect directly personally identifiable information such as your name or contact details. The ethnicity/last name information you provide is for studying accent and diversity.
531
- - This dataset will be open-source and made available to the research community on platforms like Hugging Face.
532
- - Please do not speak any sensitive personal information during your recordings.
533
- """
534
 
535
  # --- Main Execution ---
536
  if __name__ == "__main__":
537
- # Ensure storage is initialized when running locally too
538
  initialize_data_storage()
539
-
540
  app_ui = build_ui()
541
  app_ui.launch()
 
6
  import datetime
7
  import uuid
8
  import json
9
+ import io # For BytesIO
10
+ from huggingface_hub import HfApi, create_repo, HfFileSystem
11
+ from datasets import Dataset # Still useful for potential future aggregation
12
+ import soundfile as sf
13
+ import shutil
14
+ import traceback # For detailed error logging
15
+
16
+ # --- Configuration ---
17
  SAMPLE_PROMPTS = [
18
+ "नमस्ते, मेरो नाम __ हो। म नेपाली बोल्छु।", "आज मौसम धेरै राम्रो छ।",
19
+ "नेपाल एक सुन्दर देश हो जहाँ हिमाल, पहाड र तराई छन्।", "काठमाडौं नेपालको राजधानी हो।",
20
+ " आज बिहान स्कूल जाँदैछु।", "नेपाली भाषा बोल्ने मानिसहरू विश्वभर छन्।",
21
+ "हिमालमा हिउँ परिरहेको छ।", "मलाई नेपाली खाना धेरै मन पर्छ।",
22
+ "बुद्ध नेपालमा जन्मिएका थिए।", "सगरमाथा विश्वको सबैभन्दा अग्लो हिमाल हो।"
 
 
 
 
 
23
  ]
 
24
  EMOTIONS = ["सामान्य (Neutral)", "खुसी (Happy)", "दुःखी (Sad)", "रिसाएको (Angry)", "अचम्मित (Surprised)"]
25
  GENDERS = ["पुरुष (Male)", "महिला (Female)", "अन्य (Other)", "भन्न चाहन्न (Prefer not to say)"]
26
  AGE_GROUPS = ["18 भन्दा कम", "18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
 
 
27
  REGIONS = [
28
+ "प्रदेश १ (Province 1)", "मधेश प्रदेश (Madhesh Province)", "बागमती प्रदेश (Bagmati Province)",
29
+ "गण्डकी प्रदेश (Gandaki Province)", "लुम्बिनी प्रदेश (Lumbini Province)",
30
+ "कर्णाली प्रदेश (Karnali Province)", "सुदूरपश्चिम प्रदेश (Sudurpashchim Province)"
 
 
 
 
31
  ]
 
32
  COMMON_LAST_NAMES = {
33
  "पहाडी (Pahadi)": ["शर्मा (Sharma)", "पौडेल (Poudel)", "खनाल (Khanal)", "अधिकारी (Adhikari)", "भट्टराई (Bhattarai)", "अन्य पहाडी (Other Pahadi)"],
34
  "नेवार (Newar)": ["श्रेष्ठ (Shrestha)", "प्रधान (Pradhan)", "महर्जन (Maharjan)", "बज्राचार्य (Bajracharya)", "अन्य नेवार (Other Newar)"],
 
43
  "अन्य (Other)": ["अन्य (Other)"]
44
  }
45
 
46
+ # --- Directory and File Paths (Local to the Space) ---
47
# All storage lives next to this file so paths do not depend on the
# process working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# One sub-directory per kind of artifact.
RECORDINGS_DIR, METADATA_DIR, RATINGS_DIR = (
    os.path.join(BASE_DIR, subdir) for subdir in ("recordings", "metadata", "ratings")
)

# Local bookkeeping files.
LOCAL_METADATA_FILE = os.path.join(METADATA_DIR, "local_metadata.csv")
LOCAL_RATINGS_FILE = os.path.join(RATINGS_DIR, "local_ratings.json")
HF_UPLOAD_LOG_FILE = os.path.join(METADATA_DIR, "hf_upload_log.csv")  # one row per HF upload attempt
54
+
55
+ # --- Hugging Face Configuration (from Space Secrets) ---
56
+ # These will be fetched from environment variables set by Space Secrets
57
+ # HF_TOKEN: Your HF Write Token
58
+ # ADMIN_UPLOAD_PASSWORD: Password for admin functions
59
+ # TARGET_HF_DATASET_REPO_ID: e.g., "yourusername/your-dataset-repo"
60
 
61
  # --- Initialization ---
62
def initialize_data_storage():
    """Create the local storage directories and seed any missing bookkeeping files.

    Existing directories and files are left untouched, so the function can be
    called more than once.
    """
    for directory in (RECORDINGS_DIR, METADATA_DIR, RATINGS_DIR):
        os.makedirs(directory, exist_ok=True)

    if not os.path.exists(LOCAL_METADATA_FILE):
        metadata_columns = [
            "id", "text", "local_audio_path", "gender", "age_group", "ethnicity",
            "last_name", "region", "emotion", "timestamp", "recording_type",
            "hf_audio_uploaded", "hf_metadata_uploaded",  # per-recording HF upload flags
        ]
        pd.DataFrame(columns=metadata_columns).to_csv(LOCAL_METADATA_FILE, index=False)

    if not os.path.exists(LOCAL_RATINGS_FILE):
        # Ratings are a JSON object keyed by recording id; start empty.
        with open(LOCAL_RATINGS_FILE, 'w') as ratings_file:
            json.dump({}, ratings_file)

    if not os.path.exists(HF_UPLOAD_LOG_FILE):
        log_columns = ["id", "timestamp", "hf_repo_id", "type", "status", "message"]
        pd.DataFrame(columns=log_columns).to_csv(HF_UPLOAD_LOG_FILE, index=False)
80
+
81
+ initialize_data_storage()
82
+
83
+ # --- Helper: Log HF Upload Attempts ---
84
def log_hf_upload_attempt(recording_id, hf_repo_id, upload_type, status, message=""):
    """Append one row describing a Hugging Face upload attempt to the upload log CSV.

    Args:
        recording_id: Id of the recording the attempt refers to.
        hf_repo_id: Target dataset repository id.
        upload_type: Free-form label for what was uploaded (e.g. "audio").
        status: Outcome label (e.g. "success" or "failure").
        message: Optional detail text, typically the error description.
    """
    existing_log = pd.read_csv(HF_UPLOAD_LOG_FILE)
    entry = {
        "id": recording_id,
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "hf_repo_id": hf_repo_id,
        "type": upload_type,
        "status": status,
        "message": message,
    }
    combined_log = pd.concat([existing_log, pd.DataFrame([entry])], ignore_index=True)
    combined_log.to_csv(HF_UPLOAD_LOG_FILE, index=False)
96
 
97
  # --- Core Functions ---
98
def save_and_direct_upload_recording(audio_input, text, gender, age_group, ethnicity, last_name, region, emotion, recording_type):
    """Save a recording locally, then try to upload it to the Hugging Face dataset.

    The recording is always written to local storage first. If the HF token or
    target repo id is missing from the environment, the function stops after the
    local save. Otherwise it uploads the audio file and a per-recording JSON
    metadata entry, updating the local upload flags and the upload log as it goes.

    Args:
        audio_input: Audio value from the Gradio component (file path or
            ``(sample_rate, data)`` tuple), or None if nothing was recorded.
        text: Transcript / prompt text for the recording.
        gender, age_group, ethnicity, last_name, region, emotion: Speaker metadata.
        recording_type: Label for how the text was obtained (e.g. "free_text").

    Returns:
        A ``(status_message, None)`` tuple; the second element clears the audio
        input component in the UI.
    """
    if audio_input is None:
        return "कृपया पहिले रेकर्डिङ गर्नुहोस्। (Please record audio first)", None

    # HF configuration comes from the environment (Space secrets).
    hf_token = os.environ.get("HF_TOKEN")
    target_hf_repo_id = os.environ.get("TARGET_HF_DATASET_REPO_ID")

    # Save locally first, with both upload flags initially False.
    # save_recording_locally returns (message, recording_id, local_audio_path);
    # recording_id is None when the local save failed.
    status_msg_local, recording_id, local_audio_path = save_recording_locally(
        audio_input, text, gender, age_group, ethnicity, last_name, region, emotion, recording_type,
        hf_audio_uploaded=False, hf_metadata_uploaded=False,
    )
    if not recording_id:
        return status_msg_local, None

    if not hf_token or not target_hf_repo_id:
        return f"{status_msg_local} (HF config missing in secrets, saved locally only)", None

    final_status_message = status_msg_local

    try:
        api = HfApi(token=hf_token)
        create_repo(repo_id=target_hf_repo_id, token=hf_token, repo_type="dataset", exist_ok=True, private=False)
        # Best-effort pre-creation of the standard sub-folders; a failure here
        # must not block the upload, but it is reported instead of being hidden.
        # NOTE(review): confirm that the installed huggingface_hub exposes
        # HfApi.create_folder; upload_file below creates parent paths anyway.
        for folder in ["audio", "metadata_entries", "ratings_entries"]:
            try:
                api.create_folder(repo_id=target_hf_repo_id, folder_path=folder, repo_type="dataset", exist_ok=True)
            except Exception as folder_error:
                print(f"Warning: could not pre-create folder '{folder}' in {target_hf_repo_id}: {folder_error}")

        # 1. Upload the audio file.
        hf_audio_path = f"audio/{os.path.basename(local_audio_path)}"
        api.upload_file(
            path_or_fileobj=local_audio_path,
            path_in_repo=hf_audio_path,
            repo_id=target_hf_repo_id,
            repo_type="dataset",
            commit_message=f"feat: Add audio for recording {recording_id}",
        )
        log_hf_upload_attempt(recording_id, target_hf_repo_id, "audio", "success")
        update_local_metadata_hf_status(recording_id, hf_audio_uploaded=True)
        final_status_message += f"\nअडियो सफलतापूर्वक HF मा अपलोड गरियो। (Audio uploaded to HF successfully.)"

        # 2. Upload the metadata entry as its own JSON file.
        single_metadata_dict_for_hf = {
            "id": recording_id, "text": text, "hf_audio_path": hf_audio_path,
            "gender": gender, "age_group": age_group, "ethnicity": ethnicity,
            "last_name": last_name, "region": region, "emotion": emotion,
            "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "recording_type": recording_type,
        }
        hf_metadata_entry_path = f"metadata_entries/{recording_id}.json"
        metadata_bytes = json.dumps(single_metadata_dict_for_hf, ensure_ascii=False, indent=2).encode('utf-8')
        api.upload_file(
            path_or_fileobj=io.BytesIO(metadata_bytes),
            path_in_repo=hf_metadata_entry_path,
            repo_id=target_hf_repo_id,
            repo_type="dataset",
            commit_message=f"feat: Add metadata for recording {recording_id}",
        )
        log_hf_upload_attempt(recording_id, target_hf_repo_id, "metadata_entry", "success")
        update_local_metadata_hf_status(recording_id, hf_metadata_uploaded=True)
        final_status_message += f"\nमेटाडाटा सफलतापूर्वक HF मा अपलोड गरियो। (Metadata uploaded to HF successfully.)"

    except Exception as e:
        # The local copy is already on disk; record the failure so an admin
        # retry can pick the recording up via the upload flags.
        tb_str = traceback.format_exc()
        error_message = f"HF अपलोडको क्रममा त्रुटि: {str(e)}. (Error during HF upload.)\nविवरण (Details): {tb_str}"
        log_hf_upload_attempt(recording_id, target_hf_repo_id, "audio_or_metadata_entry_direct", "failure", error_message)
        final_status_message += f"\n{error_message}\nडाटा स्थानीय रूपमा सुरक्षित गरिएको छ। (Data saved locally.)"

    return final_status_message, None
178
+
179
def save_recording_locally(audio_input, text, gender, age_group, ethnicity, last_name, region, emotion, recording_type, hf_audio_uploaded=False, hf_metadata_uploaded=False):
    """Write the audio, a metadata row and an empty ratings entry to local storage.

    Returns:
        ``(message, recording_id, local_audio_path)``. On failure to write the
        audio or the metadata row, the last two elements are None.
    """
    recording_id = str(uuid.uuid4())
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    wav_name = f"{recording_id}.wav"
    local_audio_path = os.path.join(RECORDINGS_DIR, wav_name)

    # Step 1: persist the audio. A tuple is written as (sample_rate, data);
    # a string is treated as the path of an existing file to copy.
    try:
        if isinstance(audio_input, tuple):
            sample_rate, samples = audio_input
            sf.write(local_audio_path, samples, sample_rate)
        elif isinstance(audio_input, str) and os.path.exists(audio_input):
            shutil.copy(audio_input, local_audio_path)
        else:
            return "अडियो फाइल बचत गर्न सकिएन। (Could not save audio file. Invalid audio input.)", None, None
    except Exception as e:
        return f"स्थानीय रूपमा अडियो बचत गर्दा त्रुटि भयो: {e} (Error saving audio locally: {e})", None, None

    # Step 2: append the metadata row to the local CSV.
    try:
        existing_rows = pd.read_csv(LOCAL_METADATA_FILE)
        record = {
            "id": recording_id,
            "text": text,
            "local_audio_path": local_audio_path,
            "gender": gender,
            "age_group": age_group,
            "ethnicity": ethnicity,
            "last_name": last_name,
            "region": region,
            "emotion": emotion,
            "timestamp": timestamp,
            "recording_type": recording_type,
            "hf_audio_uploaded": hf_audio_uploaded,
            "hf_metadata_uploaded": hf_metadata_uploaded,
        }
        all_rows = pd.concat([existing_rows, pd.DataFrame([record])], ignore_index=True)
        all_rows.to_csv(LOCAL_METADATA_FILE, index=False)
    except Exception as e:
        # Remove the audio so no file is left without a metadata row.
        if os.path.exists(local_audio_path):
            os.remove(local_audio_path)
        return f"स्थानीय मेटाडाटा बचत गर्दा त्रुटि: {e} (Error saving local metadata: {e})", None, None

    # Step 3: create a zeroed ratings entry. Failure here only prints a warning.
    try:
        with open(LOCAL_RATINGS_FILE, 'r+') as ratings_file:
            all_ratings = json.load(ratings_file)
            all_ratings[recording_id] = {"upvotes": 0, "downvotes": 0, "quality_score": 0, "quality_votes": 0, "correctness_score": 0, "correctness_votes": 0}
            ratings_file.seek(0)
            json.dump(all_ratings, ratings_file, indent=2, ensure_ascii=False)
            ratings_file.truncate()
    except Exception as e:
        print(f"Warning: Could not initialize local ratings for {recording_id}: {e}")

    return f"रेकर्डिङ सफलतापूर्वक स्थानीय रूपमा सुरक्षित गरियो! ID: {recording_id} (Recording saved locally successfully!)", recording_id, local_audio_path
 
227
 
228
def update_local_metadata_hf_status(recording_id, hf_audio_uploaded=None, hf_metadata_uploaded=None):
    """Set the HF upload flags for one recording in the local metadata CSV.

    A flag passed as None is left unchanged. Any error is printed rather than
    raised.
    """
    try:
        metadata_df = pd.read_csv(LOCAL_METADATA_FILE)
        matching_rows = metadata_df.index[metadata_df['id'] == recording_id]
        if matching_rows.empty:
            return
        requested_flags = {
            'hf_audio_uploaded': hf_audio_uploaded,
            'hf_metadata_uploaded': hf_metadata_uploaded,
        }
        for column, new_value in requested_flags.items():
            if new_value is not None:
                metadata_df.loc[matching_rows, column] = new_value
        metadata_df.to_csv(LOCAL_METADATA_FILE, index=False)
    except Exception as e:
        print(f"Error updating local metadata HF status for {recording_id}: {e}")
241
 
242
def get_random_prompt():
    """Return one entry of SAMPLE_PROMPTS picked at random."""
    chosen_prompt = random.choice(SAMPLE_PROMPTS)
    return chosen_prompt
243
def get_ethnicity_based_last_names(ethnicity):
    """Return a Gradio update that swaps the last-name dropdown choices.

    Args:
        ethnicity: Key into COMMON_LAST_NAMES; unknown keys fall back to the
            "अन्य (Other)" entry.

    Returns:
        A ``gr.update(...)`` payload carrying the new ``choices``.
    """
    last_names = COMMON_LAST_NAMES.get(ethnicity, COMMON_LAST_NAMES["अन्य (Other)"])
    # gr.update is the component-agnostic form; the per-component
    # `gr.Dropdown.update` classmethod is not available in Gradio 4, which the
    # rest of this file targets (e.g. `sources=[...]` on gr.Audio).
    # NOTE(review): the previously selected last name is not cleared here;
    # consider also resetting `value` if stale selections cause problems.
    return gr.update(choices=last_names)
 
 
 
244
 
245
def vote_recording(recording_id, vote_type, vote_value_str):
    """Record one vote for a recording in the local ratings JSON file.

    Args:
        recording_id: Id of the recording being rated.
        vote_type: One of "upvote", "downvote", "quality", "correctness".
        vote_value_str: Score for "quality"/"correctness" votes, as a string or
            number (e.g. "4", "4.0", 4). Ignored for up/down votes.

    Returns:
        A user-facing status message. Errors are reported in the message
        rather than raised.
    """
    if not recording_id:
        return "कृपया पहिले समीक्षा गर्न रेकर्डिङ चयन गर्नुहोस्।"
    if not os.path.exists(LOCAL_RATINGS_FILE):
        return "स्थानीय रेटिङ फाइल भेटिएन।"

    # UI components may deliver floats ("3.0"); int("3.0") raises, so go through
    # float first. An unparseable value is kept as None instead of being turned
    # into a 0 that would silently drag the running average down.
    try:
        vote_value = int(float(vote_value_str))
    except (TypeError, ValueError, OverflowError):
        vote_value = None

    try:
        with open(LOCAL_RATINGS_FILE, 'r+') as f:
            ratings = json.load(f)
            if recording_id not in ratings:
                return "यस रेकर्डिङको लागि स्थानीय मूल्याङ्कन भेटिएन।"

            rec_ratings = ratings[recording_id]
            if vote_type == "upvote":
                rec_ratings["upvotes"] = rec_ratings.get("upvotes", 0) + 1
            elif vote_type == "downvote":
                rec_ratings["downvotes"] = rec_ratings.get("downvotes", 0) + 1
            elif vote_type in ("quality", "correctness"):
                # Scores are displayed as "x/5" and collected with 1-5 sliders.
                if vote_value is None or not 1 <= vote_value <= 5:
                    return "अमान्य मूल्याङ्कन मान। (Invalid rating value; expected 1-5.)"
                score_key = f"{vote_type}_score"
                votes_key = f"{vote_type}_votes"
                previous_votes = rec_ratings.get(votes_key, 0)
                new_votes = previous_votes + 1
                # Incremental mean: old_mean * old_count + new_value, over new count.
                rec_ratings[score_key] = ((rec_ratings.get(score_key, 0) * previous_votes) + vote_value) / new_votes
                rec_ratings[votes_key] = new_votes
            else:
                return "अमान्य मतदान प्रकार।"

            # Rewrite the file in place; truncate in case the new JSON is shorter.
            f.seek(0)
            json.dump(ratings, f, indent=2, ensure_ascii=False)
            f.truncate()
        return "स्थानीय मतदान सफलतापूर्वक दर्ता गरियो!"
    except Exception as e:
        return f"स्थानीय मतदान दर्ता गर्दा त्रुटि: {str(e)}"
270
+
271
def get_local_recording_audio(recording_id):
    """Look up a recording in the local metadata CSV.

    Returns:
        ``(audio_path, text)`` when the recording and its audio file exist,
        otherwise ``(None, message)`` where the message explains what is missing.
    """
    if not recording_id:
        return None, "कुनै रेकर्डिङ आईडी प्रदान गरिएको छैन।"
    if not os.path.exists(LOCAL_METADATA_FILE):
        return None, "स्थानीय मेटाडाटा फाइल भेटिएन।"

    try:
        metadata = pd.read_csv(LOCAL_METADATA_FILE)
    except pd.errors.EmptyDataError:
        return None, "स्थानीय मेटाडाटा खाली छ।"

    matches = metadata[metadata['id'] == recording_id]
    if len(matches) == 0:
        return None, "रेकर्डिङ भेटिएन।"

    # Use the first matching row.
    audio_path = matches['local_audio_path'].iloc[0]
    text = matches['text'].iloc[0]
    if os.path.exists(audio_path):
        return audio_path, text
    return None, f"स्थानीय अडियो फाइल भेटिएन: {audio_path}"
282
 
283
def get_local_recording_ratings(recording_id):
    """Return a formatted, multi-line summary of the local ratings for a recording.

    When the id is empty, the ratings file is missing, or the recording has no
    entry, a short explanatory message is returned instead.
    """
    if not recording_id:
        return "रेकर्डिङ आईडी चयन गर्नुहोस्।"
    if not os.path.exists(LOCAL_RATINGS_FILE):
        return "स्थानीय मूल्याङ्कन डाटा भेटिएन।"

    with open(LOCAL_RATINGS_FILE, 'r') as ratings_file:
        ratings_data = json.load(ratings_file)

    if recording_id not in ratings_data:
        return "यस रेकर्डिङको लागि कुनै स्थानीय मूल्याङ्कन भेटिएन।"

    entry = ratings_data[recording_id]
    summary_lines = [
        f"👍 Upvotes: {entry.get('upvotes',0)} | 👎 Downvotes: {entry.get('downvotes',0)}",
        f"गुणस्तर (Quality): {entry.get('quality_score',0):.1f}/5 ({entry.get('quality_votes',0)} मत)",
        f"शुद्धता (Correctness): {entry.get('correctness_score',0):.1f}/5 ({entry.get('correctness_votes',0)} मत)",
    ]
    return "\n".join(summary_lines)
292
+
293
def update_local_count():
    """Return a text summary of local recording counts and HF upload counts."""
    if not os.path.exists(LOCAL_METADATA_FILE):
        return "कुनै स्थानीय रेकर्डिङ भेटिएन।"

    try:
        metadata = pd.read_csv(LOCAL_METADATA_FILE)
        # Summing the flag columns counts the rows marked as uploaded.
        uploaded_audio_count = metadata['hf_audio_uploaded'].sum()
        uploaded_metadata_count = metadata['hf_metadata_uploaded'].sum()
        total_line = f"कुल स्थानीय रेकर्डिङ: {len(metadata)}\n"
        audio_line = f"HF मा अडियो अपलोड गरिएको: {uploaded_audio_count}\n"
        metadata_line = f"HF मा मेटाडाटा अपलोड गरिएको: {uploaded_metadata_count}"
        return total_line + audio_line + metadata_line
    except pd.errors.EmptyDataError:
        return "स्थानीय रेकर्डिङ: 0"
304
+
305
def list_local_recordings(num_items=10):
    """Return the most recent local recordings as a display-ready DataFrame.

    Args:
        num_items: Maximum number of rows to return (converted with ``int``).

    Returns:
        A DataFrame with id, text, ethnicity, region, formatted timestamp and
        the two HF upload flags, newest first; an empty DataFrame when there is
        nothing to show.
    """
    if not os.path.exists(LOCAL_METADATA_FILE):
        return pd.DataFrame()

    try:
        metadata = pd.read_csv(LOCAL_METADATA_FILE)
    except pd.errors.EmptyDataError:
        return pd.DataFrame()

    if len(metadata) == 0:
        return pd.DataFrame()

    # Unparseable timestamps become NaT and are rendered as 'N/A' below.
    metadata['timestamp'] = pd.to_datetime(metadata['timestamp'], errors='coerce')
    newest_first = metadata.sort_values('timestamp', ascending=False)
    recent = newest_first.head(int(num_items))

    shown_columns = ['id', 'text', 'ethnicity', 'region', 'timestamp', 'hf_audio_uploaded', 'hf_metadata_uploaded']
    display_df = recent[shown_columns].copy()
    formatted_times = display_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M')
    display_df['timestamp'] = formatted_times.fillna('N/A')
    return display_df.reset_index(drop=True)
315
 
316
+ # --- Admin Functions for HF Syncing ---
317
def admin_retry_failed_uploads(admin_password_attempt):
    """Re-upload every local recording whose audio or metadata is not yet on HF.

    Args:
        admin_password_attempt: Password typed by the admin; must match the
            ``ADMIN_UPLOAD_PASSWORD`` environment variable.

    Returns:
        A status message: either an error explaining why nothing was done, or a
        summary with one SUCCESS/FAILURE line per retried recording.
    """
    import hmac  # local import: only needed for the constant-time comparison

    hf_token = os.environ.get("HF_TOKEN")
    target_hf_repo_id = os.environ.get("TARGET_HF_DATASET_REPO_ID")
    expected_admin_password = os.environ.get("ADMIN_UPLOAD_PASSWORD")

    # Fail closed: an unset/empty secret or a non-string attempt never passes
    # (a plain `!=` lets None == None through when the secret is unset).
    password_ok = (
        bool(expected_admin_password)
        and isinstance(admin_password_attempt, str)
        and hmac.compare_digest(
            admin_password_attempt.encode("utf-8"),
            expected_admin_password.encode("utf-8"),
        )
    )
    if not password_ok:
        return "अमान्य प्रशासक पासवर्ड।"
    if not hf_token or not target_hf_repo_id:
        return "HF कन्फिगरेसन गोप्यमा हराइरहेको छ।"
    if not os.path.exists(LOCAL_METADATA_FILE):
        return "कुनै स्थानीय मेटाडाटा फाइल भेटिएन।"

    local_meta_df = pd.read_csv(LOCAL_METADATA_FILE)
    # Treat a missing flag as "not uploaded" and force a real boolean dtype so
    # the mask below and the per-row checks behave the same way.
    for flag_column in ("hf_audio_uploaded", "hf_metadata_uploaded"):
        local_meta_df[flag_column] = local_meta_df[flag_column].fillna(False).astype(bool)
    fully_uploaded = local_meta_df['hf_audio_uploaded'] & local_meta_df['hf_metadata_uploaded']
    to_retry_df = local_meta_df[~fully_uploaded]

    if to_retry_df.empty:
        return "पुनः प्रयास गर्न कुनै असफल अपलोडहरू भेटिएन।"

    def _json_safe(value):
        # Empty CSV cells come back as NaN, which json.dumps would emit as the
        # non-standard token `NaN`; map them to null instead.
        return None if pd.isna(value) else value

    api = HfApi(token=hf_token)
    success_count = 0
    failure_count = 0
    messages = []

    for _, row in to_retry_df.iterrows():
        rec_id = row['id']
        current_status_msg = f"Retrying {rec_id}: "

        try:
            local_audio_p = row['local_audio_path']
            hf_audio_path = f"audio/{os.path.basename(local_audio_p)}"

            if row['hf_audio_uploaded']:
                current_status_msg += "Audio already uploaded. "
            else:
                # Without the local file the retry cannot succeed; report it as
                # a failure instead of silently counting the row as done.
                if not os.path.exists(local_audio_p):
                    raise FileNotFoundError(f"Local audio file not found: {local_audio_p}")
                api.upload_file(
                    path_or_fileobj=local_audio_p, path_in_repo=hf_audio_path,
                    repo_id=target_hf_repo_id, repo_type="dataset",
                    commit_message=f"fix: Retry audio upload for {rec_id}",
                )
                update_local_metadata_hf_status(rec_id, hf_audio_uploaded=True)
                log_hf_upload_attempt(rec_id, target_hf_repo_id, "audio_retry", "success")
                current_status_msg += "Audio OK. "

            if row['hf_metadata_uploaded']:
                current_status_msg += "Metadata already uploaded."
            else:
                # Rebuild the per-recording metadata entry from the CSV row.
                hf_meta_dict = {
                    "id": rec_id, "text": _json_safe(row["text"]), "hf_audio_path": hf_audio_path,
                    "gender": _json_safe(row["gender"]), "age_group": _json_safe(row["age_group"]),
                    "ethnicity": _json_safe(row["ethnicity"]), "last_name": _json_safe(row["last_name"]),
                    "region": _json_safe(row["region"]), "emotion": _json_safe(row["emotion"]),
                    "timestamp": _json_safe(row["timestamp"]),
                    "recording_type": _json_safe(row["recording_type"]),
                }
                hf_meta_entry_path = f"metadata_entries/{rec_id}.json"
                api.upload_file(
                    path_or_fileobj=io.BytesIO(json.dumps(hf_meta_dict, ensure_ascii=False, indent=2).encode('utf-8')),
                    path_in_repo=hf_meta_entry_path, repo_id=target_hf_repo_id, repo_type="dataset",
                    commit_message=f"fix: Retry metadata entry upload for {rec_id}",
                )
                update_local_metadata_hf_status(rec_id, hf_metadata_uploaded=True)
                log_hf_upload_attempt(rec_id, target_hf_repo_id, "metadata_entry_retry", "success")
                current_status_msg += "Metadata OK."

            success_count += 1
            messages.append(f"SUCCESS: {current_status_msg}")
        except Exception as e:
            failure_count += 1
            err_msg = f"FAILURE for {rec_id}: {str(e)}"
            messages.append(err_msg)
            log_hf_upload_attempt(rec_id, target_hf_repo_id, "audio_or_metadata_retry", "failure", str(e))

    return f"पुनः प्रयास सम्पन्न। सफलता: {success_count}, असफलता: {failure_count}\n" + "\n".join(messages)
 
384
 
385
def admin_sync_ratings_to_hf(admin_password_attempt):
    """Upload every non-empty local ratings entry to the HF dataset repo.

    Each recording's ratings are written to ``ratings_entries/<id>.json``,
    overwriting any existing file at that path.

    Args:
        admin_password_attempt: Password typed by the admin; must match the
            ``ADMIN_UPLOAD_PASSWORD`` environment variable.

    Returns:
        A status message: either an error explaining why nothing was done, or a
        summary with one SUCCESS/FAILURE line per synced recording.
    """
    import hmac  # local import: only needed for the constant-time comparison

    hf_token = os.environ.get("HF_TOKEN")
    target_hf_repo_id = os.environ.get("TARGET_HF_DATASET_REPO_ID")
    expected_admin_password = os.environ.get("ADMIN_UPLOAD_PASSWORD")

    # Fail closed: an unset/empty secret or a non-string attempt never passes
    # (a plain `!=` lets None == None through when the secret is unset).
    password_ok = (
        bool(expected_admin_password)
        and isinstance(admin_password_attempt, str)
        and hmac.compare_digest(
            admin_password_attempt.encode("utf-8"),
            expected_admin_password.encode("utf-8"),
        )
    )
    if not password_ok:
        return "अमान्य प्रशासक पासवर्ड।"
    if not hf_token or not target_hf_repo_id:
        return "HF कन्फिगरेसन गोप्यमा हराइरहेको छ।"
    if not os.path.exists(LOCAL_RATINGS_FILE):
        return "कुनै स्थानीय मूल्याङ्कन फाइल भेटिएन।"

    with open(LOCAL_RATINGS_FILE, 'r') as f:
        local_ratings = json.load(f)

    if not local_ratings:
        return "सिंक गर्न कुनै स्थानीय मूल्याङ्कन छैन।"

    api = HfApi(token=hf_token)

    success_count = 0
    failure_count = 0
    messages = []

    for rec_id, ratings_data in local_ratings.items():
        # Entries whose counters are all zero carry no votes yet; skip them.
        if not any(ratings_data.values()):
            continue

        hf_ratings_path = f"ratings_entries/{rec_id}.json"
        try:
            api.upload_file(
                path_or_fileobj=io.BytesIO(json.dumps(ratings_data, ensure_ascii=False, indent=2).encode('utf-8')),
                path_in_repo=hf_ratings_path,
                repo_id=target_hf_repo_id,
                repo_type="dataset",
                commit_message=f"chore: Sync ratings for recording {rec_id}",
            )
            success_count += 1
            log_hf_upload_attempt(rec_id, target_hf_repo_id, "ratings_sync", "success")
            messages.append(f"SUCCESS: Synced ratings for {rec_id}")
        except Exception as e:
            failure_count += 1
            err_msg = f"FAILURE syncing ratings for {rec_id}: {str(e)}"
            messages.append(err_msg)
            log_hf_upload_attempt(rec_id, target_hf_repo_id, "ratings_sync", "failure", str(e))

    return f"मूल्याङ्कन सिंक सम्पन्न। सफलता: {success_count}, असफलता: {failure_count}\n" + "\n".join(messages)
 
 
 
 
 
 
432
 
 
 
 
433
 
434
  # --- Gradio UI Build ---
435
def build_ui():
    """Build and return the Gradio Blocks app.

    The app has four tabs: recording (free text and prompted text), review and
    voting on local recordings, admin sync/progress, and an information page.
    Components are created in display order; event handlers are wired right
    after the components they use.
    """
    ethnicity_choices = list(COMMON_LAST_NAMES.keys())
    default_ethnicity = ethnicity_choices[0]

    with gr.Blocks(title="नेपाली ASR डाटा संकलन") as app:
        gr.Markdown("# नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)")

        with gr.Tabs():
            # ---- Tab 1: record a voice sample ----
            with gr.TabItem("१. आवाज रेकर्ड गर्नुहोस् (Record Voice)"):
                with gr.Tabs():
                    with gr.TabItem("स्वतन्त्र पाठ (Free Text)"):
                        free_audio_input = gr.Audio(label="आवाज रेकर्ड गर्नुहोस्", type="filepath", sources=["microphone"])
                        free_text_input = gr.Textbox(label="पाठ", placeholder="यहाँ लेख्नुहोस्...", lines=3)
                        free_gender_dd = gr.Dropdown(label="लिङ्ग", choices=GENDERS, value=GENDERS[0])
                        free_age_dd = gr.Dropdown(label="उमेर समूह", choices=AGE_GROUPS, value=AGE_GROUPS[1])
                        free_ethnicity_dd = gr.Dropdown(label="जातीयता", choices=ethnicity_choices, value=default_ethnicity)
                        free_lastname_dd = gr.Dropdown(label="थर", choices=COMMON_LAST_NAMES[default_ethnicity])
                        # Last-name choices follow the selected ethnicity.
                        free_ethnicity_dd.change(fn=get_ethnicity_based_last_names, inputs=free_ethnicity_dd, outputs=free_lastname_dd)
                        free_region_dd = gr.Dropdown(label="क्षेत्र", choices=REGIONS, value=REGIONS[2])
                        free_emotion_dd = gr.Dropdown(label="भावना", choices=EMOTIONS, value=EMOTIONS[0])

                        free_submit_btn = gr.Button("सुरक्षित र अपलोड गर्नुहोस्")
                        free_status_output = gr.Textbox(label="स्थिति", interactive=False, lines=3)
                        # The hidden textbox passes the constant recording type to the handler.
                        free_submit_btn.click(
                            save_and_direct_upload_recording,
                            inputs=[free_audio_input, free_text_input, free_gender_dd, free_age_dd, free_ethnicity_dd, free_lastname_dd, free_region_dd, free_emotion_dd, gr.Textbox(value="free_text", visible=False)],
                            outputs=[free_status_output, free_audio_input],
                        )

                    with gr.TabItem("निर्देशित पाठ (Prompted Text)"):
                        prompt_text_display = gr.Textbox(label="कृपया यो पाठ पढ्नुहोस्", value=get_random_prompt(), lines=3, interactive=False)
                        new_prompt_btn = gr.Button("नयाँ पाठ")
                        prompt_audio_input = gr.Audio(label="आवाज रेकर्ड गर्नुहोस्", type="filepath", sources=["microphone"])
                        prompt_gender_dd = gr.Dropdown(label="लिङ्ग", choices=GENDERS, value=GENDERS[0])
                        prompt_age_dd = gr.Dropdown(label="उमेर समूह", choices=AGE_GROUPS, value=AGE_GROUPS[1])
                        prompt_ethnicity_dd = gr.Dropdown(label="जातीयता", choices=ethnicity_choices, value=default_ethnicity)
                        prompt_lastname_dd = gr.Dropdown(label="थर", choices=COMMON_LAST_NAMES[default_ethnicity])
                        prompt_ethnicity_dd.change(fn=get_ethnicity_based_last_names, inputs=prompt_ethnicity_dd, outputs=prompt_lastname_dd)
                        prompt_region_dd = gr.Dropdown(label="क्षेत्र", choices=REGIONS, value=REGIONS[2])
                        prompt_emotion_dd = gr.Dropdown(label="भावना", choices=EMOTIONS, value=EMOTIONS[0])

                        new_prompt_btn.click(get_random_prompt, outputs=prompt_text_display)
                        # Same label as the free-text tab's submit button.
                        prompt_submit_btn = gr.Button("सुरक्षित र अपलोड गर्नुहोस्")
                        prompt_status_output = gr.Textbox(label="स्थिति", interactive=False, lines=3)
                        prompt_submit_btn.click(
                            save_and_direct_upload_recording,
                            inputs=[prompt_audio_input, prompt_text_display, prompt_gender_dd, prompt_age_dd, prompt_ethnicity_dd, prompt_lastname_dd, prompt_region_dd, prompt_emotion_dd, gr.Textbox(value="prompted_text", visible=False)],
                            outputs=[prompt_status_output, prompt_audio_input],
                        )

            # ---- Tab 2: review and vote on local recordings ----
            with gr.TabItem("२. रेकर्डिङ समीक्षा गर्नुहोस् (Review Local Recordings)"):
                gr.Markdown("स्थानीय रूपमा सुरक्षित गरिएका हालसालैका रेकर्डिङहरू हेर्नुहोस् र मत दिनुहोस्। (View and vote on recent locally saved recordings.)")
                num_review_items = gr.Number(value=10, label="देखाउने वस्तुहरूको संख्या", minimum=1, maximum=50, step=1)
                refresh_review_list_btn = gr.Button("स्थानीय सूची ताजा गर्नुहोस्")
                review_list_df = gr.DataFrame(
                    headers=['id', 'text', 'ethnicity', 'region', 'timestamp', 'hf_audio_uploaded', 'hf_metadata_uploaded'],
                    label="हालका स्थानीय रेकर्डिङहरू", interactive=False,
                    datatype=['str', 'str', 'str', 'str', 'str', 'bool', 'bool'],
                )
                selected_review_id = gr.Textbox(label="चयन गरिएको आईडी", interactive=False)
                selected_review_text = gr.Textbox(label="रेकर्डिङ पाठ", interactive=False, lines=2)
                review_audio_player = gr.Audio(label="रेकर्डिङ सुन्नुहोस्", type="filepath")
                current_ratings_display = gr.Textbox(label="वर्तमान स्थानीय मूल्याङ्कन", interactive=False, lines=3)

                def select_for_review(evt: gr.SelectData, df_data: pd.DataFrame):
                    """Load id, text, audio and ratings for the row clicked in the table."""
                    if evt.index is None or df_data is None or len(df_data) == 0 or evt.index[0] >= len(df_data):
                        return "", "", None, "कुनै रेकर्डिङ चयन गरिएको छैन"
                    selected_id_val = df_data.iloc[evt.index[0]]['id']
                    audio_p, text_val = get_local_recording_audio(selected_id_val)
                    ratings_text_val = get_local_recording_ratings(selected_id_val)
                    return selected_id_val, text_val, audio_p, ratings_text_val

                review_list_df.select(select_for_review, inputs=[review_list_df], outputs=[selected_review_id, selected_review_text, review_audio_player, current_ratings_display])
                refresh_review_list_btn.click(list_local_recordings, inputs=[num_review_items], outputs=review_list_df)

                gr.Markdown("### स्थानीय मतदान गर्नुहोस् (Cast Your Local Vote)")
                upvote_btn = gr.Button("👍 मन पर्यो")
                downvote_btn = gr.Button("👎 मन परेन")
                quality_rating_slider = gr.Slider(minimum=1, maximum=5, step=1, label="गुणस्तर मूल्याङ्कन", value=3)
                submit_quality_btn = gr.Button("गुणस्तर मत दिनुहोस्")
                correctness_rating_slider = gr.Slider(minimum=1, maximum=5, step=1, label="शुद्धता मूल्याङ्कन", value=3)
                submit_correctness_btn = gr.Button("शुद्धता मत दिनुहोस्")
                vote_status_output = gr.Textbox(label="मतदान स्थिति", interactive=False)

                def vote_and_refresh_local(rec_id, vote_t, vote_val_str):
                    """Register a vote, then return the status and refreshed ratings text."""
                    status = vote_recording(rec_id, vote_t, str(vote_val_str))
                    new_ratings = get_local_recording_ratings(rec_id) if rec_id else "रेकर्डिङ चयन गर्नुहोस्"
                    return status, new_ratings

                # Hidden components carry the constant vote type (and a dummy value for up/down votes).
                upvote_btn.click(vote_and_refresh_local, inputs=[selected_review_id, gr.Textbox(value="upvote", visible=False), gr.Number(value=0, visible=False)], outputs=[vote_status_output, current_ratings_display])
                downvote_btn.click(vote_and_refresh_local, inputs=[selected_review_id, gr.Textbox(value="downvote", visible=False), gr.Number(value=0, visible=False)], outputs=[vote_status_output, current_ratings_display])
                submit_quality_btn.click(vote_and_refresh_local, inputs=[selected_review_id, gr.Textbox(value="quality", visible=False), quality_rating_slider], outputs=[vote_status_output, current_ratings_display])
                submit_correctness_btn.click(vote_and_refresh_local, inputs=[selected_review_id, gr.Textbox(value="correctness", visible=False), correctness_rating_slider], outputs=[vote_status_output, current_ratings_display])

            # ---- Tab 3: admin sync and progress ----
            with gr.TabItem("३. प्रशासक: सिंक र प्रगति (Admin: Sync & Progress)"):
                gr.Markdown("## स्थानीय संकलन प्रगति (Local Collection Progress)")
                total_count_display = gr.Textbox(label="स्थानीय तथ्याङ्क", interactive=False, lines=3)
                refresh_total_count_btn = gr.Button("स्थानीय गणना ताजा गर्नुहोस्")
                refresh_total_count_btn.click(update_local_count, outputs=total_count_display)
                gr.Markdown("---")
                gr.Markdown("## प्रशासक कार्यहरू (Admin Actions)")
                gr.Markdown("यी कार्यहरूका लागि प्रशासक पासवर्ड आवश्यक छ (गोप्यमा `ADMIN_UPLOAD_PASSWORD` को रूपमा सेट गरिएको)। "
                            "HF टोकन र लक्ष्य रिपो आईडी पनि गोप्यमा (`HF_TOKEN`, `TARGET_HF_DATASET_REPO_ID`) हुनुपर्छ।")
                admin_password_input_sync = gr.Textbox(label="प्रशासक पासवर्ड", type="password", placeholder="प्रशासक पासवर्ड प्रविष्ट गर्नुहोस्")

                with gr.Row():
                    retry_failed_uploads_btn = gr.Button("असफल HF अपलोडहरू पुनः प्रयास गर्नुहोस्")
                    sync_ratings_btn = gr.Button("स्थानीय मूल्याङ्कनहरू HF मा सिंक गर्नुहोस्")
                admin_action_status_output = gr.Textbox(label="प्रशासक कार्य स्थिति", interactive=False, lines=10)

                retry_failed_uploads_btn.click(admin_retry_failed_uploads, inputs=[admin_password_input_sync], outputs=admin_action_status_output)
                sync_ratings_btn.click(admin_sync_ratings_to_hf, inputs=[admin_password_input_sync], outputs=admin_action_status_output)

            # ---- Tab 4: information ----
            with gr.TabItem("४. जानकारी (Information)"):
                gr.Markdown(render_info_page())

        # Populate the counters and the review table when the page loads.
        app.load(fn=update_local_count, inputs=None, outputs=total_count_display)
        app.load(fn=lambda n: list_local_recordings(n), inputs=[num_review_items], outputs=review_list_df)
    return app
559
 
560
def render_info_page():
    """Return the Markdown text shown on the information tab."""
    return """
## नेपाली ASR डाटा संकलन प्रोजेक्टको बारेमा (About the Nepali ASR Data Collection Project)
यो प्रोजेक्टले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवश्यक डाटा संकलन गर्दछ।
तपाईंको प्रत्येक रेकर्डिङ सिधै हाम्रो खुला हगिङ फेस डेटासेटमा योगदान गरिनेछ।
### कसरी योगदान दिने (How to Contribute):
1. **आवाज रेकर्ड गर्नुहोस् (Record Voice)** ट्याबमा जानुहोस् र आफ्नो आवाज रेकर्ड गर्नुहोस्।
2. **रेकर्डिङ समीक्षा गर्नुहोस् (Review Local Recordings)** ट्याबमा गएर अरूले गरेका (वा तपाईंले गरेका) स्थानीय रूपमा सुरक्षित गरिएका रेकर्डिङहरू सुन्नुहोस् र मत दिनुहोस्। यी मतहरू पछि हगिङ फेसमा सिंक गर्न सकिन्छ।
### गोपनीयता नीति (Privacy Policy):
- तपाईंको आवाज रेकर्डिङ र मेटाडाटा सार्वजनिक अनुसन्धान उद्देश्यका लागि हगिङ फेसमा खुला रूपमा उपलब्ध हुनेछ।
- कृपया व्यक्तिगत पहिचान गर्न सकिने जानकारी शेयर नगर्नुहोस्।
"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
 
574
  # --- Main Execution ---
575
if __name__ == "__main__":
    # Make sure the local storage layout exists, then build and serve the UI.
    initialize_data_storage()
    build_ui().launch()