Spaces:

darvilab
/

Nepali-ASR-Open-Data-Collection

Sleeping

App Files Files Community

ashokpoudel committed on May 16, 2025

Commit

0ff5b4f

verified ·

1 Parent(s): b0aa85d

Update app.py

Browse files

Files changed (1) hide show

app.py +414 -418

app.py CHANGED Viewed

@@ -6,8 +6,10 @@ import random
 import datetime
 import uuid
 import json
-from huggingface_hub import HfApi
 from datasets import Dataset
 # Configuration
 SAMPLE_PROMPTS = [
@@ -38,7 +40,6 @@ REGIONS = [
     "सुदूरपश्चिम प्रदेश (Sudurpashchim Province)"
 ]
-# Common last names by ethnicity/region for better accent tracking
 COMMON_LAST_NAMES = {
     "पहाडी (Pahadi)": ["शर्मा (Sharma)", "पौडेल (Poudel)", "खनाल (Khanal)", "अधिकारी (Adhikari)", "भट्टराई (Bhattarai)", "अन्य पहाडी (Other Pahadi)"],
     "नेवार (Newar)": ["श्रेष्ठ (Shrestha)", "प्रधान (Pradhan)", "महर्जन (Maharjan)", "बज्राचार्य (Bajracharya)", "अन्य नेवार (Other Newar)"],
@@ -53,52 +54,63 @@ COMMON_LAST_NAMES = {
     "अन्य (Other)": ["अन्य (Other)"]
 }
-# Create directory for saved recordings
-os.makedirs("recordings", exist_ok=True)
-os.makedirs("metadata", exist_ok=True)
-os.makedirs("ratings", exist_ok=True)
-# Initialize metadata file if it doesn't exist
-metadata_file = "metadata/metadata.csv"
-ratings_file = "ratings/ratings.json"
-if not os.path.exists(metadata_file):
-    pd.DataFrame(columns=[
-        "id", "text", "audio_path", "gender", "age_group", "ethnicity",
-        "last_name", "region", "emotion", "timestamp", "recording_type"
-    ]).to_csv(metadata_file, index=False)
-if not os.path.exists(ratings_file):
-    with open(ratings_file, 'w') as f:
-        json.dump({}, f)
 def save_recording(audio, text, gender, age_group, ethnicity, last_name, region, emotion, recording_type):
     """Save the recording and metadata"""
-    # Generate unique ID for this recording
-    recording_id = str(uuid.uuid4())
-    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    # Check if audio was recorded
     if audio is None:
         return "कृपया पहिले रेकर्डिङ गर्नुहोस्। (Please record audio first)", None
-    # Save audio file
-    audio_filename = f"recordings/{recording_id}.wav"
-    if isinstance(audio, tuple):  # If it's a tuple (sr, data)
-        sr, data = audio
-        import soundfile as sf
-        sf.write(audio_filename, data, sr)
-    else:  # If it's a path
-        # Copy the file
-        import shutil
-        shutil.copy(audio, audio_filename)
-    # Update metadata
-    metadata = pd.read_csv(metadata_file)
     new_row = pd.DataFrame([{
         "id": recording_id,
         "text": text,
-        "audio_path": audio_filename,
         "gender": gender,
         "age_group": age_group,
         "ethnicity": ethnicity,
@@ -108,438 +120,422 @@ def save_recording(audio, text, gender, age_group, ethnicity, last_name, region,
         "timestamp": timestamp,
         "recording_type": recording_type
     }])
-    updated_metadata = pd.concat([metadata, new_row], ignore_index=True)
-    updated_metadata.to_csv(metadata_file, index=False)
-    # Initialize rating for this recording in the ratings file
-    with open(ratings_file, 'r') as f:
         ratings = json.load(f)
-    ratings[recording_id] = {
-        "upvotes": 0,
-        "downvotes": 0,
-        "quality_score": 0,  # Average quality rating (1-5)
-        "quality_votes": 0,  # Number of quality ratings
-        "correctness_score": 0,  # Average correctness rating (1-5)
-        "correctness_votes": 0  # Number of correctness ratings
-    }
-    with open(ratings_file, 'w') as f:
         json.dump(ratings, f, indent=2)
-    return f"रेकर्डिङ सफलतापूर्वक सुरक्षित गरियो! (Recording saved successfully!)", audio_filename
 def get_random_prompt():
-    """Return a random prompt from the list"""
     return random.choice(SAMPLE_PROMPTS)
-def vote_recording(recording_id, vote_type, vote_value):
-    """Add a vote for a recording"""
-    if not os.path.exists(ratings_file):
         return "रेटिङ फाइल भेटिएन। (Rating file not found.)"
     try:
-        with open(ratings_file, 'r') as f:
             ratings = json.load(f)
-        if recording_id not in ratings:
-            return "रेकर्डिङ आईडी भेटिएन। (Recording ID not found.)"
-        if vote_type == "upvote":
-            ratings[recording_id]["upvotes"] += 1
-        elif vote_type == "downvote":
-            ratings[recording_id]["downvotes"] += 1
-        elif vote_type == "quality":
-            # Update quality score (running average)
-            current_score = ratings[recording_id]["quality_score"]
-            current_votes = ratings[recording_id]["quality_votes"]
-            new_votes = current_votes + 1
-            new_score = ((current_score * current_votes) + vote_value) / new_votes
-            ratings[recording_id]["quality_score"] = new_score
-            ratings[recording_id]["quality_votes"] = new_votes
-        elif vote_type == "correctness":
-            # Update correctness score (running average)
-            current_score = ratings[recording_id]["correctness_score"]
-            current_votes = ratings[recording_id]["correctness_votes"]
-            new_votes = current_votes + 1
-            new_score = ((current_score * current_votes) + vote_value) / new_votes
-            ratings[recording_id]["correctness_score"] = new_score
-            ratings[recording_id]["correctness_votes"] = new_votes
-        with open(ratings_file, 'w') as f:
             json.dump(ratings, f, indent=2)
-        return f"मतदान सफलतापूर्वक दर्ता गरियो! (Vote registered successfully!)"
     except Exception as e:
-        return f"त्रुटि: {str(e)}"
-def get_ethnicity_based_last_names(ethnicity):
-    """Return last name options based on selected ethnicity"""
-    if ethnicity in COMMON_LAST_NAMES:
-        return COMMON_LAST_NAMES[ethnicity]
-    return COMMON_LAST_NAMES["अन्य (Other)"]
-def upload_to_huggingface(hf_token, dataset_name):
     """Upload the collected data to Hugging Face"""
-    if not os.path.exists(metadata_file):
-        return "कुनै डाटा भेटिएन। (No data found.)"
     try:
-        # Read metadata
-        metadata = pd.read_csv(metadata_file)
         if len(metadata) == 0:
-            return "कुनै डाटा भेटिएन। (No data found.)"
-        # Read ratings
-        with open(ratings_file, 'r') as f:
-            ratings = json.load(f)
-        # Add ratings to metadata
-        metadata["upvotes"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("upvotes", 0))
-        metadata["downvotes"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("downvotes", 0))
-        metadata["quality_score"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("quality_score", 0))
-        metadata["correctness_score"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("correctness_score", 0))
-        # Create a dataset dict
-        dataset_dict = {
-            "id": metadata["id"].tolist(),
-            "text": metadata["text"].tolist(),
-            "gender": metadata["gender"].tolist(),
-            "age_group": metadata["age_group"].tolist(),
-            "ethnicity": metadata["ethnicity"].tolist(),
-            "last_name": metadata["last_name"].tolist(),
-            "region": metadata["region"].tolist(),
-            "emotion": metadata["emotion"].tolist(),
-            "recording_type": metadata["recording_type"].tolist(),
-            "timestamp": metadata["timestamp"].tolist(),
-            "upvotes": metadata["upvotes"].tolist(),
-            "downvotes": metadata["downvotes"].tolist(),
-            "quality_score": metadata["quality_score"].tolist(),
-            "correctness_score": metadata["correctness_score"].tolist(),
-        }
-        # Create a Dataset object
-        dataset = Dataset.from_dict(dataset_dict)
-        # Push to hub
-        api = HfApi(token=hf_token)
-        dataset.push_to_hub(dataset_name)
-        # Upload audio files
         for _, row in metadata.iterrows():
-            audio_path = row["audio_path"]
-            if os.path.exists(audio_path):
                 api.upload_file(
-                    path_or_fileobj=audio_path,
-                    path_in_repo=f"audio/{os.path.basename(audio_path)}",
                     repo_id=dataset_name,
                     repo_type="dataset"
                 )
-        return f"डाटा हगिङफेसमा सफलतापूर्वक अपलोड गरियो! (Data successfully uploaded to Hugging Face at {dataset_name})"
     except Exception as e:
-        return f"त्रुटि: {str(e)}"
 def update_count():
-    """Update the count of recordings"""
-    if os.path.exists(metadata_file):
-        metadata = pd.read_csv(metadata_file)
-        return f"हालसम्म {len(metadata)} रेकर्डिङहरू संकलन गरिएको छ। (Total recordings collected: {len(metadata)})"
     return "कुनै रेकर्डिङ भेटिएन। (No recordings found.)"
 def list_recordings(num_items=10):
-    """List recent recordings for review"""
-    if not os.path.exists(metadata_file):
-        return pd.DataFrame()
-    metadata = pd.read_csv(metadata_file)
     if len(metadata) == 0:
-        return pd.DataFrame()
-    # Sort by timestamp (newest first) and get the most recent entries
-    metadata['timestamp'] = pd.to_datetime(metadata['timestamp'])
-    sorted_metadata = metadata.sort_values('timestamp', ascending=False).head(num_items)
-    # Reset the index for display purposes
     display_df = sorted_metadata[['id', 'text', 'ethnicity', 'region', 'timestamp']].copy()
-    display_df['timestamp'] = display_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M')
-    display_df = display_df.reset_index(drop=True)
-    return display_df
 def get_recording_audio(recording_id):
-    """Get audio file path for a specific recording"""
-    if not os.path.exists(metadata_file):
-        return None, "रेकर्डिङ भेटिएन। (Recording not found.)"
-    metadata = pd.read_csv(metadata_file)
     recording = metadata[metadata['id'] == recording_id]
-    if len(recording) == 0:
-        return None, "रेकर्डिङ भेटिएन। (Recording not found.)"
     audio_path = recording['audio_path'].iloc[0]
     text = recording['text'].iloc[0]
-    if not os.path.exists(audio_path):
-        return None, "अडियो फाइल भेटिएन। (Audio file not found.)"
     return audio_path, text
 def get_recording_ratings(recording_id):
-    """Get current ratings for a recording"""
-    if not os.path.exists(ratings_file):
-        return "डाटा भेटिएन। (No data found.)"
-    with open(ratings_file, 'r') as f:
         ratings = json.load(f)
-    if recording_id not in ratings:
-        return "रेकर्डिङ आईडी भेटिएन। (Recording ID not found.)"
     r = ratings[recording_id]
-    # Format the ratings for display
-    upvotes = r["upvotes"]
-    downvotes = r["downvotes"]
-    quality = round(r["quality_score"], 1) if r["quality_votes"] > 0 else 0
-    quality_votes = r["quality_votes"]
-    correctness = round(r["correctness_score"], 1) if r["correctness_votes"] > 0 else 0
-    correctness_votes = r["correctness_votes"]
     return f"""👍 Upvotes: {upvotes} | 👎 Downvotes: {downvotes}
 गुणस्तर (Quality): {quality}/5 ({quality_votes} मत/votes)
 शुद्धता (Correctness): {correctness}/5 ({correctness_votes} मत/votes)"""
 def build_ui():
-    """Build the Gradio interface"""
     with gr.Blocks(title="नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)") as app:
-        gr.Markdown("""
-        # नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)
-        यस प्लेटफर्मले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवाज डाटा संकलन गर्दछ।
-        कृपया आफ्नो आवाज रेकर्ड गरेर योगदान दिनुहोस्।
-        *This platform collects voice data for the development of Nepali Automatic Speech Recognition (ASR) technology.
-        Please contribute by recording your voice.*
-        """)
-        with gr.Tab("स्वतन्त्र पाठ (Free Text)"):
-            with gr.Row():
-                with gr.Column():
-                    free_text = gr.Textbox(
-                        label="तपाईंले बोल्न चाहनुभएको पाठ यहाँ लेख्नुहोस् (Type the text you want to speak here)",
-                        placeholder="यहाँ लेख्नुहोस्...",
-                        lines=3
-                    )
-                    free_audio = gr.Audio(
-                        label="आफ्नो आवाज रेकर्ड गर्नुहोस् (Record your voice)",
-                        type="filepath",
-                        source="microphone"
-                    )
-                with gr.Column():
-                    # First row of metadata
-                    with gr.Row():
-                        free_gender = gr.Dropdown(
-                            label="लिङ्ग (Gender)",
-                            choices=GENDERS,
-                            value=GENDERS[0]
-                        )
-                        free_age = gr.Dropdown(
-                            label="उमेर समूह (Age Group)",
-                            choices=AGE_GROUPS,
-                            value=AGE_GROUPS[1]
-                        )
-                    # Second row of metadata
-                    with gr.Row():
-                        free_ethnicity = gr.Dropdown(
-                            label="जातीयता (Ethnicity)",
-                            choices=list(COMMON_LAST_NAMES.keys()),
-                            value=list(COMMON_LAST_NAMES.keys())[0]
-                        )
-                        free_last_name = gr.Dropdown(
-                            label="थर (Last Name)",
-                            choices=COMMON_LAST_NAMES[list(COMMON_LAST_NAMES.keys())[0]]
-                        )
-                    # Update last name options when ethnicity changes
-                    free_ethnicity.change(
-                        fn=get_ethnicity_based_last_names,
-                        inputs=free_ethnicity,
-                        outputs=free_last_name
-                    )
-                    # Third row of metadata
-                    with gr.Row():
-                        free_region = gr.Dropdown(
-                            label="क्षेत्र (Region)",
-                            choices=REGIONS,
-                            value=REGIONS[2]  # Default to Bagmati Province
                         )
-                        free_emotion = gr.Dropdown(
-                            label="भावना (Emotion)",
-                            choices=EMOTIONS,
-                            value=EMOTIONS[0]
                         )
-            free_submit = gr.Button("सुरक्षित गर्नुहोस् (Save)")
-            free_output = gr.Textbox(label="स्थिति (Status)")
-            free_submit.click(
-                fn=save_recording,
-                inputs=[
-                    free_audio, free_text, free_gender, free_age,
-                    free_ethnicity, free_last_name, free_region, free_emotion,
-                    gr.Textbox(value="free_text", visible=False)
-                ],
-                outputs=[free_output, free_audio]
-            )="filepath",
-                        source="microphone"
-                    )
-                with gr.Column():
-                    free_gender = gr.Dropdown(
-                        label="लिङ्ग (Gender)",
-                        choices=GENDERS,
-                        value=GENDERS[0]
-                    )
-                    free_age = gr.Dropdown(
-                        label="उमेर समूह (Age Group)",
-                        choices=AGE_GROUPS,
-                        value=AGE_GROUPS[1]
-                    )
-                    free_emotion = gr.Dropdown(
-                        label="भावना (Emotion)",
-                        choices=EMOTIONS,
-                        value=EMOTIONS[0]
-                    )
-            free_submit = gr.Button("सुरक्षित गर्नुहोस् (Save)")
-            free_output = gr.Textbox(label="स्थिति (Status)")
-            free_submit.click(
-                fn=save_recording,
-                inputs=[free_audio, free_text, free_gender, free_age, free_emotion, gr.Textbox(value="free_text", visible=False)],
-                outputs=[free_output, free_audio]
-            )
-        with gr.Tab("निर्देशित पाठ (Prompted Text)"):
-            with gr.Row():
-                with gr.Column():
-                    prompt_text = gr.Textbox(
-                        label="कृपया यो पाठ पढ्नुहोस् (Please read this text)",
-                        value=get_random_prompt(),
-                        lines=3
-                    )
-                    prompt_audio = gr.Audio(
-                        label="आफ्नो आवाज रेकर्ड गर्नुहोस् (Record your voice)",
-                        type="filepath",
-                        source="microphone"
-                    )
-                    new_prompt = gr.Button("नयाँ पाठ (New Text)")
-                with gr.Column():
-                    prompt_gender = gr.Dropdown(
-                        label="लिङ्ग (Gender)",
-                        choices=GENDERS,
-                        value=GENDERS[0]
-                    )
-                    prompt_age = gr.Dropdown(
-                        label="उमेर समूह (Age Group)",
-                        choices=AGE_GROUPS,
-                        value=AGE_GROUPS[1]
-                    )
-                    prompt_emotion = gr.Dropdown(
-                        label="भावना (Emotion)",
-                        choices=EMOTIONS,
-                        value=EMOTIONS[0]
-                    )
-            prompt_submit = gr.Button("सुरक्षित गर्नुहोस् (Save)")
-            prompt_output = gr.Textbox(label="स्थिति (Status)")
-            new_prompt.click(fn=get_random_prompt, inputs=None, outputs=prompt_text)
-            prompt_submit.click(
-                fn=save_recording,
-                inputs=[prompt_audio, prompt_text, prompt_gender, prompt_age, prompt_emotion, gr.Textbox(value="prompted_text", visible=False)],
-                outputs=[prompt_output, prompt_audio]
-            )
-        with gr.Tab("प्रगति (Progress)"):
-            count_display = gr.Textbox(label="संकलित रेकर्डिङ गणना (Recording Count)")
-            refresh_button = gr.Button("ताजा गर्नुहोस् (Refresh)")
-            refresh_button.click(fn=update_count, inputs=None, outputs=count_display)
-            # HuggingFace upload section (admin only)
-            gr.Markdown("## हगिङफेसमा अपलोड गर्नुहोस् (Upload to Hugging Face)")
-            with gr.Row():
-                hf_token = gr.Textbox(label="Hugging Face API Token", type="password")
-                dataset_name = gr.Textbox(
-                    label="Dataset Name",
-                    placeholder="username/nepali-asr-dataset"
                 )
-            upload_button = gr.Button("अपलोड गर्नुहोस् (Upload)")
-            upload_status = gr.Textbox(label="अपलोड स्थिति (Upload Status)")
-            upload_button.click(
-                fn=upload_to_huggingface,
-                inputs=[hf_token, dataset_name],
-                outputs=upload_status
-            )
-        with gr.Tab("जानकारी (Information)"):
-            gr.Markdown("""
-            ## नेपाली ASR डाटा संकलन प्रोजेक्टको बारेमा
-            यो प्रोजेक्टले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवश्यक डाटा संकलन गर्दछ।
-            तपाईंको योगदानले नेपाली भाषा प्रविधिको विकासमा ठूलो मद्दत पुर्‍याउनेछ।
-            ### कसरी योगदान दिने:
-            1. **स्वतन्त्र पाठ (Free Text)** ट्याबमा, तपाईं आफ्नो इच्छा अनुसार पाठ लेखेर त्यसलाई बोल्न सक्नुहुन्छ।
-            2. **निर्देशित पाठ (Prompted Text)** ट्याबमा, तपाईंले दिइएको पाठलाई पढेर रेकर्ड गर्न सक्नुहुन्छ।
-            3. रेकर्डिङ पछि, "सुरक्षित गर्नुहोस्" बटनमा क्लिक गर्नुहोस्।
-            ### गोपनीयता नीति:
-            - तपाईंको आवाज रेकर्डिङ र मेटाडाटा सार्वजनिक अनुसन्धान उद्देश्यका लागि प्रयोग गरिनेछ।
-            - कृपया व्यक्तिगत पहिचान गर्न सकिने जानकारी शेयर नगर्नुहोस्।
-            - यो डाटासेट खुला स्रोत हुनेछ र हगिङफेसमा प्रकाशित गरिनेछ।
-            ---
-            ## About Nepali ASR Data Collection Project
-            This project collects necessary data for the development of Nepali Automatic Speech Recognition (ASR) technology.
-            Your contribution will greatly help in advancing Nepali language technology.
-            ### How to Contribute:
-            1. In the **Free Text** tab, you can type any text you want and record yourself speaking it.
-            2. In the **Prompted Text** tab, you can record yourself reading the provided text.
-            3. After recording, click the "Save" button.
-            ### Privacy Policy:
-            - Your voice recordings and metadata will be used for public research purposes.
-            - Please do not share personally identifiable information.
-            - This dataset will be open-source and published on Hugging Face.
-            """)
-        # Initialize the count
-        app.load(fn=update_count, inputs=None, outputs=count_display)
     return app
-# Launch the app
 if __name__ == "__main__":
-    app = build_ui()
-    app.launch()

 import datetime
 import uuid
 import json
+from huggingface_hub import HfApi, create_repo
 from datasets import Dataset
+import soundfile as sf # Added for explicit use in save_recording
+import shutil # Added for explicit use in save_recording
 # Configuration
 SAMPLE_PROMPTS = [
     "सुदूरपश्चिम प्रदेश (Sudurpashchim Province)"
 ]
 COMMON_LAST_NAMES = {
     "पहाडी (Pahadi)": ["शर्मा (Sharma)", "पौडेल (Poudel)", "खनाल (Khanal)", "अधिकारी (Adhikari)", "भट्टराई (Bhattarai)", "अन्य पहाडी (Other Pahadi)"],
     "नेवार (Newar)": ["श्रेष्ठ (Shrestha)", "प्रधान (Pradhan)", "महर्जन (Maharjan)", "बज्राचार्य (Bajracharya)", "अन्य नेवार (Other Newar)"],
     "अन्य (Other)": ["अन्य (Other)"]
 }
# --- Directory and File Paths ---
# These paths are relative to where app.py is run.
# In a Hugging Face Space, this means they are within the Space's file system.
RECORDINGS_DIR = "recordings"
METADATA_DIR = "metadata"
RATINGS_DIR = "ratings"
METADATA_FILE = os.path.join(METADATA_DIR, "metadata.csv")
RATINGS_FILE = os.path.join(RATINGS_DIR, "ratings.json")

# Column order of the metadata CSV; one row is appended per saved recording.
METADATA_COLUMNS = [
    "id", "text", "audio_path", "gender", "age_group", "ethnicity",
    "last_name", "region", "emotion", "timestamp", "recording_type",
]


# --- Initialization ---
def _is_missing_or_empty(path):
    """Return True when *path* does not exist or is a zero-byte file."""
    return not os.path.exists(path) or os.path.getsize(path) == 0


def initialize_data_storage():
    """Create the storage directories and seed the metadata/ratings files.

    The metadata CSV is (re)written with only its header row and the ratings
    file with an empty JSON object whenever the file is missing *or* empty.
    A zero-byte file would otherwise make every later ``pd.read_csv`` /
    ``json.load`` call fail, so it is treated the same as a missing file.
    Files that already hold data are left untouched.
    """
    for directory in (RECORDINGS_DIR, METADATA_DIR, RATINGS_DIR):
        os.makedirs(directory, exist_ok=True)

    if _is_missing_or_empty(METADATA_FILE):
        pd.DataFrame(columns=METADATA_COLUMNS).to_csv(METADATA_FILE, index=False)

    if _is_missing_or_empty(RATINGS_FILE):
        with open(RATINGS_FILE, "w", encoding="utf-8") as f:
            json.dump({}, f)


# Run once at import time so every handler can assume the files exist.
initialize_data_storage()
+# --- Core Functions ---
 def save_recording(audio, text, gender, age_group, ethnicity, last_name, region, emotion, recording_type):
     """Save the recording and metadata"""
     if audio is None:
         return "कृपया पहिले रेकर्डिङ गर्नुहोस्। (Please record audio first)", None
+    recording_id = str(uuid.uuid4())
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    audio_filename_relative = f"{recording_id}.wav"
+    audio_filepath_in_space = os.path.join(RECORDINGS_DIR, audio_filename_relative)
+    try:
+        if isinstance(audio, tuple):  # If it's a tuple (sr, data) from gr.Audio(type="numpy")
+            sr, data = audio
+            sf.write(audio_filepath_in_space, data, sr)
+        elif isinstance(audio, str) and os.path.exists(audio): # If it's a path from gr.Audio(type="filepath")
+            shutil.copy(audio, audio_filepath_in_space)
+            # Gradio might place temp files elsewhere, so we ensure it's in our recordings dir
+        else:
+            return "अडियो फाइल बचत गर्न सकिएन। (Could not save audio file. Invalid audio format.)", None
+    except Exception as e:
+        return f"अडियो फाइल बचत गर्दा त्रुटि भयो: {e} (Error saving audio file: {e})", None
+    metadata_df = pd.read_csv(METADATA_FILE)
     new_row = pd.DataFrame([{
         "id": recording_id,
         "text": text,
+        "audio_path": audio_filepath_in_space, # Store path relative to space root
         "gender": gender,
         "age_group": age_group,
         "ethnicity": ethnicity,
         "timestamp": timestamp,
         "recording_type": recording_type
     }])
+    updated_metadata = pd.concat([metadata_df, new_row], ignore_index=True)
+    updated_metadata.to_csv(METADATA_FILE, index=False)
+    with open(RATINGS_FILE, 'r+') as f:
         ratings = json.load(f)
+        ratings[recording_id] = {
+            "upvotes": 0, "downvotes": 0,
+            "quality_score": 0, "quality_votes": 0,
+            "correctness_score": 0, "correctness_votes": 0
+        }
+        f.seek(0)
         json.dump(ratings, f, indent=2)
+        f.truncate()
+    return f"रेकर्डिङ सफलतापूर्वक सुरक्षित गरियो! ID: {recording_id} (Recording saved successfully!)", None # Return None to clear audio input
 def get_random_prompt():
     return random.choice(SAMPLE_PROMPTS)
def get_ethnicity_based_last_names(ethnicity):
    """Build the dropdown update listing the last names for *ethnicity*.

    Args:
        ethnicity: Key into ``COMMON_LAST_NAMES``. Unknown keys fall back to
            the "अन्य (Other)" list.

    Returns:
        A Gradio update payload that replaces the dropdown's choices and
        clears its current selection.
    """
    last_names = COMMON_LAST_NAMES.get(ethnicity, COMMON_LAST_NAMES["अन्य (Other)"])
    # gr.update() is the version-independent way to patch a component;
    # the per-component Dropdown.update() classmethod is not available in
    # every Gradio release.
    # value=None drops a last name picked under the previous ethnicity so a
    # stale, no-longer-listed name cannot be saved with the new ethnicity.
    return gr.update(choices=last_names, value=None)
def vote_recording(recording_id, vote_type, vote_value_str):
    """Register one vote for a recording in the ratings JSON file.

    Args:
        recording_id: ID of the recording being rated; empty/None is rejected.
        vote_type: ``"upvote"``, ``"downvote"``, ``"quality"`` or
            ``"correctness"``.
        vote_value_str: Star rating for ``quality`` / ``correctness`` votes,
            as a number or numeric string (``4``, ``4.0``, ``"4"``); it must
            lie in the 1-5 range the ratings display uses. Ignored for
            upvotes and downvotes, so ``None`` is fine there.

    Returns:
        A bilingual status message. Errors are reported through the message,
        never raised.
    """
    if not recording_id:
        return "कृपया पहिले समीक्षा गर्न रेकर्डिङ चयन गर्नुहोस्। (Please select a recording to review first.)"
    if not os.path.exists(RATINGS_FILE):
        return "रेटिङ फाइल भेटिएन। (Rating file not found.)"

    vote_value = 0  # unused for upvote/downvote
    if vote_type in ("quality", "correctness"):
        try:
            # float() accepts "4.0" as well as "4"; TypeError covers None.
            vote_value = float(vote_value_str)
        except (TypeError, ValueError):
            return "अमान्य मत मान। (Invalid vote value.)"
        # Also rejects NaN, for which both comparisons are False.
        if not 1 <= vote_value <= 5:
            return "अमान्य मत मान। (Invalid vote value.)"
        if vote_value.is_integer():
            vote_value = int(vote_value)

    try:
        with open(RATINGS_FILE, "r+", encoding="utf-8") as f:
            ratings = json.load(f)
            if recording_id not in ratings:
                return "रेकर्डिङ आईडी भेटिएन। (Recording ID not found.)"
            rec_ratings = ratings[recording_id]

            if vote_type in ("upvote", "downvote"):
                rec_ratings[f"{vote_type}s"] += 1
            elif vote_type in ("quality", "correctness"):
                score_key = f"{vote_type}_score"
                votes_key = f"{vote_type}_votes"
                previous_votes = rec_ratings[votes_key]
                new_votes = previous_votes + 1
                # Running average: undo the old mean, add the vote, re-divide.
                rec_ratings[score_key] = (
                    rec_ratings[score_key] * previous_votes + vote_value
                ) / new_votes
                rec_ratings[votes_key] = new_votes
            else:
                return "अमान्य मतदान प्रकार। (Invalid vote type.)"

            # Rewrite in place; truncate in case the new payload is shorter.
            f.seek(0)
            json.dump(ratings, f, indent=2)
            f.truncate()
        return "मतदान सफलतापूर्वक दर्ता गरियो! (Vote registered successfully!)"
    except Exception as e:
        return f"मतदान दर्ता गर्दा त्रुटि: {str(e)} (Error registering vote: {str(e)})"
def upload_to_huggingface(dataset_name, admin_password_attempt):
    """Publish the collected metadata, ratings and audio files to a Hub dataset.

    The admin password is read from the ``ADMIN_UPLOAD_PASSWORD`` environment
    variable and the Hub token from ``HF_TOKEN``; nothing is uploaded unless
    the supplied password matches and the token is present.

    Args:
        dataset_name: Target dataset repo in ``"username/repo_name"`` form.
        admin_password_attempt: Password typed by the user.

    Returns:
        A bilingual status message. Failures (including exceptions raised
        during the upload) are reported through the message, never raised.
    """
    import hmac  # local import: only this function needs it
    import traceback

    # --- Admin Password Check ---
    expected_admin_password = os.environ.get("ADMIN_UPLOAD_PASSWORD")
    hf_token_from_secret = os.environ.get("HF_TOKEN")
    if not expected_admin_password:
        return "त्रुटि: प्रशासक पासवर्ड स्पेस गोप्यमा कन्फिगर गरिएको छैन। (Error: Admin password not configured in Space secrets.)"
    # Constant-time comparison so response timing does not leak how much of
    # the password matched. Compare bytes: compare_digest() rejects non-ASCII
    # str arguments.
    attempt_bytes = str(admin_password_attempt or "").encode("utf-8", "surrogateescape")
    expected_bytes = expected_admin_password.encode("utf-8", "surrogateescape")
    if not hmac.compare_digest(attempt_bytes, expected_bytes):
        return "त्रुटि: अपलोडका लागि अमान्य प्रशासक पासवर्ड। (Error: Invalid admin password for upload.)"
    if not hf_token_from_secret:
        return "त्रुटि: HF_TOKEN गोप्य स्पेस कन्फिगरेसनमा फेला परेन। अपलोड गर्न सकिँदैन। (Error: HF_TOKEN secret not found in Space configuration. Cannot upload.)"

    # Exactly two non-empty parts: "user/" and "/repo" are rejected too.
    name_parts = dataset_name.split("/") if dataset_name else []
    if len(name_parts) != 2 or not all(part.strip() for part in name_parts):
        return "त्रुटि: कृपया मान्य डेटासेट नाम 'username/repo_name' ढाँचामा प्रदान गर्नुहोस्। (Error: Please provide a valid dataset name in 'username/repo_name' format.)"
    if not os.path.exists(METADATA_FILE):
        return "कुनै मेटाडाटा फाइल भेटिएन। (No metadata file found.)"

    try:
        # Load and validate local data first so an empty collection does not
        # leave a freshly created, empty repo behind.
        metadata = pd.read_csv(METADATA_FILE)
        if len(metadata) == 0:
            return "कुनै डाटा भेटिएन। (No data to upload.)"
        with open(RATINGS_FILE, "r", encoding="utf-8") as f:
            ratings_data = json.load(f)

        api = HfApi(token=hf_token_from_secret)
        # Ensure the (public) dataset repo exists.
        create_repo(repo_id=dataset_name, token=hf_token_from_secret,
                    repo_type="dataset", exist_ok=True, private=False)

        # Attach each recording's rating counters; recordings without a
        # ratings entry get zeros.
        rating_fields = (
            "upvotes", "downvotes",
            "quality_score", "quality_votes",
            "correctness_score", "correctness_votes",
        )
        for field in rating_fields:
            metadata[field] = metadata["id"].apply(
                lambda rec_id, field=field: ratings_data.get(rec_id, {}).get(field, 0)
            )

        # Repo paths always use "/", independent of the host OS separator.
        repo_audio_paths = [
            f"audio/{os.path.basename(local_path)}" for local_path in metadata["audio_path"]
        ]

        # The published table references audio by its in-repo path instead of
        # the local "audio_path" column.
        dataset_dict = metadata.drop(columns=["audio_path"]).to_dict(orient="list")
        dataset_dict["audio"] = [{"path": repo_path} for repo_path in repo_audio_paths]

        hf_dataset = Dataset.from_dict(dataset_dict)
        # Pass the token explicitly rather than relying on ambient credentials.
        hf_dataset.push_to_hub(repo_id=dataset_name, token=hf_token_from_secret)

        # Upload the audio files. Uploading to "audio/<file>" creates the
        # folder implicitly, so no separate folder-creation step is needed.
        upload_count = 0
        for local_audio_file, target_path_in_repo in zip(metadata["audio_path"], repo_audio_paths):
            if not os.path.exists(local_audio_file):
                continue  # metadata row whose audio file is gone; skip it
            api.upload_file(
                path_or_fileobj=local_audio_file,
                path_in_repo=target_path_in_repo,
                repo_id=dataset_name,
                repo_type="dataset"
            )
            upload_count += 1

        return (f"डाटा हगिङफेसमा सफलतापूर्वक अपलोड गरियो! {upload_count} अडियो फाइलहरू अपलोड गरियो। "
                f"(Data successfully uploaded to Hugging Face at {dataset_name}. {upload_count} audio files uploaded.)")
    except Exception as e:
        # Only reachable after the admin password check, so the traceback is
        # shown to the admin rather than to anonymous visitors.
        tb_str = traceback.format_exc()
        return f"अपलोडको क्रममा त्रुटि (Error during upload):\n{str(e)}\n{tb_str}"
 def update_count():
     """Report how many recordings have been collected so far.

     Returns:
         A bilingual (Nepali/English) status string: the row count of the
         metadata CSV, a zero count when the CSV has no content, or a
         "no recordings" message when the metadata file does not exist.
     """
     if not os.path.exists(METADATA_FILE):
         return "कुनै रेकर्डिङ भेटिएन। (No recordings found.)"
     try:
         total = len(pd.read_csv(METADATA_FILE))
     except pd.errors.EmptyDataError:
         # The file exists but pandas found nothing to parse in it.
         return "हालसम्म ० रेकर्डिङहरू संकलन गरिएको छ। (Total recordings collected: 0)"
     return f"हालसम्म {total} रेकर्डिङहरू संकलन गरिएको छ। (Total recordings collected: {total})"
 def list_recordings(num_items=10):
     """Return the most recent recordings as a display-ready DataFrame.

     Args:
         num_items: Maximum number of rows to return. ``None``, NaN or any
             other value that cannot be converted to an int (for example a
             cleared numeric input field) falls back to 10; negative values
             are treated as 0 instead of being passed to ``head()``.

     Returns:
         A DataFrame with the columns id, text, ethnicity, region and
         timestamp, newest first, with a fresh 0..n-1 index. Timestamps are
         formatted as ``%Y-%m-%d %H:%M``; unparseable ones become ``'N/A'``.
         An empty frame with the same columns is returned when the metadata
         file is missing or has no rows.
     """
     columns = ['id', 'text', 'ethnicity', 'region', 'timestamp']
     try:
         limit = max(int(num_items), 0)
     except (TypeError, ValueError, OverflowError):
         # None -> TypeError, NaN / bad string -> ValueError, inf -> OverflowError.
         limit = 10
     if not os.path.exists(METADATA_FILE):
         return pd.DataFrame(columns=columns)
     try:
         metadata = pd.read_csv(METADATA_FILE)
     except pd.errors.EmptyDataError:
         return pd.DataFrame(columns=columns)
     if len(metadata) == 0:
         return pd.DataFrame(columns=columns)
     # Coerce so that malformed timestamps become NaT instead of raising.
     metadata['timestamp'] = pd.to_datetime(metadata['timestamp'], errors='coerce')
     newest_first = metadata.sort_values('timestamp', ascending=False).head(limit)
     display_df = newest_first[columns].copy()
     # strftime yields NaN for NaT rows; show those as 'N/A'.
     display_df['timestamp'] = display_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M').fillna('N/A')
     return display_df.reset_index(drop=True)
 def get_recording_audio(recording_id):
     """Look up the audio file path and transcript text for one recording.

     Args:
         recording_id: Value to match against the ``id`` column of the
             metadata CSV.

     Returns:
         ``(audio_path, text)`` taken from the first matching row when the
         audio file exists on disk; otherwise ``(None, message)`` where
         ``message`` is a bilingual explanation of what was missing.
     """
     if not recording_id:
         return None, "कुनै रेकर्डिङ आईडी प्रदान गरिएको छैन। (No recording ID provided.)"
     if not os.path.exists(METADATA_FILE):
         return None, "मेटाडाटा फाइल भेटिएन। (Metadata file not found.)"
     try:
         all_rows = pd.read_csv(METADATA_FILE)
     except pd.errors.EmptyDataError:
         return None, "मेटाडाटा खाली छ। (Metadata is empty.)"
     id_matches = all_rows['id'] == recording_id
     hits = all_rows[id_matches]
     if hits.shape[0] == 0:
         return None, "रेकर्डिङ भेटिएन। (Recording not found.)"
     # Only the first matching row is used.
     wav_path, transcript = (hits[column].iloc[0] for column in ('audio_path', 'text'))
     if os.path.exists(wav_path):
         return wav_path, transcript
     return None, f"अडियो फाइल भेटिएन: {wav_path} (Audio file not found: {wav_path})"
 def get_recording_ratings(recording_id):
     """Build a short, human-readable ratings summary for one recording.

     Args:
         recording_id: Key of the recording in the ratings JSON file.

     Returns:
         A three-line summary (up/down votes, then quality and correctness
         scores with their vote counts), or a bilingual message when no ID
         is given, the ratings file is missing or unreadable, or the
         recording has no ratings entry.
     """
     if not recording_id:
         return "रेकर्डिङ आईडी चयन गर्नुहोस्। (Select a Recording ID.)"
     no_data_message = "डाटा भेटिएन। (No ratings data found.)"
     if not os.path.exists(RATINGS_FILE):
         return no_data_message
     try:
         with open(RATINGS_FILE, 'r', encoding='utf-8') as f:
             ratings = json.load(f)
     except ValueError:
         # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
         # an empty or corrupt file is reported like a missing one instead
         # of crashing the UI callback.
         return no_data_message
     if not isinstance(ratings, dict) or recording_id not in ratings:
         return "यस रेकर्डिङको लागि कुनै मूल्याङ्कन भेटिएन। (No ratings found for this recording.)"
     r = ratings[recording_id]
     upvotes = r.get("upvotes", 0)
     downvotes = r.get("downvotes", 0)
     quality_votes = r.get("quality_votes", 0)
     correctness_votes = r.get("correctness_votes", 0)
     # A score is only displayed once at least one vote exists; otherwise 0.
     quality = round(r.get("quality_score", 0), 1) if quality_votes > 0 else 0
     correctness = round(r.get("correctness_score", 0), 1) if correctness_votes > 0 else 0
     return (
         f"👍 Upvotes: {upvotes} | 👎 Downvotes: {downvotes}\n"
         f"गुणस्तर (Quality): {quality}/5 ({quality_votes} मत/votes)\n"
         f"शुद्धता (Correctness): {correctness}/5 ({correctness_votes} मत/votes)"
     )
+# --- Gradio UI Build ---
 def build_ui():
     # Assemble the whole Gradio interface and return the Blocks object (not yet launched).
     # Tabs: 1) record voice (free text / prompted text), 2) review and vote on
     # recordings, 3) progress count and Hugging Face upload, 4) static information page.
     # Components are created inside nested context managers, so statement order defines layout.
     with gr.Blocks(title="नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)") as app:
+        gr.Markdown("# नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)")
+        gr.Markdown(
+            "यस प्लेटफर्मले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवाज डाटा संकलन गर्दछ। "
+            "कृपया आफ्नो आवाज रेकर्ड गरेर योगदान दिनुहोस्।\n"
+            "*This platform collects voice data for the development of Nepali Automatic Speech Recognition (ASR) technology. "
+            "Please contribute by recording your voice.*"
+        )
+        # --- Data Collection Tabs ---
+        with gr.Tabs():
+            with gr.TabItem("१. आवाज रेकर्ड गर्नुहोस् (Record Voice)"):
+                with gr.Tabs():
+                    with gr.TabItem("स्वतन्त्र पाठ (Free Text)"):
+                        with gr.Row():
+                            with gr.Column(scale=2):
+                                free_text_input = gr.Textbox(label="तपाईंले बोल्न चाहनुभएको पाठ (Text you want to speak)", placeholder="यहाँ लेख्नुहोस्...", lines=3)
+                                free_audio_input = gr.Audio(label="आवाज रेकर्ड गर्नुहोस् (Record your voice)", type="filepath", source="microphone")
+                            with gr.Column(scale=3):
+                                with gr.Row():
+                                    free_gender_dd = gr.Dropdown(label="लिङ्ग (Gender)", choices=GENDERS, value=GENDERS[0])
+                                    free_age_dd = gr.Dropdown(label="उमेर समूह (Age Group)", choices=AGE_GROUPS, value=AGE_GROUPS[1])
+                                with gr.Row():
+                                    free_ethnicity_dd = gr.Dropdown(label="जातीयता (Ethnicity)", choices=list(COMMON_LAST_NAMES.keys()), value=list(COMMON_LAST_NAMES.keys())[0])
+                                    free_lastname_dd = gr.Dropdown(label="थर (Last Name)", choices=COMMON_LAST_NAMES[list(COMMON_LAST_NAMES.keys())[0]])
                                 # Every ethnicity change is passed through get_ethnicity_based_last_names
                                 # and the result is sent to the last-name dropdown.
+                                free_ethnicity_dd.change(fn=get_ethnicity_based_last_names, inputs=free_ethnicity_dd, outputs=free_lastname_dd)
+                                with gr.Row():
+                                    free_region_dd = gr.Dropdown(label="क्षेत्र (Region)", choices=REGIONS, value=REGIONS[2])
+                                    free_emotion_dd = gr.Dropdown(label="भावना (Emotion)", choices=EMOTIONS, value=EMOTIONS[0])
+                        free_submit_btn = gr.Button("सुरक्षित गर्नुहोस् (Save Free Text Recording)")
+                        free_status_output = gr.Textbox(label="स्थिति (Status)", interactive=False)
                         # The hidden Textbox at the end of `inputs` supplies the constant "free_text"
                         # as the last positional argument of save_recording.
+                        free_submit_btn.click(
+                            save_recording,
+                            inputs=[free_audio_input, free_text_input, free_gender_dd, free_age_dd, free_ethnicity_dd, free_lastname_dd, free_region_dd, free_emotion_dd, gr.Textbox(value="free_text", visible=False)],
+                            outputs=[free_status_output, free_audio_input] # Clear audio on success
                          )
+                    with gr.TabItem("निर्देशित पाठ (Prompted Text)"):
+                        with gr.Row():
+                            with gr.Column(scale=2):
+                                prompt_text_display = gr.Textbox(label="कृपया यो पाठ पढ्नुहोस् (Please read this text)", value=get_random_prompt(), lines=3, interactive=False)
+                                new_prompt_btn = gr.Button("नयाँ पाठ (New Prompt)")
+                                prompt_audio_input = gr.Audio(label="आवाज रेकर्ड गर्नुहोस् (Record your voice)", type="filepath", source="microphone")
+                            with gr.Column(scale=3):
+                                with gr.Row():
+                                    prompt_gender_dd = gr.Dropdown(label="लिङ्ग (Gender)", choices=GENDERS, value=GENDERS[0])
+                                    prompt_age_dd = gr.Dropdown(label="उमेर समूह (Age Group)", choices=AGE_GROUPS, value=AGE_GROUPS[1])
+                                with gr.Row():
+                                    prompt_ethnicity_dd = gr.Dropdown(label="जातीयता (Ethnicity)", choices=list(COMMON_LAST_NAMES.keys()), value=list(COMMON_LAST_NAMES.keys())[0])
+                                    prompt_lastname_dd = gr.Dropdown(label="थर (Last Name)", choices=COMMON_LAST_NAMES[list(COMMON_LAST_NAMES.keys())[0]])
+                                prompt_ethnicity_dd.change(fn=get_ethnicity_based_last_names, inputs=prompt_ethnicity_dd, outputs=prompt_lastname_dd)
+                                with gr.Row():
+                                    prompt_region_dd = gr.Dropdown(label="क्षेत्र (Region)", choices=REGIONS, value=REGIONS[2])
+                                    prompt_emotion_dd = gr.Dropdown(label="भावना (Emotion)", choices=EMOTIONS, value=EMOTIONS[0])
                         # Replace the displayed prompt with a new get_random_prompt() result.
+                        new_prompt_btn.click(get_random_prompt, outputs=prompt_text_display)
+                        prompt_submit_btn = gr.Button("सुरक्षित गर्नुहोस् (Save Prompted Recording)")
+                        prompt_status_output = gr.Textbox(label="स्थिति (Status)", interactive=False)
                         # Same wiring as the free-text tab, with the hidden constant "prompted_text".
+                        prompt_submit_btn.click(
+                            save_recording,
+                            inputs=[prompt_audio_input, prompt_text_display, prompt_gender_dd, prompt_age_dd, prompt_ethnicity_dd, prompt_lastname_dd, prompt_region_dd, prompt_emotion_dd, gr.Textbox(value="prompted_text", visible=False)],
+                            outputs=[prompt_status_output, prompt_audio_input]
                          )
+            with gr.TabItem("२. रेकर्डिङ समीक्षा गर्नुहोस् (Review Recordings)"):
+                gr.Markdown("हालसालैका रेकर्डिङहरू हेर्नुहोस् र मत दिनुहोस्। (View and vote on recent recordings.)")
+                num_review_items = gr.Number(value=10, label="देखाउने वस्तुहरूको संख्या (Number of items to show)", minimum=1, maximum=50, step=1)
+                refresh_review_list_btn = gr.Button("सूची ताजा गर्नुहोस् (Refresh List)")
+                review_list_df = gr.DataFrame(headers=['id', 'text', 'ethnicity', 'region', 'timestamp'], label="हालका रेकर्डिङहरू (Recent Recordings)", interactive=False, datatype=['str', 'str', 'str', 'str', 'str'])
+                with gr.Row():
+                    selected_review_id = gr.Textbox(label="चयन गरिएको आईडी (Selected ID)", interactive=False)
+                    selected_review_text = gr.Textbox(label="रेकर्डिङ पाठ (Recording Text)", interactive=False, lines=2)
+                review_audio_player = gr.Audio(label="रेकर्डिङ सुन्नुहोस् (Listen to Recording)", type="filepath")
+                current_ratings_display = gr.Textbox(label="वर्तमान मूल्याङ्कन (Current Ratings)", interactive=False, lines=3)
                 # Table-selection handler: evt.index[0] is used as the row position in the
                 # currently displayed frame. Returns, in order, the values for the ID box,
                 # the text box, the audio player and the ratings box.
+                def select_for_review(evt: gr.SelectData, df_data: pd.DataFrame):
+                    if evt.index is None or df_data is None or len(df_data) == 0 or evt.index[0] >= len(df_data):
+                        return "", "", None, "कुनै रेकर्डिङ चयन गरिएको छैन (No recording selected)"
+                    selected_id_val = df_data.iloc[evt.index[0]]['id']
+                    audio_p, text_val = get_recording_audio(selected_id_val)
+                    ratings_text_val = get_recording_ratings(selected_id_val)
+                    return selected_id_val, text_val, audio_p, ratings_text_val
+                review_list_df.select(select_for_review, inputs=[review_list_df], outputs=[selected_review_id, selected_review_text, review_audio_player, current_ratings_display])
+                refresh_review_list_btn.click(list_recordings, inputs=[num_review_items], outputs=review_list_df)
+                gr.Markdown("### मतदान गर्नुहोस् (Cast Your Vote)")
+                with gr.Row():
+                    upvote_btn = gr.Button("👍 मन पर्यो (Upvote)")
+                    downvote_btn = gr.Button("👎 मन परेन (Downvote)")
+                with gr.Row():
+                    quality_rating_slider = gr.Slider(minimum=1, maximum=5, step=1, label="गुणस्तर मूल्याङ्कन (Quality Rating 1-5)", value=3)
+                    submit_quality_btn = gr.Button("गुणस्तर मत दिनुहोस् (Submit Quality)")
+                with gr.Row():
+                    correctness_rating_slider = gr.Slider(minimum=1, maximum=5, step=1, label="शुद्धता मूल्याङ्कन (Correctness Rating 1-5)", value=3)
+                    submit_correctness_btn = gr.Button("शुद्धता मत दिनुहोस् (Submit Correctness)")
+                vote_status_output = gr.Textbox(label="मतदान स्थिति (Voting Status)", interactive=False)
                 # Shared click handler for all four vote buttons: passes the vote to
                 # vote_recording, then re-reads the ratings summary for the same recording.
+                def vote_and_refresh(rec_id, vote_t, vote_val_str):
+                    status = vote_recording(rec_id, vote_t, str(vote_val_str)) # Ensure vote_val is str
+                    new_ratings = get_recording_ratings(rec_id) if rec_id else "रेकर्डिङ चयन गर्नुहोस् (Select a recording)"
+                    # Also refresh the main list to reflect potential score changes indirectly
+                    # latest_list = list_recordings(num_review_items.value) # This needs to be handled carefully to avoid component errors
+                    return status, new_ratings
                 # Up/down votes send a hidden constant vote type plus a hidden Number fixed at 0;
                 # the quality and correctness buttons send their slider value instead.
+                upvote_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="upvote", visible=False), gr.Number(value=0, visible=False)], outputs=[vote_status_output, current_ratings_display])
+                downvote_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="downvote", visible=False), gr.Number(value=0, visible=False)], outputs=[vote_status_output, current_ratings_display])
+                submit_quality_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="quality", visible=False), quality_rating_slider], outputs=[vote_status_output, current_ratings_display])
+                submit_correctness_btn.click(vote_and_refresh, inputs=[selected_review_id, gr.Textbox(value="correctness", visible=False), correctness_rating_slider], outputs=[vote_status_output, current_ratings_display])
+            with gr.TabItem("३. प्रगति र अपलोड (Progress & Upload)"):
+                gr.Markdown("## संकलन प्रगति (Collection Progress)")
+                total_count_display = gr.Textbox(label="कुल संकलित रेकर्डिङ (Total Recordings Collected)", interactive=False)
+                refresh_total_count_btn = gr.Button("गणना ताजा गर्नुहोस् (Refresh Count)")
+                refresh_total_count_btn.click(update_count, outputs=total_count_display)
+                gr.Markdown("---")
+                gr.Markdown("## हगिङफेसमा अपलोड गर्नुहोस् (Upload to Hugging Face)")
+                gr.Markdown(
+                    "**महत्वपूर्ण:** यो कार्यले स्पेसमा संकलित सबै डाटालाई हगिङ फेस डेटासेटमा पुश गर्नेछ। "
+                    "स्पेसको स्टोरेज अस्थायी हुन सक्छ, त्यसैले नियमित रूपमा अपलोड गर्न सिफारिस गरिन्छ।\n"
+                    "यो कार्य गर्नको लागि, तपाईंले स्पेस सेटिङहरूमा `HF_TOKEN` (लेख्ने पहुँच सहितको हगिङ फेस टोकन) "
+                    "र `ADMIN_UPLOAD_PASSWORD` गोप्य रूपमा थप्नुपर्छ।\n\n"
+                    "**IMPORTANT:** This action will push all data collected in this Space to the Hugging Face Dataset. "
+                    "Space storage can be ephemeral, so regular uploads are recommended. "
+                    "To perform this action, you must have added `HF_TOKEN` (a Hugging Face token with write access) "
+                    "and `ADMIN_UPLOAD_PASSWORD` as secrets in the Space settings."
                  )
+                hf_dataset_name_input = gr.Textbox(label="Dataset Name (e.g., your_username/nepali-asr-data)", placeholder="your_hf_username/dataset_repo_name")
+                admin_password_input = gr.Textbox(label="Admin Upload Password", type="password", placeholder="Enter admin password")
+                upload_to_hf_btn = gr.Button("हगिङफेसमा अपलोड गर्नुहोस् (Upload to Hugging Face)")
+                upload_status_output = gr.Textbox(label="अपलोड स्थिति (Upload Status)", interactive=False, lines=5)
                 # upload_to_huggingface receives the dataset name and the admin password, in that order.
+                upload_to_hf_btn.click(upload_to_huggingface, inputs=[hf_dataset_name_input, admin_password_input], outputs=upload_status_output)
+            with gr.TabItem("४. जानकारी (Information)"):
+                gr.Markdown(render_info_page()) # Using a helper for cleaner code
+        # Initial loads
         # Registered on the Blocks load event: fill in the total count and the review table
         # (using the current value of num_review_items) without requiring a button click.
+        app.load(fn=update_count, inputs=None, outputs=total_count_display)
+        app.load(fn=lambda n: list_recordings(n), inputs=[num_review_items], outputs=review_list_df) # Load initial review list
     return app
+def render_info_page():
     # Return the static Markdown text of the information page: project description,
     # contribution steps and privacy policy, first in Nepali and then in English.
     # The text is a fixed literal; nothing is computed or read at call time.
     # NOTE(review): every line of the literal is indented by four spaces — confirm the
     # consumer strips that indentation, otherwise Markdown may treat it as a code block.
+    return """
+    ## नेपाली ASR डाटा संकलन प्रोजेक्टको बारेमा (About the Nepali ASR Data Collection Project)
+    यो प्रोजेक्टले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवश्यक डाटा संकलन गर्दछ।
+    तपाईंको योगदानले नेपाली भाषा प्रविधिको विकासमा ठूलो मद्दत पुर्‍याउनेछ।
+    ### कसरी योगदान दिने (How to Contribute):
+    1.  **आवाज रेकर्ड गर्नुहोस् (Record Voice)** ट्याबमा जानुहोस्।
+        *   **स्वतन्त्र पाठ (Free Text)** अन्तर्गत, तपाईं आफ्नो इच्छा अनुसारको पाठ लेख्नुहोस्, आवश्यक विवरणहरू (लिङ्ग, उमेर, आदि) छान्नुहोस्, र आफ्नो आवाज रेकर्ड गर्नुहोस्।
+        *   **निर्देशित पाठ (Prompted Text)** अन्तर्गत, दिइएको नेपाली वाक्य पढ्नुहोस्, विवरणहरू छान्नुहोस्, र आफ्नो आवाज रेकर्ड गर्नुहोस्। "नयाँ पाठ" बटनले तपाईंलाई फरक वाक्य दिनेछ।
+    2.  **रेकर्डिङ समीक्षा गर्नुहोस् (Review Recordings)** ट्याबमा गएर अरूले गरेका रेकर्डिङहरू सुन्नुहोस् र तिनीहरूको गुणस्तर र शुद्धताको लागि मतदान गर्नुहोस्। यसले डाटाको गुणस्तर सुधार गर्न मद्दत गर्दछ।
+    3.  रेकर्डिङ पछि, "सुरक्षित गर्नुहोस् (Save)" बटनमा क्लिक गर्नुहोस्।
+    ### गोपनीयता नीति (Privacy Policy):
+    -   तपाईंको आवाज रेकर्डिङ र सम्बन्धित मेटाडाटा (जस्तै उमेर समूह, लिङ्ग, क्षेत्र) सार्वजनिक अनुसन्धान उद्देश्यका लागि प्रयोग गरिनेछ।
+    -   हामी तपाईंको नाम वा सम्पर्क जानकारी जस्ता प्रत्यक्ष व्यक्तिगत पहिचान योग्य जानकारी सङ्कलन गर्दैनौं। तपाईंले प्रदान गर्नुभएको जातीयता/थरको जानकारी उच्चारण र विविधता अध्ययनको लागि हो।
+    -   यो डाटासेट खुला स्रोत हुनेछ र हगिङ फेस जस्ता प्लेटफर्महरूमा अनुसन्धान समुदायको लागि उपलब्ध गराइनेछ।
+    -   कृपया रेकर्डिङको क्रममा कुनै पनि संवेदनशील व्यक्तिगत जानकारी नबोल्नुहोस्।
+    ---
+    ## About Nepali ASR Data Collection Project
+    This project collects voice data essential for developing Automatic Speech Recognition (ASR) technology for the Nepali language.
+    Your contribution will significantly aid in the advancement of Nepali language technology.
+    ### How to Contribute:
+    1.  Go to the **Record Voice (आवाज रेकर्ड गर्नुहोस्)** tab.
+        *   Under **Free Text (स्वतन्त्र पाठ)**, type any Nepali text you wish, select the required demographic details (gender, age, etc.), and record your voice.
+        *   Under **Prompted Text (निर्देशित पाठ)**, read the provided Nepali sentence, select demographic details, and record your voice. The "New Text (नयाँ पाठ)" button will give you a different sentence.
+    2.  Go to the **Review Recordings (रेकर्डिङ समीक्षा गर्नुहोस्)** tab to listen to recordings made by others and vote on their quality and correctness. This helps improve the overall quality of the dataset.
+    3.  After recording, click the "Save (सुरक्षित गर्नुहोस्)" button.
+    ### Privacy Policy:
+    -   Your voice recordings and associated metadata (like age group, gender, region) will be used for public research purposes.
+    -   We do not collect directly personally identifiable information such as your name or contact details. The ethnicity/last name information you provide is for studying accent and diversity.
+    -   This dataset will be open-source and made available to the research community on platforms like Hugging Face.
+    -   Please do not speak any sensitive personal information during your recordings.
+    """
+# --- Main Execution ---
 if __name__ == "__main__":
     # Script entry point: runs only when this file is executed directly, not on import.
+    # Ensure storage is initialized when running locally too
+    initialize_data_storage()
     # Build the Blocks app and start it with Gradio's default launch settings.
+    app_ui = build_ui()
+    app_ui.launch()