Spaces:

darvilab
/

Nepali-ASR-Open-Data-Collection

Sleeping

App Files Files Community

ashokpoudel committed on May 16, 2025

Commit

e7f7004

verified ·

1 Parent(s): 8ea7f98

Create app.py

Browse files

Files changed (1) hide show

app.py +545 -0

app.py ADDED Viewed

	@@ -0,0 +1,545 @@

+import os
+import gradio as gr
+import pandas as pd
+import numpy as np
+import random
+import datetime
+import uuid
+import json
+from huggingface_hub import HfApi
+from datasets import Dataset
+# Configuration: prompt sentences served at random by get_random_prompt(), plus the dropdown choice lists used by the UI
+SAMPLE_PROMPTS = [
+    "नमस्ते, मेरो नाम __ हो। म नेपाली बोल्छु।",
+    "आज मौसम धेरै राम्रो छ।",
+    "नेपाल एक सुन्दर देश हो जहाँ हिमाल, पहाड र तराई छन्।",
+    "काठमाडौं नेपालको राजधानी हो।",
+    "म आज बिहान स्कूल जाँदैछु।",
+    "नेपाली भाषा बोल्ने मानिसहरू विश्वभर छन्।",
+    "हिमालमा हिउँ परिरहेको छ।",
+    "मलाई नेपाली खाना धेरै मन पर्छ।",
+    "बुद्ध नेपालमा जन्मिएका थिए।",
+    "सगरमाथा विश्वको सबैभन्दा अग्लो हिमाल हो।"
+]
+EMOTIONS = ["सामान्य (Neutral)", "खुसी (Happy)", "दुःखी (Sad)", "रिसाएको (Angry)", "अचम्मित (Surprised)"]  # emotion dropdown choices
+GENDERS = ["पुरुष (Male)", "महिला (Female)", "अन्य (Other)", "भन्न चाहन्न (Prefer not to say)"]  # gender dropdown choices
+AGE_GROUPS = ["18 भन्दा कम", "18-24", "25-34", "35-44", "45-54", "55-64", "65+"]  # age dropdown choices; the first entry reads "under 18"
+# Nepal-specific data: provinces offered in the region dropdown. NOTE(review): Province 1 was reportedly renamed Koshi Province in 2023 -- confirm whether this label should be updated
+REGIONS = [
+    "प्रदेश १ (Province 1)",
+    "मधेश प्रदेश (Madhesh Province)",
+    "बागमती प्रदेश (Bagmati Province)",
+    "गण्डकी प्रदेश (Gandaki Province)",
+    "लुम्बिनी प्रदेश (Lumbini Province)",
+    "कर्णाली प्रदेश (Karnali Province)",
+    "सुदूरपश्चिम प्रदेश (Sudurpashchim Province)"
+]
+# Common last names by ethnicity/region for better accent tracking; the keys feed the ethnicity dropdown and each value is the last-name choice list for that ethnicity
+COMMON_LAST_NAMES = {
+    "पहाडी (Pahadi)": ["शर्मा (Sharma)", "पौडेल (Poudel)", "खनाल (Khanal)", "अधिकारी (Adhikari)", "भट्टराई (Bhattarai)", "अन्य पहाडी (Other Pahadi)"],
+    "नेवार (Newar)": ["श्रेष्ठ (Shrestha)", "प्रधान (Pradhan)", "महर्जन (Maharjan)", "बज्राचार्य (Bajracharya)", "अन्य नेवार (Other Newar)"],
+    "मधेसी (Madhesi)": ["यादव (Yadav)", "साह (Shah)", "सिंह (Singh)", "गुप्ता (Gupta)", "अन्य मधेसी (Other Madhesi)"],
+    "थारु (Tharu)": ["चौधरी (Chaudhary)", "थारु (Tharu)", "अन्य थारु (Other Tharu)"],
+    "मगर (Magar)": ["मगर (Magar)", "थापा (Thapa)", "राना (Rana)", "अन्य मगर (Other Magar)"],
+    "तामाङ (Tamang)": ["तामाङ (Tamang)", "लामा (Lama)", "अन्य तामाङ (Other Tamang)"],
+    "राई (Rai)": ["राई (Rai)", "अन्य राई (Other Rai)"],
+    "गुरुङ (Gurung)": ["गुरुङ (Gurung)", "अन्य गुरुङ (Other Gurung)"],
+    "लिम्बु (Limbu)": ["लिम्बु (Limbu)", "अन्य लिम्बु (Other Limbu)"],
+    "शेर्पा (Sherpa)": ["शेर्पा (Sherpa)", "अन्य शेर्पा (Other Sherpa)"],
+    "अन्य (Other)": ["अन्य (Other)"]  # fallback entry used by get_ethnicity_based_last_names for unknown ethnicities
+}
+# Create directory for saved recordings
+os.makedirs("recordings", exist_ok=True)
+os.makedirs("metadata", exist_ok=True)
+os.makedirs("ratings", exist_ok=True)
+# Initialize metadata file if it doesn't exist
+metadata_file = "metadata/metadata.csv"
+ratings_file = "ratings/ratings.json"
+if not os.path.exists(metadata_file):
+    pd.DataFrame(columns=[
+        "id", "text", "audio_path", "gender", "age_group", "ethnicity",
+        "last_name", "region", "emotion", "timestamp", "recording_type"
+    ]).to_csv(metadata_file, index=False)
+if not os.path.exists(ratings_file):
+    with open(ratings_file, 'w') as f:
+        json.dump({}, f)
+def save_recording(audio, text, gender, age_group, ethnicity, last_name, region, emotion, recording_type):
+    """Save the recording and metadata"""
+    # Generate unique ID for this recording
+    recording_id = str(uuid.uuid4())
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    # Check if audio was recorded
+    if audio is None:
+        return "कृपया पहिले रेकर्डिङ गर्नुहोस्। (Please record audio first)", None
+    # Save audio file
+    audio_filename = f"recordings/{recording_id}.wav"
+    if isinstance(audio, tuple):  # If it's a tuple (sr, data)
+        sr, data = audio
+        import soundfile as sf
+        sf.write(audio_filename, data, sr)
+    else:  # If it's a path
+        # Copy the file
+        import shutil
+        shutil.copy(audio, audio_filename)
+    # Update metadata
+    metadata = pd.read_csv(metadata_file)
+    new_row = pd.DataFrame([{
+        "id": recording_id,
+        "text": text,
+        "audio_path": audio_filename,
+        "gender": gender,
+        "age_group": age_group,
+        "ethnicity": ethnicity,
+        "last_name": last_name,
+        "region": region,
+        "emotion": emotion,
+        "timestamp": timestamp,
+        "recording_type": recording_type
+    }])
+    updated_metadata = pd.concat([metadata, new_row], ignore_index=True)
+    updated_metadata.to_csv(metadata_file, index=False)
+    # Initialize rating for this recording in the ratings file
+    with open(ratings_file, 'r') as f:
+        ratings = json.load(f)
+    ratings[recording_id] = {
+        "upvotes": 0,
+        "downvotes": 0,
+        "quality_score": 0,  # Average quality rating (1-5)
+        "quality_votes": 0,  # Number of quality ratings
+        "correctness_score": 0,  # Average correctness rating (1-5)
+        "correctness_votes": 0  # Number of correctness ratings
+    }
+    with open(ratings_file, 'w') as f:
+        json.dump(ratings, f, indent=2)
+    return f"रेकर्डिङ सफलतापूर्वक सुरक्षित गरियो! (Recording saved successfully!)", audio_filename
+def get_random_prompt():
+    """Return a random prompt from the list"""
+    return random.choice(SAMPLE_PROMPTS)
+def vote_recording(recording_id, vote_type, vote_value):
+    """Add a vote for a recording"""
+    if not os.path.exists(ratings_file):
+        return "रेटिङ फाइल भेटिएन। (Rating file not found.)"
+    try:
+        with open(ratings_file, 'r') as f:
+            ratings = json.load(f)
+        if recording_id not in ratings:
+            return "रेकर्डिङ आईडी भेटिएन। (Recording ID not found.)"
+        if vote_type == "upvote":
+            ratings[recording_id]["upvotes"] += 1
+        elif vote_type == "downvote":
+            ratings[recording_id]["downvotes"] += 1
+        elif vote_type == "quality":
+            # Update quality score (running average)
+            current_score = ratings[recording_id]["quality_score"]
+            current_votes = ratings[recording_id]["quality_votes"]
+            new_votes = current_votes + 1
+            new_score = ((current_score * current_votes) + vote_value) / new_votes
+            ratings[recording_id]["quality_score"] = new_score
+            ratings[recording_id]["quality_votes"] = new_votes
+        elif vote_type == "correctness":
+            # Update correctness score (running average)
+            current_score = ratings[recording_id]["correctness_score"]
+            current_votes = ratings[recording_id]["correctness_votes"]
+            new_votes = current_votes + 1
+            new_score = ((current_score * current_votes) + vote_value) / new_votes
+            ratings[recording_id]["correctness_score"] = new_score
+            ratings[recording_id]["correctness_votes"] = new_votes
+        with open(ratings_file, 'w') as f:
+            json.dump(ratings, f, indent=2)
+        return f"मतदान सफलतापूर्वक दर्ता गरियो! (Vote registered successfully!)"
+    except Exception as e:
+        return f"त्रुटि: {str(e)}"
+def get_ethnicity_based_last_names(ethnicity):
+    """Return last name options based on selected ethnicity"""
+    if ethnicity in COMMON_LAST_NAMES:
+        return COMMON_LAST_NAMES[ethnicity]
+    return COMMON_LAST_NAMES["अन्य (Other)"]
+def upload_to_huggingface(hf_token, dataset_name):
+    """Upload the collected data to Hugging Face"""
+    if not os.path.exists(metadata_file):
+        return "कुनै डाटा भेटिएन। (No data found.)"
+    try:
+        # Read metadata
+        metadata = pd.read_csv(metadata_file)
+        if len(metadata) == 0:
+            return "कुनै डाटा भेटिएन। (No data found.)"
+        # Read ratings
+        with open(ratings_file, 'r') as f:
+            ratings = json.load(f)
+        # Add ratings to metadata
+        metadata["upvotes"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("upvotes", 0))
+        metadata["downvotes"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("downvotes", 0))
+        metadata["quality_score"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("quality_score", 0))
+        metadata["correctness_score"] = metadata["id"].apply(lambda x: ratings.get(x, {}).get("correctness_score", 0))
+        # Create a dataset dict
+        dataset_dict = {
+            "id": metadata["id"].tolist(),
+            "text": metadata["text"].tolist(),
+            "gender": metadata["gender"].tolist(),
+            "age_group": metadata["age_group"].tolist(),
+            "ethnicity": metadata["ethnicity"].tolist(),
+            "last_name": metadata["last_name"].tolist(),
+            "region": metadata["region"].tolist(),
+            "emotion": metadata["emotion"].tolist(),
+            "recording_type": metadata["recording_type"].tolist(),
+            "timestamp": metadata["timestamp"].tolist(),
+            "upvotes": metadata["upvotes"].tolist(),
+            "downvotes": metadata["downvotes"].tolist(),
+            "quality_score": metadata["quality_score"].tolist(),
+            "correctness_score": metadata["correctness_score"].tolist(),
+        }
+        # Create a Dataset object
+        dataset = Dataset.from_dict(dataset_dict)
+        # Push to hub
+        api = HfApi(token=hf_token)
+        dataset.push_to_hub(dataset_name)
+        # Upload audio files
+        for _, row in metadata.iterrows():
+            audio_path = row["audio_path"]
+            if os.path.exists(audio_path):
+                api.upload_file(
+                    path_or_fileobj=audio_path,
+                    path_in_repo=f"audio/{os.path.basename(audio_path)}",
+                    repo_id=dataset_name,
+                    repo_type="dataset"
+                )
+        return f"डाटा हगिङफेसमा सफलतापूर्वक अपलोड गरियो! (Data successfully uploaded to Hugging Face at {dataset_name})"
+    except Exception as e:
+        return f"त्रुटि: {str(e)}"
+def update_count():
+    """Update the count of recordings"""
+    if os.path.exists(metadata_file):
+        metadata = pd.read_csv(metadata_file)
+        return f"हालसम्म {len(metadata)} रेकर्डिङहरू संकलन गरिएको छ। (Total recordings collected: {len(metadata)})"
+    return "कुनै रेकर्डिङ भेटिएन। (No recordings found.)"
+def list_recordings(num_items=10):
+    """List recent recordings for review"""
+    if not os.path.exists(metadata_file):
+        return pd.DataFrame()
+    metadata = pd.read_csv(metadata_file)
+    if len(metadata) == 0:
+        return pd.DataFrame()
+    # Sort by timestamp (newest first) and get the most recent entries
+    metadata['timestamp'] = pd.to_datetime(metadata['timestamp'])
+    sorted_metadata = metadata.sort_values('timestamp', ascending=False).head(num_items)
+    # Reset the index for display purposes
+    display_df = sorted_metadata[['id', 'text', 'ethnicity', 'region', 'timestamp']].copy()
+    display_df['timestamp'] = display_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M')
+    display_df = display_df.reset_index(drop=True)
+    return display_df
+def get_recording_audio(recording_id):
+    """Get audio file path for a specific recording"""
+    if not os.path.exists(metadata_file):
+        return None, "रेकर्डिङ भेटिएन। (Recording not found.)"
+    metadata = pd.read_csv(metadata_file)
+    recording = metadata[metadata['id'] == recording_id]
+    if len(recording) == 0:
+        return None, "रेकर्डिङ भेटिएन। (Recording not found.)"
+    audio_path = recording['audio_path'].iloc[0]
+    text = recording['text'].iloc[0]
+    if not os.path.exists(audio_path):
+        return None, "अडियो फाइल भेटिएन। (Audio file not found.)"
+    return audio_path, text
+def get_recording_ratings(recording_id):
+    """Get current ratings for a recording"""
+    if not os.path.exists(ratings_file):
+        return "डाटा भेटिएन। (No data found.)"
+    with open(ratings_file, 'r') as f:
+        ratings = json.load(f)
+    if recording_id not in ratings:
+        return "रेकर्डिङ आईडी भेटिएन। (Recording ID not found.)"
+    r = ratings[recording_id]
+    # Format the ratings for display
+    upvotes = r["upvotes"]
+    downvotes = r["downvotes"]
+    quality = round(r["quality_score"], 1) if r["quality_votes"] > 0 else 0
+    quality_votes = r["quality_votes"]
+    correctness = round(r["correctness_score"], 1) if r["correctness_votes"] > 0 else 0
+    correctness_votes = r["correctness_votes"]
+    return f"""👍 Upvotes: {upvotes} | 👎 Downvotes: {downvotes}
+गुणस्तर (Quality): {quality}/5 ({quality_votes} मत/votes)
+शुद्धता (Correctness): {correctness}/5 ({correctness_votes} मत/votes)"""
+def build_ui():
+    """Build the Gradio interface"""
+    with gr.Blocks(title="नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)") as app:
+        gr.Markdown("""
+        # नेपाली ASR डाटा संकलन (Nepali ASR Data Collection)
+        यस प्लेटफर्मले नेपाली भाषाको स्वचालित भाषण पह��चान (ASR) प्रविधिको विकासका लागि आवाज डाटा संकलन गर्दछ।
+        कृपया आफ्नो आवाज रेकर्ड गरेर योगदान दिनुहोस्।
+        *This platform collects voice data for the development of Nepali Automatic Speech Recognition (ASR) technology.
+        Please contribute by recording your voice.*
+        """)
+        with gr.Tab("स्वतन्त्र पाठ (Free Text)"):
+            with gr.Row():
+                with gr.Column():
+                    free_text = gr.Textbox(
+                        label="तपाईंले बोल्न चाहनुभएको पाठ यहाँ लेख्नुहोस् (Type the text you want to speak here)",
+                        placeholder="यहाँ लेख्नुहोस्...",
+                        lines=3
+                    )
+                    free_audio = gr.Audio(
+                        label="आफ्नो आवाज रेकर्ड गर्नुहोस् (Record your voice)",
+                        type="filepath",
+                        source="microphone"
+                    )
+                with gr.Column():
+                    # First row of metadata
+                    with gr.Row():
+                        free_gender = gr.Dropdown(
+                            label="लिङ्ग (Gender)",
+                            choices=GENDERS,
+                            value=GENDERS[0]
+                        )
+                        free_age = gr.Dropdown(
+                            label="उमेर समूह (Age Group)",
+                            choices=AGE_GROUPS,
+                            value=AGE_GROUPS[1]
+                        )
+                    # Second row of metadata
+                    with gr.Row():
+                        free_ethnicity = gr.Dropdown(
+                            label="जातीयता (Ethnicity)",
+                            choices=list(COMMON_LAST_NAMES.keys()),
+                            value=list(COMMON_LAST_NAMES.keys())[0]
+                        )
+                        free_last_name = gr.Dropdown(
+                            label="थर (Last Name)",
+                            choices=COMMON_LAST_NAMES[list(COMMON_LAST_NAMES.keys())[0]]
+                        )
+                    # Update last name options when ethnicity changes
+                    free_ethnicity.change(
+                        fn=get_ethnicity_based_last_names,
+                        inputs=free_ethnicity,
+                        outputs=free_last_name
+                    )
+                    # Third row of metadata
+                    with gr.Row():
+                        free_region = gr.Dropdown(
+                            label="क्षेत्र (Region)",
+                            choices=REGIONS,
+                            value=REGIONS[2]  # Default to Bagmati Province
+                        )
+                        free_emotion = gr.Dropdown(
+                            label="भावना (Emotion)",
+                            choices=EMOTIONS,
+                            value=EMOTIONS[0]
+                        )
+            free_submit = gr.Button("सुरक्षित गर्नुहोस् (Save)")
+            free_output = gr.Textbox(label="स्थिति (Status)")
+            free_submit.click(
+                fn=save_recording,
+                inputs=[
+                    free_audio, free_text, free_gender, free_age,
+                    free_ethnicity, free_last_name, free_region, free_emotion,
+                    gr.Textbox(value="free_text", visible=False)
+                ],
+                outputs=[free_output, free_audio]
+            )="filepath",
+                        source="microphone"
+                    )
+                with gr.Column():
+                    free_gender = gr.Dropdown(
+                        label="लिङ्ग (Gender)",
+                        choices=GENDERS,
+                        value=GENDERS[0]
+                    )
+                    free_age = gr.Dropdown(
+                        label="उमेर समूह (Age Group)",
+                        choices=AGE_GROUPS,
+                        value=AGE_GROUPS[1]
+                    )
+                    free_emotion = gr.Dropdown(
+                        label="भावना (Emotion)",
+                        choices=EMOTIONS,
+                        value=EMOTIONS[0]
+                    )
+            free_submit = gr.Button("सुरक्षित गर्नुहोस् (Save)")
+            free_output = gr.Textbox(label="स्थिति (Status)")
+            free_submit.click(
+                fn=save_recording,
+                inputs=[free_audio, free_text, free_gender, free_age, free_emotion, gr.Textbox(value="free_text", visible=False)],
+                outputs=[free_output, free_audio]
+            )
+        with gr.Tab("निर्देशित पाठ (Prompted Text)"):
+            with gr.Row():
+                with gr.Column():
+                    prompt_text = gr.Textbox(
+                        label="कृपया यो पाठ पढ्नुहोस् (Please read this text)",
+                        value=get_random_prompt(),
+                        lines=3
+                    )
+                    prompt_audio = gr.Audio(
+                        label="आफ्नो आवाज रेकर्ड गर्नुहोस् (Record your voice)",
+                        type="filepath",
+                        source="microphone"
+                    )
+                    new_prompt = gr.Button("नयाँ पाठ (New Text)")
+                with gr.Column():
+                    prompt_gender = gr.Dropdown(
+                        label="लिङ्ग (Gender)",
+                        choices=GENDERS,
+                        value=GENDERS[0]
+                    )
+                    prompt_age = gr.Dropdown(
+                        label="उमेर समूह (Age Group)",
+                        choices=AGE_GROUPS,
+                        value=AGE_GROUPS[1]
+                    )
+                    prompt_emotion = gr.Dropdown(
+                        label="भावना (Emotion)",
+                        choices=EMOTIONS,
+                        value=EMOTIONS[0]
+                    )
+            prompt_submit = gr.Button("सुरक्षित गर्नुहोस् (Save)")
+            prompt_output = gr.Textbox(label="स्थिति (Status)")
+            new_prompt.click(fn=get_random_prompt, inputs=None, outputs=prompt_text)
+            prompt_submit.click(
+                fn=save_recording,
+                inputs=[prompt_audio, prompt_text, prompt_gender, prompt_age, prompt_emotion, gr.Textbox(value="prompted_text", visible=False)],
+                outputs=[prompt_output, prompt_audio]
+            )
+        with gr.Tab("प्रगति (Progress)"):
+            count_display = gr.Textbox(label="संकलित रेकर्डिङ गणना (Recording Count)")
+            refresh_button = gr.Button("ताजा गर्नुहोस् (Refresh)")
+            refresh_button.click(fn=update_count, inputs=None, outputs=count_display)
+            # HuggingFace upload section (admin only)
+            gr.Markdown("## हगिङफेसमा अपलोड गर्नुहोस् (Upload to Hugging Face)")
+            with gr.Row():
+                hf_token = gr.Textbox(label="Hugging Face API Token", type="password")
+                dataset_name = gr.Textbox(
+                    label="Dataset Name",
+                    placeholder="username/nepali-asr-dataset"
+                )
+            upload_button = gr.Button("अपलोड गर्नुहोस् (Upload)")
+            upload_status = gr.Textbox(label="अपलोड स्थिति (Upload Status)")
+            upload_button.click(
+                fn=upload_to_huggingface,
+                inputs=[hf_token, dataset_name],
+                outputs=upload_status
+            )
+        with gr.Tab("जानकारी (Information)"):
+            gr.Markdown("""
+            ## नेपाली ASR डाटा संकलन प्रोजेक्टको बारेमा
+            यो प्रोजेक्टले नेपाली भाषाको स्वचालित भाषण पहिचान (ASR) प्रविधिको विकासका लागि आवश्यक डाटा संकलन गर्दछ।
+            तपाईंको योगदानले नेपाली भाषा प्रविधिको विकासमा ठूलो मद्दत पुर्‍याउनेछ।
+            ### कसरी योगदान दिने:
+            1. **स्वतन्त्र पाठ (Free Text)** ट्याबमा, तपाईं आफ्नो इच्छा अनुसार पाठ लेखेर त्यसलाई बोल्न सक्नुहुन्छ।
+            2. **निर्देशित पाठ (Prompted Text)** ट्याबमा, तपाईंले दिइएको पाठलाई पढेर रेकर्ड गर्न सक्नुहुन्छ।
+            3. रेकर्डिङ पछि, "सुरक्षित गर्नुहोस्" बटनमा क्लिक गर्नुहोस्।
+            ### गोपनीयता नीति:
+            - त��ाईंको आवाज रेकर्डिङ र मेटाडाटा सार्वजनिक अनुसन्धान उद्देश्यका लागि प्रयोग गरिनेछ।
+            - कृपया व्यक्तिगत पहिचान गर्न सकिने जानकारी शेयर नगर्नुहोस्।
+            - यो डाटासेट खुला स्रोत हुनेछ र हगिङफेसमा प्रकाशित गरिनेछ।
+            ---
+            ## About Nepali ASR Data Collection Project
+            This project collects necessary data for the development of Nepali Automatic Speech Recognition (ASR) technology.
+            Your contribution will greatly help in advancing Nepali language technology.
+            ### How to Contribute:
+            1. In the **Free Text** tab, you can type any text you want and record yourself speaking it.
+            2. In the **Prompted Text** tab, you can record yourself reading the provided text.
+            3. After recording, click the "Save" button.
+            ### Privacy Policy:
+            - Your voice recordings and metadata will be used for public research purposes.
+            - Please do not share personally identifiable information.
+            - This dataset will be open-source and published on Hugging Face.
+            """)
+        # Initialize the count
+        app.load(fn=update_count, inputs=None, outputs=count_display)
+    return app
+# Launch the app: build the interface and start the Gradio server only when this file is run as a script
+if __name__ == "__main__":
+    app = build_ui()
+    app.launch()