Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| from datasets import load_dataset, Dataset | |
| import pandas as pd | |
| from huggingface_hub import login | |
def load_huggingface_dataset(dataset_link, token):
    """
    Load the ``train`` split of a Hugging Face dataset for transcription editing.

    Args:
        dataset_link: Dataset identifier passed straight to
            ``datasets.load_dataset``.
        token: Hugging Face access token. An empty value is passed as
            ``None`` so the library falls back to its default credentials.

    Returns:
        A ``(table, dataset)`` tuple. ``table`` is a DataFrame with the
        columns ``index`` (row position in the split) and ``transcription``;
        ``dataset`` is the loaded split itself.

    Raises:
        ValueError: If the split has no ``transcription`` column.
    """
    # Forward the token so private or gated repositories can be read; the
    # parameter used to be accepted but silently ignored.
    dataset = load_dataset(dataset_link, split="train", token=token or None)

    if "transcription" not in dataset.column_names:
        raise ValueError(
            f"Dataset '{dataset_link}' has no 'transcription' column "
            f"(columns: {dataset.column_names})."
        )

    # Only the transcription text is edited, so convert just that column
    # instead of materialising every other column in pandas as well.
    df = dataset.select_columns(["transcription"]).to_pandas().reset_index()

    # Return an explicit copy: the caller later writes into this frame with
    # ``.loc`` and must not be operating on a view of a temporary.
    return df[["index", "transcription"]].copy(), dataset
def update_transcriptions(df, dataset, token, dataset_link):
    """
    Write edited transcriptions into the dataset and push it to the Hub.

    Rows of ``df`` are matched to dataset rows by position: row ``i`` of
    ``df`` supplies the transcription for example ``i``. Rows of ``df``
    beyond the dataset length are ignored. After the update, examples whose
    transcription is missing, empty or whitespace-only are dropped.

    Args:
        df: DataFrame with a ``transcription`` column, one row per example.
        dataset: The dataset object whose ``transcription`` column is replaced.
        token: Hugging Face access token used for the upload.
        dataset_link: Repository id the updated dataset is pushed to.

    Returns:
        A confirmation message for display in the UI.

    Raises:
        ValueError: If ``df`` has fewer rows than ``dataset``.
    """
    # Pull the column out once. Missing cells (None/NaN) become empty strings
    # so they are removed by the filter below instead of crashing ``strip``.
    transcriptions = [
        "" if pd.isna(value) else str(value) for value in df["transcription"]
    ]

    if len(transcriptions) < len(dataset):
        raise ValueError(
            f"Expected at least {len(dataset)} transcriptions, "
            f"got {len(transcriptions)}."
        )

    # Replace the transcription of every example with the edited text.
    # Indexing a plain list keeps this linear in the number of examples.
    dataset = dataset.map(
        lambda example, idx: {"transcription": transcriptions[idx]},
        with_indices=True,
    )

    # Keep only examples that still have a non-blank transcription.
    dataset = dataset.filter(
        lambda example: (example["transcription"] or "").strip() != "",
    )

    # Hand the token to this single upload rather than calling
    # ``huggingface_hub.login``, which registers the credential beyond this
    # call (see the huggingface_hub documentation).
    dataset.push_to_hub(dataset_link, token=token or None)
    return "Dataset updated and changes submitted to the Hugging Face Hub!"
| # Gradio Interface | |
def main():
    """
    Build the transcription-editor UI with Gradio and launch it.

    The UI loads a dataset, shows its ``index``/``transcription`` table,
    lets the user search and edit it, and pushes the result back to the Hub.
    """
    # State shared by the event handlers below through the closure.
    # NOTE(review): there is a single copy of these variables for the whole
    # process, so every session that triggers the handlers reads and writes
    # the same dataset - consider per-session state if that matters.
    dataset = None  # the loaded dataset object
    original_df = None  # full, unfiltered table; also receives merged edits

    def load_dataset_and_show_table(dataset_link, token):
        """Load the dataset, remember it, and return the table to display."""
        nonlocal dataset, original_df
        original_df, dataset = load_huggingface_dataset(dataset_link, token)
        return original_df

    def search_transcriptions(search_term):
        """Return the rows whose transcription contains ``search_term``.

        Matching is case-insensitive and literal: the term is not interpreted
        as a regular expression, so characters such as ``(`` or ``*`` are
        safe to type.
        """
        if original_df is None:
            # Empty table if no dataset is loaded.
            return pd.DataFrame(columns=["index", "transcription"])
        matches = original_df["transcription"].str.contains(
            search_term or "", case=False, na=False, regex=False
        )
        return original_df[matches]

    def update_original(df):
        """Merge the edited rows of ``df`` into the full table by ``index``."""
        if original_df is None:
            return "No dataset loaded to update."
        for row_index, text in zip(df["index"], df["transcription"]):
            # Locate the row of the full table carrying the same index value.
            original_df.loc[original_df["index"] == row_index, "transcription"] = text
        return "update Successful"

    def submit_changes(df, token, dataset_link):
        """
        Submit updated changes to the Hugging Face Hub.

        The displayed table may be a filtered subset produced by the search
        box, so its rows are always merged into the full table by ``index``
        and the full table is what gets uploaded.
        """
        if dataset is None:
            return "No dataset loaded to update."
        update_original(df)
        return update_transcriptions(original_df, dataset, token, dataset_link)

    # Gradio Interface
    with gr.Blocks(css=".dataframe-row { height: 200px; }") as interface:
        gr.Markdown("## Hugging Face Audio Dataset Editor")

        # Input fields for dataset link and token
        dataset_link = gr.Textbox(label="Hugging Face Dataset Link")
        hf_token = gr.Textbox(label="Hugging Face Token", type="password")

        # Button to load dataset
        load_button = gr.Button("Load Dataset")

        # Search bar
        search_box = gr.Textbox(
            label="Search Transcriptions", placeholder="Enter a search term..."
        )

        # Table to display and edit dataset
        table = gr.Dataframe(
            headers=["index", "transcription"],
            datatype=["number", "str"],
            interactive=True,
            label="Edit Dataset (Transcriptions are RTL)",
        )
        update_button = gr.Button("Update Table")

        # Button to submit changes
        submit_button = gr.Button("Submit Changes")
        update_message = gr.Textbox(label="update message")
        output_message = gr.Textbox(label="Message")

        # RTL styling for transcription column
        # NOTE(review): this only assigns a plain attribute on the component
        # object; confirm it has any visual effect - right-to-left rendering
        # may need to be expressed through the ``css`` argument instead.
        table.style = {"transcription": {"direction": "rtl"}}

        # Button functionality
        load_button.click(load_dataset_and_show_table, [dataset_link, hf_token], table)
        search_box.change(search_transcriptions, search_box, table)
        update_button.click(update_original, [table], update_message)
        submit_button.click(
            submit_changes, [table, hf_token, dataset_link], output_message
        )

    # Launch Gradio Interface
    interface.launch()
# Start the editor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()