Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from datasets import load_dataset | |
| # Hub client helper used to look up the available datasets | |
| from huggingface_hub import list_datasets | |
| import pandas as pd | |
| import io | |
def list_available_datasets(query: str):
    """Return up to 50 Hub dataset ids that match ``query``.

    The filtering is delegated to the Hub through the ``search`` and ``limit``
    arguments of ``list_datasets`` instead of fetching the complete listing
    and filtering it locally. ``list_datasets`` yields ``DatasetInfo``
    entries, so the id is read from each entry; a substring test against the
    entry object itself raises ``TypeError``.

    Args:
        query: Text to look for in dataset ids. Empty, whitespace-only or
            ``None`` input applies no filter.

    Returns:
        A list of at most 50 dataset id strings.
    """
    search_term = (query or "").strip() or None
    return [info.id for info in list_datasets(search=search_term, limit=50)]
def explore_dataset(dataset_name: str, split: str, num_examples: int):
    """Load one split of a dataset and summarise it for display.

    Args:
        dataset_name: Id of the dataset to load. May be empty or ``None``
            when nothing has been selected yet.
        split: Split name passed to ``load_dataset``.
        num_examples: Upper bound on the number of leading rows to return.
            Coerced to ``int`` and clamped to ``[0, len(dataset)]``.

    Returns:
        A ``(schema, examples, stats)`` tuple where ``schema`` maps each
        column name to the string form of its feature type, ``examples`` is a
        pandas DataFrame with the first rows, and ``stats`` holds the total
        row count under ``"Anzahl Samples"`` plus the schema entries. All
        three are empty when no dataset name is given.
    """
    if not dataset_name:
        # Nothing selected: hand back empty outputs rather than failing
        # inside load_dataset with an unhelpful error.
        return {}, pd.DataFrame(), {}

    ds = load_dataset(dataset_name, split=split)

    # Column name -> feature type, built once and reused for the statistics.
    schema = {col: str(ds.features[col]) for col in ds.column_names}

    # range() only accepts ints; a UI slider may deliver the value as a float.
    row_count = max(0, min(len(ds), int(num_examples)))
    examples = ds.select(range(row_count)).to_pandas()

    stats = {"Anzahl Samples": len(ds), **schema}
    return schema, examples, stats
def export_column(dataset_name: str, split: str, column: str):
    """Render a single column of a dataset split as CSV text.

    Args:
        dataset_name: Id of the dataset to load.
        split: Split name passed to ``load_dataset``.
        column: Name of the column to export.

    Returns:
        A ``(status_message, csv_text)`` tuple. ``csv_text`` is ``""`` when
        no dataset or column was given, or when the column does not exist.
    """
    if not dataset_name:
        return "Kein Dataset ausgewählt.", ""
    if not column:
        # No column chosen: same answer as for an unknown column, but
        # without paying for the dataset load first.
        return "Spalte nicht gefunden.", ""

    ds = load_dataset(dataset_name, split=split)
    if column not in ds.column_names:
        return "Spalte nicht gefunden.", ""

    # Indexing a Dataset by column name yields the column's values (a plain
    # list in datasets < 4.0), which has no to_pandas(); build the frame
    # explicitly so the export works regardless of the returned container.
    df = pd.DataFrame({column: list(ds[column])})
    csv_text = df.to_csv(index=False)
    return f"CSV für Spalte '{column}' erzeugt.", csv_text
def _search_choices(query):
    """Run the dataset search and offer the hits as dropdown choices.

    Returning a bare list to a Dropdown output sets its *value*, not its
    choices, so the hits are wrapped in an explicit update.
    """
    return gr.update(choices=list_available_datasets(query), value=None)


def _column_choices(name, split_name):
    """Offer the columns of the selected dataset split as dropdown choices."""
    if not name:
        return gr.update(choices=[], value=None)
    columns = load_dataset(name, split=split_name).column_names
    return gr.update(choices=columns, value=None)


with gr.Blocks() as demo:
    gr.Markdown("## 📊 DataScout – Hugging Face Dataset Explorer")

    # NOTE(review): the original indentation was not preserved; the row is
    # assumed to hold only the search box and its button — confirm.
    with gr.Row():
        query = gr.Textbox(label="Dataset suchen", placeholder="z.B. imdb")
        search_btn = gr.Button("🔍 Suchen")

    results = gr.Dropdown(label="Gefundene Datasets", choices=[], interactive=True)
    split = gr.Dropdown(
        label="Split wählen",
        choices=["train", "test", "validation"],
        value="train",
    )
    num_examples = gr.Slider(
        label="Anzahl Beispiele", minimum=1, maximum=20, value=5, step=1
    )

    explore_btn = gr.Button("👁️ Dataset erkunden")
    schema_out = gr.JSON(label="Schema")
    examples_out = gr.Dataframe(label="Beispiele")
    stats_out = gr.JSON(label="Statistiken")

    col_dropdown = gr.Dropdown(
        label="Spalte für CSV-Export", choices=[], interactive=True
    )
    export_btn = gr.Button("📥 CSV erzeugen")
    export_msg = gr.Textbox(label="Status")
    export_csv = gr.TextArea(label="CSV-Ausgabe", lines=10)

    # Events
    search_btn.click(fn=_search_choices, inputs=query, outputs=results)
    explore_btn.click(
        fn=explore_dataset,
        inputs=[results, split, num_examples],
        outputs=[schema_out, examples_out, stats_out],
    )
    # Columns are read from the split the user selected (previously always
    # "train"), matching what the explore and export actions load.
    results.change(fn=_column_choices, inputs=[results, split], outputs=col_dropdown)
    export_btn.click(
        fn=export_column,
        inputs=[results, split, col_dropdown],
        outputs=[export_msg, export_csv],
    )

if __name__ == "__main__":
    demo.launch()