Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| from io import BytesIO | |
def _conversion_plan(name):
    """Pick the reader and output target for *name* based on its extension.

    Returns a ``(reader, converted_format, output_file)`` tuple, or ``None``
    when *name* ends in neither ``.csv`` nor ``.parquet`` (case-insensitive).
    """
    lowered = name.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv, "Parquet", "output.parquet"
    if lowered.endswith(".parquet"):
        return pd.read_parquet, "CSV", "output.csv"
    return None


def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV.

    The input is either an uploaded file or a URL on huggingface.co. When
    both are supplied the uploaded file wins. The input format is detected
    from the file extension and the data is written in the opposite format
    to ``output.parquet`` / ``output.csv`` in the current working directory.

    Args:
        input_file: The uploaded file, or ``None``. May be a file-like
            object exposing ``.name`` and ``.read()``, or a filesystem path
            (``str`` / path-like).
        file_url: URL of a ``.csv`` or ``.parquet`` file hosted on
            huggingface.co, or ``None`` / blank. A missing scheme defaults
            to ``https://``.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the converted
        file and a text summary containing the input name, the converted
        format and a preview of the first 10 rows.

    Raises:
        ValueError: If neither input is provided, the extension is not
            ``.csv`` / ``.parquet``, or the URL host is not huggingface.co.
        requests.HTTPError: If the download returns an error status.
    """
    from urllib.parse import urlparse

    has_url = bool(file_url and file_url.strip())
    if input_file is None and not has_url:
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        # The upload may be a file-like object (with .name) or a plain path.
        source = getattr(input_file, "name", None) or str(input_file)
        plan = _conversion_plan(source)
        if plan is None:
            raise ValueError("Uploaded file must have a .csv or .parquet extension.")
        if hasattr(input_file, "read"):
            payload = input_file.read()
        else:
            with open(input_file, "rb") as handle:
                payload = handle.read()
    else:
        file_url = file_url.strip()
        if not file_url.lower().startswith(("http://", "https://")):
            file_url = "https://" + file_url
        parsed = urlparse(file_url)
        # Check the real hostname: a substring test would also accept
        # e.g. "evil.example/huggingface.co/x.csv".
        host = (parsed.hostname or "").lower()
        if host != "huggingface.co" and not host.endswith(".huggingface.co"):
            raise ValueError("Please provide a URL from Hugging Face datasets.")
        source = parsed.path.rsplit("/", 1)[-1]
        # Match on the path only so "file.csv?download=true" is recognised,
        # and validate before downloading anything.
        plan = _conversion_plan(parsed.path)
        if plan is None:
            raise ValueError("The URL must point to a .csv or .parquet file.")
        # A timeout keeps a stalled server from blocking the worker forever.
        response = requests.get(file_url, timeout=60)
        response.raise_for_status()
        payload = response.content

    reader, converted_format, output_file = plan
    df = reader(BytesIO(payload))

    # Write the data in the opposite format.
    if converted_format == "Parquet":
        df.to_parquet(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    # Create a preview (top 10 rows) of the DataFrame.
    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message
# Input widgets: an optional upload and an optional Hugging Face URL.
_INPUT_COMPONENTS = [
    gr.File(label="Uploaded File (Optional)"),
    gr.Textbox(
        label="Hugging Face Dataset URL (Optional)",
        placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
    ),
]

# Output widgets: the converted file and the text summary/preview.
_OUTPUT_COMPONENTS = [
    gr.File(label="Converted File"),
    gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
]

# Description text shown in the interface.
_DESCRIPTION = (
    "Upload a file or enter the URL of a Hugging Face dataset file. "
    "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
    "and displays a preview of the top 10 rows."
)

# Wire the converter function into a Gradio interface.
demo = gr.Interface(
    fn=convert_hf_dataset,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENTS,
    title="Hugging Face CSV <-> Parquet Converter",
    description=_DESCRIPTION,
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()