# Spaces: Sleeping  (apparently a web-page status header captured with the script; not code)
from urllib.parse import quote

import fsspec
import pandas as pd
import pyarrow.parquet as pq
from huggingface_hub import list_repo_files
# ==================================
# Configuration: dataset repository to scan and where to write the report.
REPO_ID = "OMCHOKSI108/my-cloud-data-lake"
OUTPUT_FILE = "hf_dataset_structure_report.csv"

# Column order of the report. Declared explicitly so the DataFrame has these
# columns even when no parquet file could be inspected.
REPORT_COLUMNS = [
    "file_path",
    "folder",
    "file_name",
    "num_columns",
    "num_rows",
    "columns",
]


def split_repo_path(file_path: str) -> tuple:
    """Return ``(folder, file_name)`` for a repository-relative path.

    ``folder`` is the first path component and ``file_name`` the last one.
    A file stored at the repository root has no folder, so ``folder`` is the
    empty string instead of a copy of the file name.
    """
    first, sep, _ = file_path.partition("/")
    folder = first if sep else ""
    file_name = file_path.rsplit("/", 1)[-1]
    return folder, file_name


def inspect_parquet(file_path: str, repo_id: str = REPO_ID) -> dict:
    """Open one remote parquet file and return its report row.

    Args:
        file_path: Path of the parquet file inside the dataset repository.
        repo_id: Dataset repository that holds the file.

    Returns:
        A dict with the keys listed in ``REPORT_COLUMNS``.

    Raises:
        Any exception raised by ``fsspec`` or ``pyarrow`` while opening or
        parsing the file is propagated to the caller.
    """
    # Percent-encode special characters so the path is a valid URL segment;
    # quote() leaves "/" untouched by default, so the directory layout survives.
    encoded_path = quote(file_path)
    hf_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{encoded_path}"

    with fsspec.open(hf_url, "rb") as remote:
        parquet_file = pq.ParquetFile(remote)
        column_names = parquet_file.schema.names
        num_rows = parquet_file.metadata.num_rows

    folder, file_name = split_repo_path(file_path)
    return {
        "file_path": file_path,
        "folder": folder,
        "file_name": file_name,
        "num_columns": len(column_names),
        "num_rows": num_rows,
        "columns": column_names,
    }


def build_report(rows: list) -> pd.DataFrame:
    """Build the report frame from row dicts.

    The columns are always ``REPORT_COLUMNS``, so an empty ``rows`` list
    yields an empty frame that can still be column-selected and saved.
    """
    return pd.DataFrame(rows, columns=REPORT_COLUMNS)


def main() -> None:
    """Scan every parquet file in ``REPO_ID`` and write the structure report."""
    print(f"\nConnecting to HuggingFace dataset: {REPO_ID}\n")
    files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
    parquet_files = [f for f in files if f.endswith(".parquet")]
    print(f"Total parquet files found: {len(parquet_files)}\n")

    summary = []
    for file_path in parquet_files:
        print(f"Inspecting: {file_path}")
        try:
            row = inspect_parquet(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print("Error:", e)
        else:
            summary.append(row)

    df = build_report(summary)
    print("\n===== DATASET STRUCTURE =====\n")
    print(df[["file_name", "folder", "num_columns", "num_rows"]])
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nReport saved as: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()