import pandas as pd import pyarrow.parquet as pq import fsspec from huggingface_hub import list_repo_files from urllib.parse import quote # ================================== REPO_ID = "OMCHOKSI108/my-cloud-data-lake" OUTPUT_FILE = "hf_dataset_structure_report.csv" summary = [] print(f"\nConnecting to HuggingFace dataset: {REPO_ID}\n") files = list_repo_files(repo_id=REPO_ID, repo_type="dataset") parquet_files = [f for f in files if f.endswith(".parquet")] print(f"Total parquet files found: {len(parquet_files)}\n") for file_path in parquet_files: print(f"Inspecting: {file_path}") try: # 🔥 ENCODE SPECIAL CHARACTERS encoded_path = quote(file_path) hf_url = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/{encoded_path}" with fsspec.open(hf_url, "rb") as f: parquet_file = pq.ParquetFile(f) schema = parquet_file.schema num_rows = parquet_file.metadata.num_rows summary.append({ "file_path": file_path, "folder": file_path.split("/")[0], "file_name": file_path.split("/")[-1], "num_columns": len(schema.names), "num_rows": num_rows, "columns": schema.names }) except Exception as e: print("Error:", e) df = pd.DataFrame(summary) print("\n===== DATASET STRUCTURE =====\n") print(df[["file_name", "folder", "num_columns", "num_rows"]]) df.to_csv(OUTPUT_FILE, index=False) print(f"\nReport saved as: {OUTPUT_FILE}")