# Page header residue: Spaces / Running / File size: 1,553 Bytes / aac542c
import pandas as pd
import pyarrow.parquet as pq
import fsspec
from huggingface_hub import list_repo_files
from urllib.parse import quote
# ==================================
REPO_ID = "OMCHOKSI108/my-cloud-data-lake"
OUTPUT_FILE = "hf_dataset_structure_report.csv"

# Column order of the report. Passing this explicitly to the DataFrame keeps
# the columns present even when no file could be inspected, so the column
# selection and CSV export below never fail on an empty result.
REPORT_COLUMNS = [
    "file_path",
    "folder",
    "file_name",
    "num_columns",
    "num_rows",
    "columns",
]


def hf_resolve_url(repo_id: str, file_path: str) -> str:
    """Return the ``resolve/main`` download URL for *file_path* in a dataset repo.

    The path is percent-encoded with ``urllib.parse.quote`` so that spaces and
    other special characters survive in the URL; ``/`` separators are kept.
    """
    encoded_path = quote(file_path)
    return f"https://huggingface.co/datasets/{repo_id}/resolve/main/{encoded_path}"


def split_location(file_path: str) -> tuple:
    """Split a repo-relative path into ``(top_level_folder, file_name)``.

    A file that sits at the repository root has no folder, so the folder is
    reported as ``""`` rather than repeating the file name.
    """
    folder, separator, _ = file_path.partition("/")
    if not separator:
        folder = ""
    file_name = file_path.rsplit("/", 1)[-1]
    return folder, file_name


def inspect_parquet(repo_id: str, file_path: str) -> dict:
    """Open one remote parquet file and return its report row.

    The row holds the path, its folder and file name, the number of schema
    names, the row count taken from the parquet metadata, and the list of
    schema names. Any exception raised while opening or parsing the file
    propagates to the caller.
    """
    with fsspec.open(hf_resolve_url(repo_id, file_path), "rb") as f:
        parquet_file = pq.ParquetFile(f)
        # Pull everything we need while the remote handle is still open.
        column_names = parquet_file.schema.names
        num_rows = parquet_file.metadata.num_rows

    folder, file_name = split_location(file_path)
    return {
        "file_path": file_path,
        "folder": folder,
        "file_name": file_name,
        "num_columns": len(column_names),
        "num_rows": num_rows,
        "columns": column_names,
    }


def build_report(summary: list) -> pd.DataFrame:
    """Turn the collected rows into a DataFrame with the fixed report columns.

    An empty *summary* yields an empty frame that still has every column in
    ``REPORT_COLUMNS``.
    """
    return pd.DataFrame(summary, columns=REPORT_COLUMNS)


def main() -> None:
    """List the dataset's parquet files, inspect each one, and save the report.

    Progress and the resulting table are printed; the full report is written
    to ``OUTPUT_FILE`` as CSV.
    """
    print(f"\nConnecting to HuggingFace dataset: {REPO_ID}\n")
    files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
    parquet_files = [f for f in files if f.endswith(".parquet")]
    print(f"Total parquet files found: {len(parquet_files)}\n")

    summary = []
    for file_path in parquet_files:
        print(f"Inspecting: {file_path}")
        try:
            summary.append(inspect_parquet(REPO_ID, file_path))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print("Error:", e)

    df = build_report(summary)
    print("\n===== DATASET STRUCTURE =====\n")
    print(df[["file_name", "folder", "num_columns", "num_rows"]])
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nReport saved as: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|