forexdatalake / testdata.py
OMCHOKSI108's picture
code
aac542c
import pandas as pd
import pyarrow.parquet as pq
import fsspec
from huggingface_hub import list_repo_files
from urllib.parse import quote
# ==================================
REPO_ID = "OMCHOKSI108/my-cloud-data-lake"
OUTPUT_FILE = "hf_dataset_structure_report.csv"

# Column order of the report. Passing this to the DataFrame constructor keeps
# the columns present even when no parquet file could be inspected, so the
# column selection and CSV export below never fail on an empty result.
_REPORT_COLUMNS = [
    "file_path",
    "folder",
    "file_name",
    "num_columns",
    "num_rows",
    "columns",
]


def _inspect_parquet(file_path):
    """Build one report row for the parquet file at *file_path* in REPO_ID.

    The file is opened over HTTPS through fsspec and handed to
    ``pq.ParquetFile``; the column names and the row count are taken from
    its schema and metadata.

    Args:
        file_path: Path of the file inside the dataset repo, using "/" as
            the separator (as returned by ``list_repo_files``).

    Returns:
        A dict with the keys listed in ``_REPORT_COLUMNS``.

    Raises:
        Whatever fsspec or pyarrow raise when the file cannot be opened or
        parsed; the caller decides how to handle it.
    """
    # Percent-encode special characters (spaces, "#", ...) so the URL is
    # valid; quote() leaves "/" untouched by default.
    encoded_path = quote(file_path)
    hf_url = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/{encoded_path}"

    with fsspec.open(hf_url, "rb") as remote_file:
        parquet_file = pq.ParquetFile(remote_file)
        column_names = parquet_file.schema.names
        num_rows = parquet_file.metadata.num_rows

    # A file stored at the repo root has no folder; report an empty string
    # instead of repeating the file name.
    top_level, separator, _ = file_path.partition("/")
    folder = top_level if separator else ""

    return {
        "file_path": file_path,
        "folder": folder,
        "file_name": file_path.rsplit("/", 1)[-1],
        "num_columns": len(column_names),
        "num_rows": num_rows,
        "columns": column_names,
    }


summary = []

print(f"\nConnecting to HuggingFace dataset: {REPO_ID}\n")
files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
parquet_files = [f for f in files if f.endswith(".parquet")]
print(f"Total parquet files found: {len(parquet_files)}\n")

for file_path in parquet_files:
    print(f"Inspecting: {file_path}")
    try:
        summary.append(_inspect_parquet(file_path))
    except Exception as e:
        # Best-effort scan: one unreadable file must not abort the report,
        # so the error is printed and the loop moves on to the next file.
        print("Error:", e)

# Explicit columns: with an empty summary the frame still has the expected
# headers, so the selection below does not raise KeyError.
df = pd.DataFrame(summary, columns=_REPORT_COLUMNS)

print("\n===== DATASET STRUCTURE =====\n")
print(df[["file_name", "folder", "num_columns", "num_rows"]])

df.to_csv(OUTPUT_FILE, index=False)
print(f"\nReport saved as: {OUTPUT_FILE}")