Spaces:
Sleeping
Sleeping
| # tools/csv_parser.py | |
| # ------------------------------------------------------------ | |
| # Reads a CSV / Excel file (sampling ultra-large CSVs), then | |
| # returns a Markdown report: | |
| # ▸ dimensions ▸ schema & dtypes | |
| # ▸ missing-value map ▸ numeric describe() | |
| # ▸ memory footprint | |
| # If the optional dependency **tabulate** is unavailable, | |
| # it falls back to a plain-text table wrapped in Markdown | |
| # code fences, so no ImportError ever reaches the UI. | |
| from __future__ import annotations | |
| import os | |
| from typing import Union | |
| import numpy as np | |
| import pandas as pd | |
| # ╭──────────────────────────────────────────────────────────╮ | |
| # │ Helper: efficient reader with sampling for huge CSVs     │ | |
| # ╰──────────────────────────────────────────────────────────╯ | |
def _safe_read(
    path: Union[str, bytes, os.PathLike], sample_rows: int = 1_000_000
) -> pd.DataFrame:
    """Load a CSV or Excel file into a DataFrame, sampling very large CSVs.

    Args:
        path: Location of the data. ``str`` and ``os.PathLike`` values are
            inspected for their extension (``.xls`` / ``.xlsx`` select the
            Excel reader); any other object is handed to
            ``pandas.read_csv`` unchanged.
        sample_rows: Maximum number of data rows to load from a CSV that
            lives on the local filesystem. Must be at least 1.

    Returns:
        The whole table, or, for a local CSV holding more than
        ``sample_rows`` data rows, a uniform random sample of exactly
        ``sample_rows`` rows kept in their original file order. The sample
        is drawn with a fixed seed, so repeated calls return the same rows.

    Raises:
        ValueError: If ``sample_rows`` is smaller than 1.
    """
    if sample_rows < 1:
        raise ValueError("sample_rows must be at least 1")

    # Normalise pathlib-style inputs so their extension is honoured too.
    if isinstance(path, os.PathLike):
        path = os.fspath(path)

    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        # openpyxl only understands the .xlsx family; for legacy .xls let
        # pandas pick the engine itself instead of forcing a failing one.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path, engine=engine)

    # --- CSV branch ---------------------------------------------------------
    # Only local files can be line-counted cheaply; URLs, bytes and any other
    # input go straight to a full read.
    if not (is_str and os.path.isfile(path)):
        return pd.read_csv(path)

    # Stream the file once to count physical lines without loading it.
    # NOTE(review): quoted fields containing newlines would inflate this
    # count - confirm such files are not expected here.
    with open(path, "rb") as fh:
        n_lines = sum(1 for _ in fh)

    # The first line is the header, so it is not a candidate for sampling.
    n_data = n_lines - 1
    if n_data <= sample_rows:
        return pd.read_csv(path)

    # Choose which data lines to drop. Line 0 (the header) is never dropped,
    # hence the +1 shift from 0..n_data-1 to 1..n_data.
    rng = np.random.default_rng(seed=42)
    n_skip = n_data - sample_rows
    skip = np.sort(rng.choice(n_data, size=n_skip, replace=False) + 1).tolist()
    return pd.read_csv(path, skiprows=skip)
| # ╭──────────────────────────────────────────────────────────╮ | |
| # │ Main public helper                                       │ | |
| # ╰──────────────────────────────────────────────────────────╯ | |
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """Build a Markdown summary of a tabular file for display in Streamlit.

    The report contains, in order:
      - a dimensions / memory table
      - the column schema with dtypes
      - per-column missing-value counts with percentages
      - ``describe()`` statistics for the numeric columns

    Args:
        path: Input forwarded unchanged to ``_safe_read``.

    Returns:
        The Markdown report, or a one-line failure message when loading
        the data raised an exception.
    """
    # NOTE(review): several string literals below contain what looks like
    # mis-encoded (mojibake) text; they are reproduced verbatim here -
    # confirm the intended characters against the original file.
    try:
        frame = _safe_read(path)
    except Exception as exc:
        # UI boundary: report the problem as text instead of raising.
        return f"โ Failed to load data: {exc}"

    n_rows, n_cols = frame.shape
    total_bytes = frame.memory_usage(deep=True).sum()
    mem_mb = total_bytes / 1024**2

    # -- Schema --------------------------------------------------------------
    schema_md = "\n".join(
        [
            f"- **{name}** โ `{kind}`"
            for name, kind in zip(frame.columns, frame.dtypes)
        ]
    )

    # -- Missing map ---------------------------------------------------------
    na_counts = frame.isna().sum()
    na_share = (na_counts / len(frame) * 100).round(1)
    missing_lines = [
        f"- **{name}**: {na_counts[name]}ย ({na_share[name]}โฏ%)"
        for name in frame.columns
        if na_counts[name] > 0
    ]
    missing_md = "\n".join(missing_lines) if missing_lines else "None"

    # -- Numeric describe() --------------------------------------------------
    numeric = frame.select_dtypes("number")
    if numeric.empty:
        desc_md = "_No numeric columns_"
    else:
        stats = numeric.describe().T.round(2)
        try:
            # to_markdown() needs the optional 'tabulate' package.
            desc_md = stats.to_markdown()
        except ImportError:
            # Fall back to a fenced plain-text table without the dependency.
            desc_md = "```text\n" + stats.to_string() + "\n```"

    # -- Assemble markdown ---------------------------------------------------
    report_lines = [
        "# ๐ย Dataset Overview",
        "| metric | value",
        "| ------ | -----",
        f"| Rows | {n_rows:,}",
        f"| Columns| {n_cols}",
        f"| Memory | {mem_mb:.2f}ย MB",
        "## ๐ย Schema & Dtypes",
        schema_md,
        "## ๐ ย Missing Values",
        missing_md,
        "## ๐ย Descriptive Statisticsย (numeric)",
        desc_md,
    ]
    return "\n".join(report_lines).strip()