Spaces:

mgbam
/

BizIntel_AI

Sleeping

App Files Files Community

BizIntel_AI / tools /csv_parser.py

mgbam

Update tools/csv_parser.py

7453b19 verified 10 months ago

raw

history blame contribute delete

4.37 kB

	# tools/csv_parser.py
	# ------------------------------------------------------------
	# Reads a CSV / Excel file (sampling ultra‑large CSVs), then
	# returns a Markdown report:
	# ▸ dimensions ▸ schema & dtypes
	# ▸ missing‑value map ▸ numeric describe()
	# ▸ memory footprint
	# If the optional dependency tabulate is unavailable,
	# it falls back to a plain‑text table wrapped in Markdown
	# code fences, so no ImportError ever reaches the UI.

	from __future__ import annotations

	import os
	from typing import Union

	import numpy as np
	import pandas as pd


	# ╭──────────────────────────────────────────────────────────╮
	# │ Helper: efficient reader with sampling for huge CSVs │
	# ╰──────────────────────────────────────────────────────────╯
	def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
	    """Load a CSV / Excel file, down-sampling CSVs with more than *sample_rows* rows.

	    Args:
	        path: A ``str`` filesystem path. The extension picks the reader:
	            ``.xlsx`` and ``.xls`` go to ``pd.read_excel``; anything else is
	            parsed as CSV. A non-``str`` value is forwarded to
	            ``pd.read_csv`` unchanged, with no line counting and therefore
	            no sampling.
	        sample_rows: Upper bound on the number of data rows (header
	            excluded) kept from a CSV given by path.

	    Returns:
	        The loaded frame. For an over-sized CSV this is a uniform random
	        sample of rows in original file order; the RNG seed is fixed, so
	        the same file always yields the same sample.

	    Raises:
	        Whatever ``open``, ``pd.read_csv`` or ``pd.read_excel`` raise; no
	        exception is handled here.
	    """
	    is_str = isinstance(path, str)
	    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

	    if ext == ".xlsx":
	        return pd.read_excel(path, engine="openpyxl")
	    if ext == ".xls":
	        # openpyxl cannot open legacy .xls workbooks, so let pandas choose
	        # the engine instead of forcing one that is guaranteed to fail.
	        return pd.read_excel(path)

	    # --- CSV branch --------------------------------------------------------
	    if not is_str:
	        # NOTE(review): bytes are handed to pandas as-is; confirm callers
	        # pass something pd.read_csv accepts.
	        return pd.read_csv(path)

	    # Streaming line count: one pass over the raw bytes, nothing is parsed
	    # or kept in memory. This counts physical lines, so quoted fields with
	    # embedded newlines (or blank lines) make it over-estimate the rows and
	    # the sample can then come out slightly smaller than sample_rows.
	    with open(path, "rb") as fh:
	        n_lines = sum(1 for _ in fh)

	    # Line 0 is the header; only the remaining lines are candidate rows.
	    n_data = n_lines - 1
	    if n_data <= sample_rows:
	        return pd.read_csv(path)

	    # Choose which data lines to drop so exactly sample_rows remain. The
	    # header index (0) is never a candidate, so column names survive.
	    rng = np.random.default_rng(seed=42)
	    dropped = rng.choice(
	        np.arange(1, n_lines), size=n_data - sample_rows, replace=False
	    )
	    return pd.read_csv(path, skiprows=np.sort(dropped).tolist())


	# ╭──────────────────────────────────────────────────────────╮
	# │ Main public helper │
	# ╰──────────────────────────────────────────────────────────╯
	def parse_csv_tool(path: Union[str, bytes]) -> str:
	    """Return a Markdown profile of a CSV / Excel file that Streamlit can render.

	    Sections, in order:
	      • Dimensions table (rows, columns, memory)
	      • Schema & dtypes
	      • Missing-value counts (+%)
	      • Numeric describe()

	    Args:
	        path: Passed straight to ``_safe_read``.

	    Returns:
	        The Markdown report. If loading fails, the exception is not raised;
	        a one-line ``❌ Failed to load data: ...`` message is returned so
	        the caller always gets a displayable string.
	    """
	    try:
	        df = _safe_read(path)
	    except Exception as exc:  # UI boundary: report the failure as text
	        return f"❌ Failed to load data: {exc}"

	    rows, cols = df.shape
	    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

	    # ── Schema -------------------------------------------------------------
	    schema_md = "\n".join(
	        f"- {col} – `{dtype}`" for col, dtype in df.dtypes.items()
	    )

	    # ── Missing map --------------------------------------------------------
	    miss_ct = df.isna().sum()
	    miss_pct = (miss_ct / len(df) * 100).round(1)
	    # Walk the two series in lockstep instead of looking values up by label,
	    # so duplicate column labels cannot turn a scalar into a Series.
	    missing_lines = [
	        f"- {col}: {count} ({pct} %)"
	        for col, count, pct in zip(miss_ct.index, miss_ct, miss_pct)
	        if count > 0
	    ]
	    missing_md = "\n".join(missing_lines) or "None"

	    # ── Numeric describe() -------------------------------------------------
	    numeric_df = df.select_dtypes("number")
	    if numeric_df.empty:
	        desc_md = "_No numeric columns_"
	    else:
	        stats = numeric_df.describe().T.round(2)  # computed once for both paths
	        try:
	            # requires the optional 'tabulate' package
	            desc_md = stats.to_markdown()
	        except ImportError:
	            # graceful fallback without the extra dependency
	            desc_md = "```text\n" + stats.to_string() + "\n```"

	    # ── Assemble markdown --------------------------------------------------
	    # Built line by line so the output never depends on how this source file
	    # is indented, and with plain "|" so Markdown sees a real table (a
	    # backslash-escaped pipe renders as text and is an invalid Python escape).
	    report_lines = [
	        "# 📊 Dataset Overview",
	        "",
	        "| metric | value |",
	        "| ------ | ----- |",
	        f"| Rows | {rows:,} |",
	        f"| Columns| {cols} |",
	        f"| Memory | {mem_mb:.2f} MB |",
	        "",
	        "## 🗂 Schema & Dtypes",
	        schema_md,
	        "",
	        "## 🛠 Missing Values",
	        missing_md,
	        "",
	        "## 📈 Descriptive Statistics (numeric)",
	        desc_md,
	    ]
	    return "\n".join(report_lines).strip()