Spaces:
Running
Running
| from __future__ import annotations | |
| import csv | |
| from dataclasses import dataclass | |
| from random import Random | |
| from typing import List, Tuple, Dict, Any | |
@dataclass
class Dataset:
    """A train/validation split of a binary-classification dataset.

    Attributes:
        x_train: Feature vectors for the training split.
        y_train: Integer labels (0/1) aligned with ``x_train``.
        x_val: Feature vectors for the validation split.
        y_val: Integer labels (0/1) aligned with ``x_val``.
        true_w: Ground-truth weight vector for synthetic data;
            empty list when the data came from a CSV (unknown).
        true_b: Ground-truth bias for synthetic data; 0.0 for CSV data.
        feature_names: Column names for the feature dimensions,
            or None when not provided.
    """

    # NOTE: the @dataclass decorator is required — this module constructs
    # Dataset(...) with keyword arguments, which needs the generated __init__.
    x_train: List[List[float]]
    y_train: List[int]
    x_val: List[List[float]]
    y_val: List[int]
    true_w: List[float]
    true_b: float
    # `from __future__ import annotations` makes the `| None` union syntax
    # safe on older interpreters (annotation is never evaluated).
    feature_names: List[str] | None = None
| def _dot(a: List[float], b: List[float]) -> float: | |
| return sum(x * y for x, y in zip(a, b)) | |
def load_csv_dataset(*, csv_path: str, target_col: str, seed: int, train_ratio: float) -> Dataset:
    """Load a numeric CSV and split it into train/validation sets.

    Args:
        csv_path: Path to a CSV file with a header row.
        target_col: Name of the label column; when not found in the header
            the last column is used instead.
        seed: Seed for the deterministic shuffle performed before splitting.
        train_ratio: Fraction of rows assigned to the training split
            (clamped so each split gets at least one row).

    Returns:
        A Dataset with ``true_w=[]`` and ``true_b=0.0`` (ground truth is
        unknown for real data).

    Raises:
        ValueError: If no row of the file parses as fully numeric.
    """
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        header = next(reader)
        try:
            target_idx = header.index(target_col)
        except ValueError:
            # Fall back to the last column — but use its POSITIVE index.
            # The original code used -1 here, and since enumerate() indices
            # are never -1, the `i != target_idx` filters below never
            # excluded anything: the target column leaked into both
            # feature_names and the feature vectors.
            target_idx = len(header) - 1
            target_col = header[target_idx]
        feature_names = [col for i, col in enumerate(header) if i != target_idx]
        xs: List[List[float]] = []
        ys: List[int] = []
        for row in reader:
            if not row:
                continue  # skip blank lines
            try:
                y = int(float(row[target_idx]))
                x = [float(val) for i, val in enumerate(row) if i != target_idx]
            except (ValueError, IndexError):
                # ValueError: non-numeric field; IndexError: row shorter
                # than the header (would otherwise crash). Skip either.
                continue
            xs.append(x)
            ys.append(y)
    n_samples = len(xs)
    if n_samples == 0:
        raise ValueError("No valid data found in CSV.")
    idx = list(range(n_samples))
    r = Random(seed)
    r.shuffle(idx)
    # Clamp the cut so both splits are non-empty (when n_samples > 1).
    cut = max(1, min(n_samples - 1, int(n_samples * train_ratio)))
    train_idx = idx[:cut]
    val_idx = idx[cut:]
    return Dataset(
        x_train=[xs[i] for i in train_idx],
        y_train=[ys[i] for i in train_idx],
        x_val=[xs[i] for i in val_idx],
        y_val=[ys[i] for i in val_idx],
        true_w=[],
        true_b=0.0,
        feature_names=feature_names,
    )
def make_synthetic_binary_classification(
    *, seed: int, n_samples: int, n_features: int, train_ratio: float
) -> Dataset:
    """Generate a noisy linearly-separable binary dataset.

    Labels are 1 when ``w . x + b + noise > 0`` and 0 otherwise, where the
    ground-truth ``w``/``b`` are drawn uniformly and the noise is Gaussian
    with sigma 0.2. The rows are shuffled and split by ``train_ratio``.

    Args:
        seed: Seed controlling the generator, labels, and shuffle.
        n_samples: Total number of samples to generate.
        n_features: Dimensionality of each feature vector.
        train_ratio: Fraction of samples for the training split
            (clamped so each split gets at least one sample).

    Returns:
        A Dataset carrying the ground-truth ``true_w``/``true_b`` used.
    """
    rng = Random(seed)
    # Draw the ground-truth model first; the order of RNG calls below is
    # deliberate and fixed so results are reproducible per seed.
    true_w = [rng.uniform(-1.0, 1.0) for _ in range(n_features)]
    true_b = rng.uniform(-0.5, 0.5)

    xs: List[List[float]] = []
    ys: List[int] = []
    for _ in range(n_samples):
        point = [rng.gauss(0.0, 1.0) for _ in range(n_features)]
        noisy_margin = _dot(true_w, point) + true_b + rng.gauss(0.0, 0.2)
        xs.append(point)
        ys.append(1 if noisy_margin > 0 else 0)

    order = list(range(n_samples))
    rng.shuffle(order)
    # Clamp the cut so both splits are non-empty (when n_samples > 1).
    cut = max(1, min(n_samples - 1, int(n_samples * train_ratio)))
    train_part = order[:cut]
    val_part = order[cut:]

    return Dataset(
        x_train=[xs[i] for i in train_part],
        y_train=[ys[i] for i in train_part],
        x_val=[xs[i] for i in val_part],
        y_val=[ys[i] for i in val_part],
        true_w=true_w,
        true_b=true_b,
        feature_names=[f"feature_{i}" for i in range(n_features)],
    )