Aswini-Kumar committed on
Commit
280fa04
·
verified ·
1 Parent(s): ecfcbd2

Upload server/dataset_factory.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. server/dataset_factory.py +89 -29
server/dataset_factory.py CHANGED
@@ -1,53 +1,113 @@
1
- import pandas as pd
 
 
 
 
 
 
 
 
 
2
  import numpy as np
 
3
  from sklearn.datasets import make_classification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
class DatasetFactory:
    """Build a synthetic binary-classification dataset and degrade it.

    ``generate`` creates a clean dataset with ``make_classification`` and then
    applies three corruptions in order: missing feature values, flipped labels
    and removal of part of the label-1 class.
    """

    # Corruption settings per difficulty level. Kept as a class constant so
    # the table is built once instead of on every lookup.
    _DIFFICULTY_PARAMS = {
        "easy": {"missing_fraction": 0.05, "noise_rate": 0.05, "imbalance_ratio": 0.8, "target_accuracy": 0.80},
        "medium": {"missing_fraction": 0.15, "noise_rate": 0.15, "imbalance_ratio": 0.6, "target_accuracy": 0.75},
        "hard": {"missing_fraction": 0.30, "noise_rate": 0.25, "imbalance_ratio": 0.3, "target_accuracy": 0.70},
    }

    def generate(self, difficulty="easy"):
        """Return ``(df, target_accuracy)`` for the requested difficulty.

        All three corruptions (missing values, label noise, class imbalance)
        are applied at every level; ``difficulty`` only selects how severe
        each of them is:

            easy    low severity
            medium  moderate severity
            hard    high severity

        Raises:
            KeyError: if ``difficulty`` is not "easy", "medium" or "hard".
        """
        # Look the level up first so a bad name fails before any data is built.
        params = self._difficulty_params(difficulty)

        X, y = make_classification(
            n_samples=500, n_features=10,
            n_informative=5, random_state=42
        )
        df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(10)])
        df["label"] = y

        df = self._inject_missing(df, params["missing_fraction"])
        df = self._inject_noise(df, params["noise_rate"])
        df = self._inject_imbalance(df, params["imbalance_ratio"])

        return df, params["target_accuracy"]

    def _difficulty_params(self, difficulty):
        """Return a fresh copy of the corruption settings for ``difficulty``."""
        # Copy so callers cannot mutate the shared class-level table.
        return dict(self._DIFFICULTY_PARAMS[difficulty])

    def _inject_missing(self, df, fraction):
        """Return a copy of ``df`` with about ``fraction`` of feature cells set to NaN.

        The "label" column is excluded by name, wherever it sits in the frame.
        """
        # One draw per cell of the whole frame; only feature columns are used.
        hit = np.random.random(df.shape) < fraction
        df_copy = df.copy()
        feature_cols = [col for col in df.columns if col != "label"]
        if not feature_cols:
            return df_copy
        col_positions = [df.columns.get_loc(col) for col in feature_cols]
        df_copy[feature_cols] = df_copy[feature_cols].mask(hit[:, col_positions])
        return df_copy

    def _inject_noise(self, df, rate):
        """Return a copy of ``df`` with ``int(len(df) * rate)`` labels flipped.

        Labels are flipped as ``1 - label``, i.e. they are expected to be 0/1.
        """
        df_copy = df.copy()
        n_flip = int(len(df) * rate)
        if n_flip == 0:
            return df_copy
        # np.random.choice(len(df), ...) yields row POSITIONS, so the rows are
        # addressed with iloc; .loc would interpret them as index labels and
        # break on any frame whose index is not 0..n-1.
        positions = np.random.choice(len(df), n_flip, replace=False)
        label_col = df_copy.columns.get_loc("label")
        df_copy.iloc[positions, label_col] = 1 - df_copy.iloc[positions, label_col].to_numpy()
        return df_copy

    def _inject_imbalance(self, df, ratio):
        """Keep every label-0 row and a ``ratio`` fraction of label-1 rows.

        The result is shuffled and re-indexed from 0. The label-1 subsample
        uses a fixed seed; the final shuffle draws from the global RNG.
        """
        minority = df[df["label"] == 1]
        majority = df[df["label"] == 0]
        minority_sample = minority.sample(frac=ratio, random_state=42)
        return pd.concat([majority, minority_sample]).sample(frac=1).reset_index(drop=True)
 
 
1
+ """
2
+ server/dataset_factory.py — Richer dataset generation with multiple archetypes
3
+ and golden rows.
4
+
5
+ Golden rows: A fixed set of rows injected into every dataset that represent
6
+ "ground truth" — they are perfectly clean and correctly labeled. If a specialist
7
+ operation corrupts them, the environment detects and penalizes this.
8
+
9
+ Archetypes provide variety so the agent can't memorize a single dataset shape.
10
+ """
11
  import numpy as np
12
+ import pandas as pd
13
  from sklearn.datasets import make_classification
14
+ from server.config import cfg
15
+
16
+
17
# Dataset archetypes that DatasetFactory.generate cycles through, one per call.
# Each entry is (name, n_informative, n_redundant, class_sep); the last three
# values are passed to make_classification under those keyword names.
ARCHETYPES = [
    ("credit_risk", 5, 2, 1.0),
    ("churn", 4, 3, 0.8),
    ("fraud", 6, 1, 1.2),
    ("medical", 5, 2, 0.9),
    ("supply_chain", 4, 2, 1.1),
]

# Corruption settings and accuracy target for each difficulty level.
DIFFICULTY_PARAMS = {
    "easy": {
        "missing_fraction": 0.05,
        "noise_rate": 0.05,
        "imbalance_ratio": 0.80,
        "target_accuracy": 0.82,
    },
    "medium": {
        "missing_fraction": 0.15,
        "noise_rate": 0.12,
        "imbalance_ratio": 0.60,
        "target_accuracy": 0.77,
    },
    "hard": {
        "missing_fraction": 0.28,
        "noise_rate": 0.22,
        "imbalance_ratio": 0.35,
        "target_accuracy": 0.72,
    },
}
31
 
32
 
33
class DatasetFactory:
    """Generate corrupted binary-classification datasets with "golden" rows.

    Golden rows are rows of the generated dataset that are exempt from every
    corruption step, and whose positions in the final frame are reported to
    the caller.
    """

    # Default random source: NumPy's global RNG, so np.random.seed() keeps
    # working. __init__ replaces it with a private stream when a seed is given.
    _rng = np.random

    def __init__(self, seed: "int | None" = None):
        """Create a factory.

        Args:
            seed: optional seed for a private random stream. When omitted, the
                factory draws from NumPy's global RNG.
        """
        self._archetype_idx = 0
        if seed is not None:
            self._rng = np.random.RandomState(seed)

    def generate(self, difficulty: str = "easy") -> tuple[pd.DataFrame, float, set]:
        """Build one corrupted dataset.

        Returns:
            df              corrupted DataFrame, indexed 0..len(df)-1
            target_acc      accuracy target to hit
            golden_row_ids  set of row indices OF THE RETURNED df that are
                            "golden" (must not be corrupted)

        Raises:
            KeyError: if ``difficulty`` is not a key of DIFFICULTY_PARAMS.
        """
        try:
            params = DIFFICULTY_PARAMS[difficulty]
        except KeyError:
            raise KeyError(
                f"unknown difficulty {difficulty!r}; expected one of {sorted(DIFFICULTY_PARAMS)}"
            ) from None

        # Rotate archetypes for variety
        arch_name, n_info, n_red, class_sep = ARCHETYPES[self._archetype_idx % len(ARCHETYPES)]
        self._archetype_idx += 1

        X, y = make_classification(
            n_samples=cfg.DATASET_N_SAMPLES,
            n_features=10,
            n_informative=n_info,
            n_redundant=n_red,
            class_sep=class_sep,
            random_state=int(self._rng.randint(0, 9999)),
        )
        df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(10)])
        df["label"] = y
        df["_archetype"] = arch_name  # metadata column — not used by classifier

        # Choose golden rows BEFORE corruption (they stay clean)
        golden_indices = self._insert_golden_rows(df, cfg.GOLDEN_ROW_COUNT)

        # Corrupt non-golden rows only
        non_golden = df.index.difference(golden_indices).tolist()
        df = self._inject_missing(df, non_golden, params["missing_fraction"])
        df = self._inject_noise(df, non_golden, params["noise_rate"])

        # The imbalance step drops rows, shuffles and renumbers the frame, so
        # the golden ids are re-derived from the final row order instead of
        # returning the pre-shuffle ids, which would point at arbitrary rows.
        order = self._imbalanced_positions(df, params["imbalance_ratio"], golden_indices)
        is_golden = df.index.isin(golden_indices)
        df = df.iloc[order].reset_index(drop=True)
        golden_ids = set(np.flatnonzero(is_golden[order]).tolist())

        return df, params["target_accuracy"], golden_ids

    def _insert_golden_rows(self, df: pd.DataFrame, n: int) -> list[int]:
        """Select ``n // 2`` existing rows of each class to act as golden rows.

        No rows are added to ``df``; the method only picks rows (with a fixed
        seed) and returns their index labels. A class with fewer than
        ``n // 2`` rows contributes nothing.
        """
        per_class = n // 2
        golden_ids = []
        for cls in (0, 1):
            class_rows = df[df["label"] == cls]
            if len(class_rows) < per_class:
                continue
            picked = class_rows.sample(n=per_class, random_state=42)
            golden_ids.extend(picked.index.tolist())
        return golden_ids

    def _inject_missing(self, df: pd.DataFrame, non_golden: list, fraction: float) -> pd.DataFrame:
        """Return a copy of ``df`` with feature cells of ``non_golden`` rows set to NaN.

        Each feature cell of each listed row is blanked independently with
        probability ``fraction``. "label" and "_archetype" are never touched.
        """
        df_copy = df.copy()
        feature_cols = [c for c in df.columns if c not in ("label", "_archetype")]
        mask = self._rng.random_sample((len(non_golden), len(feature_cols))) < fraction
        if not mask.any():
            # Nothing selected (also covers empty row or column lists).
            return df_copy
        # One vectorised assignment instead of a Python loop over every cell.
        block = df_copy.loc[non_golden, feature_cols]
        df_copy.loc[non_golden, feature_cols] = block.mask(mask)
        return df_copy

    def _inject_noise(self, df: pd.DataFrame, non_golden: list, rate: float) -> pd.DataFrame:
        """Return a copy of ``df`` with ``int(len(non_golden) * rate)`` labels flipped.

        Rows are drawn without replacement from ``non_golden``; labels are
        flipped as ``1 - label``, i.e. they are expected to be 0/1.
        """
        df_copy = df.copy()
        n_flip = int(len(non_golden) * rate)
        if n_flip == 0:
            return df_copy
        flip_indices = self._rng.choice(non_golden, n_flip, replace=False)
        df_copy.loc[flip_indices, "label"] = 1 - df_copy.loc[flip_indices, "label"]
        return df_copy

    def _inject_imbalance(self, df: pd.DataFrame, ratio: float, protected=None) -> pd.DataFrame:
        """Keep all label-0 rows and roughly ``ratio`` of the label-1 rows.

        Args:
            df: frame with a "label" column.
            ratio: fraction of label-1 rows to keep (at least one row is kept
                when any exist).
            protected: optional index labels of rows that must never be
                dropped.

        Returns:
            The kept rows, shuffled with a fixed seed and re-indexed from 0.
        """
        order = self._imbalanced_positions(df, ratio, protected)
        return df.iloc[order].reset_index(drop=True)

    def _imbalanced_positions(self, df: pd.DataFrame, ratio: float, protected=None) -> np.ndarray:
        """Return the shuffled row positions that survive the imbalance step."""
        labels = df["label"].to_numpy()
        positions = np.arange(len(df))
        if protected is None:
            is_protected = np.zeros(len(df), dtype=bool)
        else:
            is_protected = df.index.isin(list(protected))

        majority = positions[labels == 0]
        in_minority = labels == 1
        pinned = positions[in_minority & is_protected]
        free = positions[in_minority & ~is_protected]

        # Clamp to the rows that exist, so a frame with no label-1 rows no
        # longer asks for a sample of one from an empty population.
        n_minority = len(pinned) + len(free)
        keep = min(n_minority, max(1, int(n_minority * ratio)))
        # Protected rows count towards the quota; only the rest is sampled.
        n_extra = max(0, keep - len(pinned))
        if n_extra:
            extra = pd.Series(free).sample(n=n_extra, random_state=42).to_numpy()
        else:
            extra = free[:0]

        kept = np.concatenate([majority, pinned, extra])
        if kept.size == 0:
            return kept
        return pd.Series(kept).sample(frac=1, random_state=42).to_numpy()