sairaj2 committed on
Commit
81ff063
·
verified ·
1 Parent(s): 6c3d963

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. server/dataset_loader.py +17 -9
server/dataset_loader.py CHANGED
@@ -27,6 +27,10 @@ class DatasetGenerator:
27
  def __init__(self):
28
  self.fake = Faker()
29
  self.seed = None
 
 
 
 
30
 
31
  def generate_dataset(self, task_id: str, seed: int = None) -> pd.DataFrame:
32
  """Generate dataset for specified task."""
@@ -96,8 +100,8 @@ class DatasetGenerator:
96
  # Add null values (20% of rows)
97
  null_mask = np.random.choice([True, False], size=n_rows, p=[0.20, 0.80])
98
  df.loc[null_mask, 'age'] = np.nan
99
- df.loc[null_mask[:n_rows//2], 'salary'] = np.nan
100
- df.loc[null_mask[:n_rows//3], 'department'] = np.nan
101
 
102
  # Add duplicates (15% of rows)
103
  n_duplicates = int(n_rows * 0.15)
@@ -107,13 +111,17 @@ class DatasetGenerator:
107
 
108
  # Add invalid emails (25% of emails)
109
  invalid_email_mask = np.random.choice([True, False], size=len(df), p=[0.25, 0.75])
110
- df.loc[invalid_email_mask, 'email'] = [
111
- self.fake.user_name(),
112
- 'not_an_email',
113
- 'missing@domain',
114
- 'user@.com',
115
- '@missinguser.com'
116
- ][np.random.randint(0, 5)]
 
 
 
 
117
 
118
  # Add outliers in salary column
119
  outlier_indices = np.random.choice(df.index, size=int(len(df) * 0.10), replace=False)
 
27
  def __init__(self):
28
  self.fake = Faker()
29
  self.seed = None
30
+
31
+ def get_total_examples(self):
32
+ """Required method for OpenEnv compatibility."""
33
+ return 3
34
 
35
  def generate_dataset(self, task_id: str, seed: int = None) -> pd.DataFrame:
36
  """Generate dataset for specified task."""
 
100
  # Add null values (20% of rows)
101
  null_mask = np.random.choice([True, False], size=n_rows, p=[0.20, 0.80])
102
  df.loc[null_mask, 'age'] = np.nan
103
+ df.loc[df.index[null_mask][:n_rows//2], 'salary'] = np.nan
104
+ df.loc[df.index[null_mask][:n_rows//3], 'department'] = np.nan
105
 
106
  # Add duplicates (15% of rows)
107
  n_duplicates = int(n_rows * 0.15)
 
111
 
112
  # Add invalid emails (25% of emails)
113
  invalid_email_mask = np.random.choice([True, False], size=len(df), p=[0.25, 0.75])
114
+ invalid_count = np.sum(invalid_email_mask)
115
+ invalid_values = [
116
+ [
117
+ self.fake.user_name(),
118
+ 'not_an_email',
119
+ 'missing@domain',
120
+ 'user@.com',
121
+ '@missinguser.com'
122
+ ][np.random.randint(0, 5)] for _ in range(invalid_count)
123
+ ]
124
+ df.loc[invalid_email_mask, 'email'] = invalid_values
125
 
126
  # Add outliers in salary column
127
  outlier_indices = np.random.choice(df.index, size=int(len(df) * 0.10), replace=False)