Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- server/dataset_loader.py +17 -9
server/dataset_loader.py
CHANGED
|
@@ -27,6 +27,10 @@ class DatasetGenerator:
|
|
| 27 |
def __init__(self):
|
| 28 |
self.fake = Faker()
|
| 29 |
self.seed = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def generate_dataset(self, task_id: str, seed: int = None) -> pd.DataFrame:
|
| 32 |
"""Generate dataset for specified task."""
|
|
@@ -96,8 +100,8 @@ class DatasetGenerator:
|
|
| 96 |
# Add null values (20% of rows)
|
| 97 |
null_mask = np.random.choice([True, False], size=n_rows, p=[0.20, 0.80])
|
| 98 |
df.loc[null_mask, 'age'] = np.nan
|
| 99 |
-
df.loc[null_mask[:n_rows//2], 'salary'] = np.nan
|
| 100 |
-
df.loc[null_mask[:n_rows//3], 'department'] = np.nan
|
| 101 |
|
| 102 |
# Add duplicates (15% of rows)
|
| 103 |
n_duplicates = int(n_rows * 0.15)
|
|
@@ -107,13 +111,17 @@ class DatasetGenerator:
|
|
| 107 |
|
| 108 |
# Add invalid emails (25% of emails)
|
| 109 |
invalid_email_mask = np.random.choice([True, False], size=len(df), p=[0.25, 0.75])
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
# Add outliers in salary column
|
| 119 |
outlier_indices = np.random.choice(df.index, size=int(len(df) * 0.10), replace=False)
|
|
|
|
| 27 |
def __init__(self):
|
| 28 |
self.fake = Faker()
|
| 29 |
self.seed = None
|
| 30 |
+
|
| 31 |
+
def get_total_examples(self):
|
| 32 |
+
"""Required method for OpenEnv compatibility."""
|
| 33 |
+
return 3
|
| 34 |
|
| 35 |
def generate_dataset(self, task_id: str, seed: int = None) -> pd.DataFrame:
|
| 36 |
"""Generate dataset for specified task."""
|
|
|
|
| 100 |
# Add null values (20% of rows)
|
| 101 |
null_mask = np.random.choice([True, False], size=n_rows, p=[0.20, 0.80])
|
| 102 |
df.loc[null_mask, 'age'] = np.nan
|
| 103 |
+
df.loc[df.index[null_mask][:n_rows//2], 'salary'] = np.nan
|
| 104 |
+
df.loc[df.index[null_mask][:n_rows//3], 'department'] = np.nan
|
| 105 |
|
| 106 |
# Add duplicates (15% of rows)
|
| 107 |
n_duplicates = int(n_rows * 0.15)
|
|
|
|
| 111 |
|
| 112 |
# Add invalid emails (25% of emails)
|
| 113 |
invalid_email_mask = np.random.choice([True, False], size=len(df), p=[0.25, 0.75])
|
| 114 |
+
invalid_count = np.sum(invalid_email_mask)
|
| 115 |
+
invalid_values = [
|
| 116 |
+
[
|
| 117 |
+
self.fake.user_name(),
|
| 118 |
+
'not_an_email',
|
| 119 |
+
'missing@domain',
|
| 120 |
+
'user@.com',
|
| 121 |
+
'@missinguser.com'
|
| 122 |
+
][np.random.randint(0, 5)] for _ in range(invalid_count)
|
| 123 |
+
]
|
| 124 |
+
df.loc[invalid_email_mask, 'email'] = invalid_values
|
| 125 |
|
| 126 |
# Add outliers in salary column
|
| 127 |
outlier_indices = np.random.choice(df.index, size=int(len(df) * 0.10), replace=False)
|