broadfield-dev committed on
Commit
d6558bb
·
verified ·
1 Parent(s): a76c50f

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +83 -1
processor.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import logging
3
  import datasets
4
  from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
 
5
 
6
  logging.basicConfig(level=logging.INFO)
7
  logger = logging.getLogger(__name__)
@@ -25,7 +26,27 @@ class DatasetCommandCenter:
25
  splits = list(infos.values())[0].splits.keys()
26
  except:
27
  splits = ['train', 'test', 'validation']
28
- return {"status": "success", "configs": configs, "splits": list(splits)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  except Exception as e:
30
  return {"status": "error", "message": str(e)}
31
 
@@ -219,6 +240,59 @@ class DatasetCommandCenter:
219
  except:
220
  return False
221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
223
  logger.info(f"Job started: {source_id}")
224
  conf = config if config != 'default' else None
@@ -236,9 +310,17 @@ class DatasetCommandCenter:
236
  try:
237
  new_dataset = datasets.Dataset.from_generator(gen)
238
  new_dataset.push_to_hub(target_id, token=self.token)
 
 
 
 
 
 
 
239
  return {"status": "success", "rows_processed": len(new_dataset)}
240
  except Exception as e:
241
  return {"status": "error", "message": str(e)}
 
242
 
243
  def preview_transform(self, dataset_id, config, split, recipe):
244
  conf = config if config != 'default' else None
 
2
  import logging
3
  import datasets
4
  from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
5
+ from huggingface_hub import HfApi, DatasetCard, DatasetCardData
6
 
7
  logging.basicConfig(level=logging.INFO)
8
  logger = logging.getLogger(__name__)
 
26
  splits = list(infos.values())[0].splits.keys()
27
  except:
28
  splits = ['train', 'test', 'validation']
29
+ # license
30
+ try:
31
+ configs = get_dataset_config_names(dataset_id, token=self.token)
32
+ except:
33
+ configs = ['default']
34
+
35
+ license_name = "unknown"
36
+ try:
37
+ infos = get_dataset_infos(dataset_id, token=self.token)
38
+ # Try to grab license from first config
39
+ first = list(infos.values())[0]
40
+ license_name = first.license or "unknown"
41
+ except:
42
+ pass
43
+
44
+ return {
45
+ "status": "success",
46
+ "configs": configs,
47
+ # We assume user will pick splits later, just return configs + license hint
48
+ "license_detected": license_name
49
+ }
50
  except Exception as e:
51
  return {"status": "error", "message": str(e)}
52
 
 
240
  except:
241
  return False
242
 
243
+
244
+ def _generate_card(self, source_id, target_id, recipe, license_name):
245
+ """
246
+ Generates a README.md with YAML metadata and a report of operations.
247
+ """
248
+
249
+ # 1. YAML Metadata
250
+ card_data = DatasetCardData(
251
+ language="en",
252
+ license=license_name,
253
+ tags=["dataset-command-center", "etl", "generated-dataset"],
254
+ base_model=source_id, # Linking source
255
+ )
256
+
257
+ # 2. Description & Recipe Table
258
+ content = f"""
259
+ # {target_id.split('/')[-1]}
260
+
261
+ This dataset is a transformation of [{source_id}](https://huggingface.co/datasets/{source_id}).
262
+ It was generated using the **Hugging Face Dataset Command Center**.
263
+
264
+ ## Transformation Recipe
265
+
266
+ The following operations were applied to the source data:
267
+
268
+ | Target Column | Source | Type | Logic / Filter |
269
+ |---------------|--------|------|----------------|
270
+ """
271
+
272
+ for col in recipe['columns']:
273
+ c_type = col.get('type', 'simple')
274
+ c_name = col['name']
275
+ c_src = col.get('source', '-')
276
+
277
+ if c_type == 'simple':
278
+ logic = "Direct Mapping"
279
+ elif c_type == 'list_search':
280
+ logic = f"Get `{col['target_key']}` where `{col['filter_key']} == {col['filter_val']}`"
281
+ elif c_type == 'python':
282
+ logic = f"`{col.get('expression')}`"
283
+ else:
284
+ logic = "-"
285
+
286
+ content += f"| **{c_name}** | `{c_src}` | {c_type} | {logic} |\n"
287
+
288
+ if recipe.get('filter_rule'):
289
+ content += f"\n### Row Filtering\n**Filter Applied:** `{recipe['filter_rule']}`\n"
290
+
291
+ content += f"\n## Original License\nThis dataset inherits the license: `{license_name}` from the source."
292
+
293
+ card = DatasetCard.from_template(card_data, content=content)
294
+ return card
295
+
296
  def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
297
  logger.info(f"Job started: {source_id}")
298
  conf = config if config != 'default' else None
 
310
  try:
311
  new_dataset = datasets.Dataset.from_generator(gen)
312
  new_dataset.push_to_hub(target_id, token=self.token)
313
+ # 2. GENERATE & PUSH CARD
314
+ try:
315
+ card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
316
+ card.push_to_hub(target_id, token=self.token)
317
+ except Exception as e:
318
+ logger.warning(f"Could not push dataset card: {e}")
319
+
320
  return {"status": "success", "rows_processed": len(new_dataset)}
321
  except Exception as e:
322
  return {"status": "error", "message": str(e)}
323
+
324
 
325
  def preview_transform(self, dataset_id, config, split, recipe):
326
  conf = config if config != 'default' else None