Update processor.py
processor.py (CHANGED): +158 −91
Removed in this update:
- the "dataset_id" field from inspect_dataset's success payload;
- the split-only signatures (no config argument), including process_and_push(self, source_id, split, target_id, recipe, max_rows=None), "The heavy lifter: Streams, Transforms, Filters, and Pushes to Hub";
- the old preview helper, which filtered raw rows before applying transformations;
- the docstring and comments of _passes_filter ("Safe-ish eval for filtering": each filter is a Python expression evaluated with the row's columns as local variables, e.g. "len(text) > 5" when the row has a text column);
- the _apply_transformations docstring documenting the recipe format:

    {
        "json_expansions": [{"col": "meta", "keys": ["url", "id"]}],
        "renames": {"old_col": "new_col"},
        "drops": ["unwanted_col"],
        "filters": ["len(text) > 50"]  # List of python eval strings
    }

processor.py, new side of the changed hunks:
@@ -1,6 +1,7 @@
import json
import logging
import datasets
from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
from huggingface_hub import HfApi

# Configure logging

@@ -11,63 +12,115 @@ class DatasetCommandCenter:
    def __init__(self, token=None):
        self.token = token

    def get_dataset_metadata(self, dataset_id):
        """
        Step 1: Get available Configs (subsets) and Splits without downloading data.
        """
        try:
            # 1. Get Configs (e.g., 'en', 'fr' or 'default')
            try:
                configs = get_dataset_config_names(dataset_id, token=self.token)
            except Exception:
                # Some datasets have no configs or throw errors, default to 'default' or None
                configs = ['default']

            # 2. Get Splits for the first config (to pre-populate)
            # We will fetch specific splits for other configs dynamically if needed
            selected_config = configs[0]

            try:
                # This fetches metadata (splits, columns) without downloading rows
                infos = get_dataset_infos(dataset_id, token=self.token)
                # If multiple configs, infos is a dict keyed by config name
                if selected_config in infos:
                    splits = list(infos[selected_config].splits.keys())
                else:
                    # Fallback if structure is flat
                    splits = list(infos.values())[0].splits.keys()
            except:
                # Fallback: try to just list simple splits
                splits = ['train', 'test', 'validation']

            return {
                "status": "success",
                "configs": configs,
                "splits": list(splits)
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_splits_for_config(self, dataset_id, config_name):
        """
        Helper to update splits when the user changes the Config dropdown.
        """
        try:
            infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
            splits = list(infos[config_name].splits.keys())
            return {"status": "success", "splits": splits}
        except Exception as e:
            # Fallback
            return {"status": "success", "splits": ['train', 'test', 'validation']}

    def inspect_dataset(self, dataset_id, config, split):
        """
        Step 2: Stream actual rows and detect JSON.
        """
        try:
            # Handle 'default' config edge cases
            conf = config if config != 'default' else None

            ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)

            sample_rows = []
            for i, row in enumerate(ds_stream):
                if i >= 5: break
                # Convert non-serializable objects (like PIL Images) to strings for preview
                clean_row = {}
                for k, v in row.items():
                    if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
                        clean_row[k] = str(v)
                    else:
                        clean_row[k] = v
                sample_rows.append(clean_row)

            if not sample_rows:
                return {"status": "error", "message": "Dataset is empty."}

            # Analyze Columns
            analysis = {}
            keys = sample_rows[0].keys()

            for k in keys:
                sample_val = sample_rows[0][k]
                col_type = type(sample_val).__name__
                is_json_str = False

                # Check if string looks like JSON
                if isinstance(sample_val, str):
                    s = sample_val.strip()
                    if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
                        try:
                            json.loads(s)
                            is_json_str = True
                        except:
                            pass

                analysis[k] = {
                    "type": col_type,
                    "is_json_string": is_json_str
                }

            return {
                "status": "success",
                "samples": sample_rows,
                "analysis": analysis
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}
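
    # Illustrative shape of the analysis payload for a row such as
    # {"text": "hello world", "meta": "{\"url\": \"x\"}"} (hypothetical values):
    #   {"text": {"type": "str", "is_json_string": False},
    #    "meta": {"type": "str", "is_json_string": True}}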
|
    def _apply_transformations(self, row, recipe):
        """
        Apply Parsing, Renaming, Dropping, Filtering
        """
        new_row = row.copy()

@@ -75,20 +128,40 @@
        if "json_expansions" in recipe:
            for item in recipe["json_expansions"]:
                col_name = item["col"]
                target_keys = item["keys"]

                # Check if we need to parse string-json first
                source_data = new_row.get(col_name)

                parsed_obj = None

                # Case A: It is already a dict (Struct)
                if isinstance(source_data, dict):
                    parsed_obj = source_data
                # Case B: It is a string (JSON String)
                elif isinstance(source_data, str):
                    try:
                        parsed_obj = json.loads(source_data)
                    except:
                        pass

                if parsed_obj:
                    for key in target_keys:
                        # Handle Nested Dot Notation (e.g. "meta.url")
                        val = parsed_obj
                        parts = key.split('.')
                        try:
                            for p in parts:
                                val = val[p]

                            # Create new column name (replace dots with underscores)
                            clean_key = key.replace('.', '_')
                            new_col_name = f"{col_name}_{clean_key}"
                            new_row[new_col_name] = val
                        except:
                            # Key not found
                            clean_key = key.replace('.', '_')
                            new_row[f"{col_name}_{clean_key}"] = None

        # 2. Renames
        if "renames" in recipe:

@@ -105,64 +178,58 @@
        return new_row
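
    # Illustrative expansion: with recipe {"json_expansions": [{"col": "meta", "keys": ["url", "id"]}]}
    # and a row {"meta": '{"url": "https://example.com", "id": 7}'} (hypothetical values),
    # _apply_transformations adds meta_url = "https://example.com" and meta_id = 7;
    # a key missing from the parsed object would instead yield None for its new column.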
|
    def _passes_filter(self, row, filters):
        if not filters: return True
        context = row.copy()
        for f_str in filters:
            try:
                # Safety: very basic eval.
                if not eval(f_str, {}, context):
                    return False
            except:
                return False
        return True

    def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
        logger.info(f"Starting job: {source_id} ({config}/{split}) -> {target_id}")

        conf = config if config != 'default' else None

        def gen():
            ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
            count = 0
            for row in ds_stream:
                if max_rows and count >= int(max_rows):
                    break

                # Apply Transform first, then Filter, so filters can reference
                # fields extracted from JSON as well as the raw columns.
                trans_row = self._apply_transformations(row, recipe)

                if self._passes_filter(trans_row, recipe.get("filters", [])):
                    yield trans_row
                count += 1

        # Push to Hub
        # Note: We must infer features or let HF do it.
        # Using a generator allows HF to auto-detect the new schema.
        try:
            new_dataset = datasets.Dataset.from_generator(gen)
            new_dataset.push_to_hub(target_id, token=self.token)
            return {"status": "success", "rows_processed": len(new_dataset)}
        except Exception as e:
            logger.error(e)
            raise e

    def preview_transform(self, dataset_id, config, split, recipe):
        conf = config if config != 'default' else None
        ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
        processed = []
        for i, row in enumerate(ds_stream):
            if len(processed) >= 5: break
            trans_row = self._apply_transformations(row, recipe)
            if self._passes_filter(trans_row, recipe.get("filters", [])):
                processed.append(trans_row)
        return processed
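
A minimal usage sketch of the updated flow (the recipe follows the format documented in the removed docstring; the dataset ids, repo id, and token below are hypothetical placeholders):

cc = DatasetCommandCenter(token="hf_...")  # token optional for public datasets
meta = cc.get_dataset_metadata("user/source-dataset")   # configs + splits, no rows downloaded
report = cc.inspect_dataset("user/source-dataset", meta["configs"][0], meta["splits"][0])
print(report["analysis"])   # per-column type and is_json_string flags

recipe = {
    "json_expansions": [{"col": "meta", "keys": ["url", "id"]}],
    "renames": {"old_col": "new_col"},
    "drops": ["unwanted_col"],
    "filters": ["len(text) > 50"],   # Python expressions evaluated against the row's columns
}
print(cc.preview_transform("user/source-dataset", "default", "train", recipe))  # first 5 transformed rows
cc.process_and_push("user/source-dataset", "default", "train", "user/cleaned-dataset", recipe, max_rows=1000)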