Agent-Example

Runtime error

App Files Files Community

SolshineMisfit committed on Mar 6

Commit

6220e54

verified ·

1 Parent(s): dfafa93

Changed the method for the Hugging Face data push to docs again and tried to integrate JSON structuring in between.

Browse files

Files changed (1) hide show

app.py +52 -104

app.py CHANGED Viewed

@@ -69,6 +69,12 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         URL of the created dataset or error message
     """
     try:
         # Get API key from environment variables
         api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
         if not api_key:
@@ -86,7 +92,7 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         print(f"Creating dataset: {repo_id}")
-        # Check if the repository exists or create it
         try:
             repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
             if not repo_exists:
@@ -97,127 +103,64 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         except Exception as e:
             print(f"Note when checking/creating repository: {str(e)}")
-        # Check if data is JSON first (preferred format)
-        is_json = False
         try:
-            # Try to parse as JSON
             json_data = json.loads(conversation_data)
-            # Check if it's an array of objects (preferred structure)
-            if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data) and len(json_data) > 0:
-                print(f"Processing as JSON array with {len(json_data)} items")
-                # Extract all keys to ensure consistent columns
-                all_keys = set()
-                for item in json_data:
-                    all_keys.update(item.keys())
-                all_keys = sorted(list(all_keys))  # Sort keys for consistent order
-                print(f"Detected columns: {', '.join(all_keys)}")
-                # Create dataset with proper structure
-                rows = []
-                for item in json_data:
-                    row = {key: item.get(key, "") for key in all_keys}
-                    rows.append(row)
-                # Convert to pandas DataFrame for better control
-                import pandas as pd
-                df = pd.DataFrame(rows)
-                print(df.head())  # Print first few rows for verification
-                # Create dataset from pandas DataFrame
-                from datasets import Dataset
                 dataset = Dataset.from_pandas(df)
-                # Push to Hugging Face Hub with the train split name
-                dataset.push_to_hub(
-                    repo_id=repo_id,
-                    token=api_key,
-                    split="train",
-                    commit_message=f"Upload JSON dataset: {dataset_name}"
-                )
-                print(f"Successfully pushed JSON dataset with {len(json_data)} rows")
-                is_json = True
             elif isinstance(json_data, dict):
-                # Single object - convert to dataset
-                print("Processing as single JSON object")
-                import pandas as pd
                 df = pd.DataFrame([json_data])
                 dataset = Dataset.from_pandas(df)
-                # Push to Hugging Face Hub
-                dataset.push_to_hub(
-                    repo_id=repo_id,
-                    token=api_key,
-                    split="train",
-                    commit_message=f"Upload single JSON object: {dataset_name}"
-                )
-                is_json = True
-        except json.JSONDecodeError:
-            # Not valid JSON, will try other formats
-            print("Not valid JSON, checking other formats...")
-        # If not JSON, check if data is structured with pipe separators
-        if not is_json:
-            lines = conversation_data.strip().split('\n')
-            is_structured = '|' in conversation_data and len(lines) > 1
-            if is_structured:
-                print("Detected pipe-separated structured data")
-                # Parse the header row for column names
-                header = lines[0].strip()
-                headers = [col.strip() for col in header.split('|')]
-                # Create structured data
-                import pandas as pd
-                rows = []
-                # Process each data row
                 for i, line in enumerate(lines[1:], 1):
                     if not line.strip():
                         continue
                     values = [val.strip() for val in line.split('|')]
-                    # Ensure we have the right number of values
                     if len(values) == len(headers):
-                        row = {headers[j]: values[j] for j in range(len(headers))}
-                        rows.append(row)
                     else:
-                        print(f"Warning: Skipping row {i} due to mismatch in column count")
-                # Create dataset from pandas DataFrame
-                df = pd.DataFrame(rows)
-                dataset = Dataset.from_pandas(df)
-                # Push to Hugging Face Hub
-                dataset.push_to_hub(
-                    repo_id=repo_id,
-                    token=api_key,
-                    split="train",
-                    commit_message=f"Upload structured data: {dataset_name}"
-                )
-                print(f"Successfully pushed structured dataset with {len(rows)} rows")
             else:
-                # Handle as regular text data (single row)
-                print("Processing as regular text data")
                 dataset = Dataset.from_dict({"text": [conversation_data]})
-                # Push to Hugging Face Hub
-                dataset.push_to_hub(
-                    repo_id=repo_id,
-                    token=api_key,
-                    split="train",
-                    commit_message=f"Upload text data: {dataset_name}"
-                )
-        # Generate the URL for the dataset
         dataset_url = f"https://huggingface.co/datasets/{repo_id}"
         print(f"Dataset successfully pushed to: {dataset_url}")
@@ -230,11 +173,16 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
 @tool
 def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
-    """A tool that posts a new dataset of the current conversation to Hugging Face.
     Args:
         dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
-        conversation_data: String content to save to the dataset
     Returns:
         Link to the created dataset or error message with troubleshooting steps

         URL of the created dataset or error message
     """
     try:
+        # Required imports
+        import json
+        import pandas as pd
+        from datasets import Dataset
+        from huggingface_hub import HfApi
         # Get API key from environment variables
         api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
         if not api_key:
         print(f"Creating dataset: {repo_id}")
+        # First ensure the repository exists
         try:
             repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
             if not repo_exists:
         except Exception as e:
             print(f"Note when checking/creating repository: {str(e)}")
+        # Process the data based on format
         try:
+            # Try parsing as JSON first
             json_data = json.loads(conversation_data)
+            if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
+                # Process JSON array of objects (preferred format)
+                print(f"Processing JSON array with {len(json_data)} items")
+                df = pd.DataFrame(json_data)
                 dataset = Dataset.from_pandas(df)
             elif isinstance(json_data, dict):
+                # Single JSON object
+                print("Processing single JSON object")
                 df = pd.DataFrame([json_data])
                 dataset = Dataset.from_pandas(df)
+            else:
+                raise ValueError("JSON format not recognized as array of objects or single object")
+        except (json.JSONDecodeError, ValueError) as e:
+            # Not valid JSON or not in expected format
+            print(f"Not processing as JSON: {str(e)}")
+            # Check if pipe-separated format
+            lines = conversation_data.strip().split('\n')
+            if '|' in conversation_data and len(lines) > 1:
+                print("Processing as pipe-separated data")
+                # Parse headers and data rows
+                headers = [h.strip() for h in lines[0].split('|')]
+                data = []
                 for i, line in enumerate(lines[1:], 1):
                     if not line.strip():
                         continue
                     values = [val.strip() for val in line.split('|')]
                     if len(values) == len(headers):
+                        data.append(dict(zip(headers, values)))
                     else:
+                        print(f"Warning: Skipping row {i} (column count mismatch)")
+                if data:
+                    df = pd.DataFrame(data)
+                    dataset = Dataset.from_pandas(df)
+                else:
+                    # Fallback to text if no valid rows
+                    dataset = Dataset.from_dict({"text": [conversation_data]})
             else:
+                # Plain text
+                print("Processing as plain text")
                 dataset = Dataset.from_dict({"text": [conversation_data]})
+        # Push to Hugging Face Hub
+        print(f"Pushing dataset to {repo_id}")
+        dataset.push_to_hub(
+            repo_id=repo_id,
+            token=api_key,
+            split="train"
+        )
         dataset_url = f"https://huggingface.co/datasets/{repo_id}"
         print(f"Dataset successfully pushed to: {dataset_url}")
 @tool
 def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
+    """A tool that creates and pushes a dataset to Hugging Face.
     Args:
         dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
+        conversation_data: Data content to save in the dataset. Can be formatted in three ways:
+            1. JSON array of objects - Each object becomes a row in the dataset with its properties as columns:
+               Example: [{"name": "Product A", "brand": "Company X"}, {"name": "Product B", "brand": "Company Y"}]
+            2. Pipe-separated values - First row as headers, subsequent rows as values:
+               Example: "name | brand\nProduct A | Company X\nProduct B | Company Y"
+            3. Plain text - Will be stored in a single 'text' column
     Returns:
         Link to the created dataset or error message with troubleshooting steps