SolshineMisfit committed (verified)
Commit 40e5f48 · 1 Parent(s): bdb213e

Update app.py

Files changed (1): app.py (+83 -49)

app.py CHANGED
@@ -60,8 +60,10 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
 
     Args:
         dataset_name: Name for the dataset (will be prefixed with username)
-        conversation_data: String representing the conversation data, can be structured
-            with pipe-separated values (col1 | col2 | col3) for tabular data
+        conversation_data: String representing the conversation data. Can be:
+            - JSON array of objects (each object becomes a row)
+            - Pipe-separated values (col1 | col2 | col3) for tabular data
+            - Plain text (stored in a 'text' column)
 
     Returns:
         URL of the created dataset or error message
@@ -84,60 +86,92 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
 
     print(f"Creating dataset: {repo_id}")
 
-    # Check if data is structured (contains pipe separators and multiple lines)
-    lines = conversation_data.strip().split('\n')
-    is_structured = '|' in conversation_data and len(lines) > 1
-
-    if is_structured:
-        print("Detected structured data with multiple rows")
-
-        # Parse the header row for column names
-        header = lines[0].strip()
-        headers = [col.strip() for col in header.split('|')]
-
-        # Parse the data rows
-        data_dict = {header: [] for header in headers}
-
-        # Add a timestamp and id column
-        data_dict['timestamp'] = []
-        data_dict['id'] = []
-
-        # Process each data row
-        for i, line in enumerate(lines[1:]):
-            if not line.strip():
-                continue
-
-            values = [val.strip() for val in line.split('|')]
-
-            # Ensure we have the right number of values
-            if len(values) == len(headers):
-                for j, header in enumerate(headers):
-                    data_dict[header].append(values[j])
-
-                # Add timestamp and ID for each row
-                data_dict['timestamp'].append(datetime.datetime.now().isoformat())
-                data_dict['id'].append(str(uuid.uuid4()))
-            else:
-                print(f"Warning: Skipping row {i+1} due to mismatch in column count")
-
-        # Create dataset from structured data
-        dataset = Dataset.from_dict(data_dict)
-        print(f"Created structured dataset with {len(data_dict[headers[0]])} rows and {len(data_dict)} columns")
-    else:
-        # Handle as regular text data (single row)
-        print("Processing as regular text data")
-        data = {
-            "text": [conversation_data],
-            "timestamp": [datetime.datetime.now().isoformat()],
-            "id": [str(uuid.uuid4())]
-        }
-        dataset = Dataset.from_dict(data)
-
-    # Push to Hugging Face Hub
+    # Check if data is JSON first (preferred format)
+    is_json = False
+    try:
+        # Try to parse as JSON
+        json_data = json.loads(conversation_data)
+
+        # Check if it's an array of objects (preferred structure)
+        if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data) and len(json_data) > 0:
+            print(f"Processing as JSON array with {len(json_data)} items")
+
+            # Extract all keys to ensure consistent columns
+            all_keys = set()
+            for item in json_data:
+                all_keys.update(item.keys())
+
+            # Initialize the data dictionary with empty lists for each key
+            data_dict = {key: [] for key in all_keys}
+
+            # Process each item in the array
+            for item in json_data:
+                for key in all_keys:
+                    # Use the value if present, otherwise empty string
+                    data_dict[key].append(item.get(key, ""))
+
+            # Create dataset from JSON data
+            dataset = Dataset.from_dict(data_dict)
+            print(f"Created dataset with {len(json_data)} rows and {len(all_keys)} columns")
+            is_json = True
+    except json.JSONDecodeError:
+        # Not valid JSON, will try other formats
+        print("Not valid JSON, checking other formats...")
+
+    # If not JSON, check if data is structured with pipe separators
+    if not is_json:
+        lines = conversation_data.strip().split('\n')
+        is_structured = '|' in conversation_data and len(lines) > 1
+
+        if is_structured:
+            print("Detected pipe-separated structured data")
+
+            # Parse the header row for column names
+            header = lines[0].strip()
+            headers = [col.strip() for col in header.split('|')]
+
+            # Create dataset dict for structured data
+            data_dict = {header: [] for header in headers}
+
+            # Process each data row
+            for i, line in enumerate(lines[1:]):
+                if not line.strip():
+                    continue
+
+                values = [val.strip() for val in line.split('|')]
+
+                # Ensure we have the right number of values
+                if len(values) == len(headers):
+                    for j, header in enumerate(headers):
+                        data_dict[header].append(values[j])
+                else:
+                    print(f"Warning: Skipping row {i+1} due to mismatch in column count")
+
+            # Create dataset from structured data
+            dataset = Dataset.from_dict(data_dict)
+            print(f"Created structured dataset with {len(data_dict[headers[0]])} rows and {len(headers)} columns")
+        else:
+            # Handle as regular text data (single row)
+            print("Processing as regular text data")
+            dataset = Dataset.from_dict({"text": [conversation_data]})
+
+    # First ensure the repository exists
+    try:
+        repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
+        if not repo_exists:
+            hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
+            print(f"Created repository: {repo_id}")
+        else:
+            print(f"Repository already exists: {repo_id}")
+    except Exception as e:
+        print(f"Note when checking/creating repository: {str(e)}")
+
+    # Push to Hugging Face Hub with simplified parameters
+    print(f"Pushing dataset to {repo_id}")
     dataset.push_to_hub(
         repo_id=repo_id,
         token=api_key,
-        private=False
+        commit_message=f"Upload dataset: {dataset_name}"
     )
 
     # Generate the URL for the dataset
@@ -149,7 +183,7 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         import traceback
         error_trace = traceback.format_exc()
         print(f"Dataset creation error: {str(e)}\n{error_trace}")
-        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores\n3. Check your permissions for the Misfits-and-Machines organization"
+        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores"
 
 @tool
 def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
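
The rewritten function accepts three input formats, trying JSON first. A minimal sketch of what each conversation_data payload might look like from a caller's side (the dataset name is illustrative; only Dataset_Creator_Function itself comes from this file):

import json

# 1. JSON array of objects (preferred): each object becomes a row.
json_payload = json.dumps([
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there!"},
])

# 2. Pipe-separated values: the first line is treated as the header row.
pipe_payload = "role | content\nuser | Hello\nassistant | Hi there!"

# 3. Anything else is stored as a single row in a 'text' column.
text_payload = "Free-form transcript with no particular structure."

for payload in (json_payload, pipe_payload, text_payload):
    print(Dataset_Creator_Function("my_test_dataset", payload))  # hypothetical call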
 
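Objects in the JSON array need not share identical keys: the new branch takes the union of keys across all objects and backfills missing values with empty strings. A standalone sketch of just that normalization step, with made-up sample data:

json_data = [
    {"question": "2+2?", "answer": "4"},
    {"question": "Capital of France?", "answer": "Paris", "source": "geo"},
]

# Union of keys across all rows defines the columns.
all_keys = set()
for item in json_data:
    all_keys.update(item.keys())

# item.get(key, "") backfills rows that lack a key.
data_dict = {key: [item.get(key, "") for item in json_data] for key in all_keys}
print(data_dict)
# e.g. {'answer': ['4', 'Paris'], 'question': ['2+2?', 'Capital of France?'],
#       'source': ['', 'geo']}

Note that all_keys is a set, so the resulting column order is not deterministic; Dataset.from_dict accepts the dict either way.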
 
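The push path now creates the dataset repository explicitly before uploading (push_to_hub can also create a missing repo on its own, so the check is defensive) and sets a commit message rather than forcing private=False. A minimal standalone sketch of that flow, assuming HF_API_KEY is set in the environment and using an illustrative repo id:

import os

from datasets import Dataset
from huggingface_hub import HfApi

api_key = os.environ["HF_API_KEY"]  # same variable the error message points at
hf_api = HfApi()
repo_id = "Misfits-and-Machines/my_test_dataset"  # illustrative

# Ensure the dataset repo exists before pushing.
if not hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
    hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=api_key)

# Push a one-row dataset with an explicit commit message.
Dataset.from_dict({"text": ["hello"]}).push_to_hub(
    repo_id=repo_id,
    token=api_key,
    commit_message="Upload dataset: my_test_dataset",
)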