Spaces:
Runtime error
Runtime error
Push dataset to hub now to handle data restructuring and upload in chunks
Uses pandas DataFrames for better data handling before creating datasets
Adds better debug logging to see what's happening with the data
Pushes with a "train" split name, which is standard practice on Hugging Face
Handles JSON arrays, single JSON objects, structured data, and plain text properly
Creates the repository first, then pushes data to it
Provides more detailed feedback about the process
app.py
CHANGED
|
@@ -86,6 +86,17 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 86 |
|
| 87 |
print(f"Creating dataset: {repo_id}")
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
# Check if data is JSON first (preferred format)
|
| 90 |
is_json = False
|
| 91 |
try:
|
|
@@ -100,20 +111,52 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 100 |
all_keys = set()
|
| 101 |
for item in json_data:
|
| 102 |
all_keys.update(item.keys())
|
|
|
|
| 103 |
|
| 104 |
-
|
| 105 |
-
data_dict = {key: [] for key in all_keys}
|
| 106 |
|
| 107 |
-
#
|
|
|
|
| 108 |
for item in json_data:
|
| 109 |
-
for key in all_keys
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
#
|
| 114 |
-
dataset
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
is_json = True
|
|
|
|
| 117 |
except json.JSONDecodeError:
|
| 118 |
# Not valid JSON, will try other formats
|
| 119 |
print("Not valid JSON, checking other formats...")
|
|
@@ -130,11 +173,12 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 130 |
header = lines[0].strip()
|
| 131 |
headers = [col.strip() for col in header.split('|')]
|
| 132 |
|
| 133 |
-
# Create
|
| 134 |
-
|
|
|
|
| 135 |
|
| 136 |
# Process each data row
|
| 137 |
-
for i, line in enumerate(lines[1:]):
|
| 138 |
if not line.strip():
|
| 139 |
continue
|
| 140 |
|
|
@@ -142,37 +186,36 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 142 |
|
| 143 |
# Ensure we have the right number of values
|
| 144 |
if len(values) == len(headers):
|
| 145 |
-
for j
|
| 146 |
-
|
| 147 |
else:
|
| 148 |
-
print(f"Warning: Skipping row {i
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
#
|
| 151 |
-
dataset
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
else:
|
| 154 |
# Handle as regular text data (single row)
|
| 155 |
print("Processing as regular text data")
|
| 156 |
dataset = Dataset.from_dict({"text": [conversation_data]})
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
print(f"Repository already exists: {repo_id}")
|
| 166 |
-
except Exception as e:
|
| 167 |
-
print(f"Note when checking/creating repository: {str(e)}")
|
| 168 |
-
|
| 169 |
-
# Push to Hugging Face Hub with simplified parameters
|
| 170 |
-
print(f"Pushing dataset to {repo_id}")
|
| 171 |
-
dataset.push_to_hub(
|
| 172 |
-
repo_id=repo_id,
|
| 173 |
-
token=api_key,
|
| 174 |
-
commit_message=f"Upload dataset: {dataset_name}"
|
| 175 |
-
)
|
| 176 |
|
| 177 |
# Generate the URL for the dataset
|
| 178 |
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|
|
|
|
| 86 |
|
| 87 |
print(f"Creating dataset: {repo_id}")
|
| 88 |
|
| 89 |
+
# Check if the repository exists or create it
|
| 90 |
+
try:
|
| 91 |
+
repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
|
| 92 |
+
if not repo_exists:
|
| 93 |
+
hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
|
| 94 |
+
print(f"Created repository: {repo_id}")
|
| 95 |
+
else:
|
| 96 |
+
print(f"Repository already exists: {repo_id}")
|
| 97 |
+
except Exception as e:
|
| 98 |
+
print(f"Note when checking/creating repository: {str(e)}")
|
| 99 |
+
|
| 100 |
# Check if data is JSON first (preferred format)
|
| 101 |
is_json = False
|
| 102 |
try:
|
|
|
|
| 111 |
all_keys = set()
|
| 112 |
for item in json_data:
|
| 113 |
all_keys.update(item.keys())
|
| 114 |
+
all_keys = sorted(list(all_keys)) # Sort keys for consistent order
|
| 115 |
|
| 116 |
+
print(f"Detected columns: {', '.join(all_keys)}")
|
|
|
|
| 117 |
|
| 118 |
+
# Create dataset with proper structure
|
| 119 |
+
rows = []
|
| 120 |
for item in json_data:
|
| 121 |
+
row = {key: item.get(key, "") for key in all_keys}
|
| 122 |
+
rows.append(row)
|
| 123 |
+
|
| 124 |
+
# Convert to pandas DataFrame for better control
|
| 125 |
+
import pandas as pd
|
| 126 |
+
df = pd.DataFrame(rows)
|
| 127 |
+
print(df.head()) # Print first few rows for verification
|
| 128 |
+
|
| 129 |
+
# Create dataset from pandas DataFrame
|
| 130 |
+
from datasets import Dataset
|
| 131 |
+
dataset = Dataset.from_pandas(df)
|
| 132 |
+
|
| 133 |
+
# Push to Hugging Face Hub with the train split name
|
| 134 |
+
dataset.push_to_hub(
|
| 135 |
+
repo_id=repo_id,
|
| 136 |
+
token=api_key,
|
| 137 |
+
split="train",
|
| 138 |
+
commit_message=f"Upload JSON dataset: {dataset_name}"
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
print(f"Successfully pushed JSON dataset with {len(json_data)} rows")
|
| 142 |
+
is_json = True
|
| 143 |
+
|
| 144 |
+
elif isinstance(json_data, dict):
|
| 145 |
+
# Single object - convert to dataset
|
| 146 |
+
print("Processing as single JSON object")
|
| 147 |
+
import pandas as pd
|
| 148 |
+
df = pd.DataFrame([json_data])
|
| 149 |
+
dataset = Dataset.from_pandas(df)
|
| 150 |
|
| 151 |
+
# Push to Hugging Face Hub
|
| 152 |
+
dataset.push_to_hub(
|
| 153 |
+
repo_id=repo_id,
|
| 154 |
+
token=api_key,
|
| 155 |
+
split="train",
|
| 156 |
+
commit_message=f"Upload single JSON object: {dataset_name}"
|
| 157 |
+
)
|
| 158 |
is_json = True
|
| 159 |
+
|
| 160 |
except json.JSONDecodeError:
|
| 161 |
# Not valid JSON, will try other formats
|
| 162 |
print("Not valid JSON, checking other formats...")
|
|
|
|
| 173 |
header = lines[0].strip()
|
| 174 |
headers = [col.strip() for col in header.split('|')]
|
| 175 |
|
| 176 |
+
# Create structured data
|
| 177 |
+
import pandas as pd
|
| 178 |
+
rows = []
|
| 179 |
|
| 180 |
# Process each data row
|
| 181 |
+
for i, line in enumerate(lines[1:], 1):
|
| 182 |
if not line.strip():
|
| 183 |
continue
|
| 184 |
|
|
|
|
| 186 |
|
| 187 |
# Ensure we have the right number of values
|
| 188 |
if len(values) == len(headers):
|
| 189 |
+
row = {headers[j]: values[j] for j in range(len(headers))}
|
| 190 |
+
rows.append(row)
|
| 191 |
else:
|
| 192 |
+
print(f"Warning: Skipping row {i} due to mismatch in column count")
|
| 193 |
+
|
| 194 |
+
# Create dataset from pandas DataFrame
|
| 195 |
+
df = pd.DataFrame(rows)
|
| 196 |
+
dataset = Dataset.from_pandas(df)
|
| 197 |
|
| 198 |
+
# Push to Hugging Face Hub
|
| 199 |
+
dataset.push_to_hub(
|
| 200 |
+
repo_id=repo_id,
|
| 201 |
+
token=api_key,
|
| 202 |
+
split="train",
|
| 203 |
+
commit_message=f"Upload structured data: {dataset_name}"
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
print(f"Successfully pushed structured dataset with {len(rows)} rows")
|
| 207 |
else:
|
| 208 |
# Handle as regular text data (single row)
|
| 209 |
print("Processing as regular text data")
|
| 210 |
dataset = Dataset.from_dict({"text": [conversation_data]})
|
| 211 |
+
|
| 212 |
+
# Push to Hugging Face Hub
|
| 213 |
+
dataset.push_to_hub(
|
| 214 |
+
repo_id=repo_id,
|
| 215 |
+
token=api_key,
|
| 216 |
+
split="train",
|
| 217 |
+
commit_message=f"Upload text data: {dataset_name}"
|
| 218 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
# Generate the URL for the dataset
|
| 221 |
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|