SolshineMisfit committed on
Commit
0be1b82
·
verified ·
1 Parent(s): 5282c5b

The agent can now use this tool with clearer feedback on what's happening, and users will be directed to the correct URL where they can verify that the dataset appeared on Hugging Face.

Browse files

1. Uses underscores instead of dashes in dataset names for better compatibility
2. Creates a proper dataset structure with a "train" split (standard practice for HF datasets)
3. Sets split="train" in push_to_hub to ensure proper organization
4. Provides more verbose logging throughout the process
5. Returns a clear URL for the user to check their dataset
6. Offers troubleshooting steps when errors occur
7. Properly handles repository existence checks
8. Uses a more descriptive commit message

Files changed (1) hide show
  1. app.py +45 -23
app.py CHANGED
@@ -65,7 +65,7 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
65
  # Get API key from environment variables
66
  api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY", "")
67
  if not api_key:
68
- return "Error: No Hugging Face API key found in environment variables"
69
 
70
  # Force the username to be the known value
71
  username = "Misfits-and-Machines"
@@ -73,61 +73,83 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
73
  # Initialize Hugging Face API
74
  hf_api = HfApi(token=api_key)
75
 
76
- # Sanitize dataset name and create repo_id
77
- safe_dataset_name = dataset_name.replace(" ", "-").lower()
78
  repo_id = f"{username}/{safe_dataset_name}"
79
 
80
  print(f"Creating dataset repository: {repo_id}")
81
 
82
- # Create the repository explicitly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
85
- print(f"Repository {repo_id} created or confirmed")
 
 
 
 
 
86
  except Exception as repo_error:
87
- print(f"Repository creation note: {str(repo_error)}")
 
88
 
89
- # Build conversation object
90
- conversation = {
91
- "text": conversation_data,
92
- "timestamp": datetime.datetime.now().isoformat(),
93
- "dataset_id": str(uuid.uuid4())
94
- }
95
 
96
- # Create a Hugging Face dataset
97
- dataset = Dataset.from_dict({"conversations": [conversation]})
 
98
 
99
- # Push dataset to the Hub with minimal parameters
100
  dataset.push_to_hub(
101
  repo_id=repo_id,
102
- token=api_key
 
 
103
  )
104
 
105
- dataset_url = f"https://huggingface.co/datasets/{repo_id}"
106
  print(f"Dataset successfully pushed to: {dataset_url}")
107
- return f"Successfully created dataset: {dataset_url}"
108
  except Exception as e:
109
  import traceback
110
  error_trace = traceback.format_exc()
111
  print(f"Dataset creation error: {str(e)}\n{error_trace}")
112
- return f"Error creating dataset: {str(e)}"
113
 
114
  @tool
115
  def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
116
  """A tool that posts a new dataset of the current conversation to Hugging Face.
117
 
118
  Args:
119
- dataset_name: Name for the dataset (will be prefixed with username)
120
  conversation_data: String content to save to the dataset (no JSON conversion needed)
 
 
 
121
  """
122
  try:
123
  print(f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data")
 
124
  result = Dataset_Creator_Function(dataset_name, conversation_data)
125
- print(f"Result: {result}")
126
  return result
127
  except Exception as e:
128
  import traceback
129
  error_trace = traceback.format_exc()
130
- return f"Error using Dataset Creator tool: {str(e)}\n{error_trace}"
131
 
132
 
133
  @tool
 
65
  # Get API key from environment variables
66
  api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY", "")
67
  if not api_key:
68
+ return "Error: No Hugging Face API key found in environment variables. Please set HF_API_KEY or HUGGINGFACE_API_KEY."
69
 
70
  # Force the username to be the known value
71
  username = "Misfits-and-Machines"
 
73
  # Initialize Hugging Face API
74
  hf_api = HfApi(token=api_key)
75
 
76
+ # Sanitize dataset name - use underscores instead of dashes for better compatibility
77
+ safe_dataset_name = dataset_name.replace(" ", "_").lower()
78
  repo_id = f"{username}/{safe_dataset_name}"
79
 
80
  print(f"Creating dataset repository: {repo_id}")
81
 
82
+ # Prepare dataset with appropriate structure
83
+ # First, ensure we have a proper train split with necessary fields
84
+ dataset_dict = {
85
+ "text": [conversation_data],
86
+ "timestamp": [datetime.datetime.now().isoformat()],
87
+ "dataset_id": [str(uuid.uuid4())]
88
+ }
89
+
90
+ # Create a Hugging Face dataset
91
+ dataset = Dataset.from_dict(dataset_dict)
92
+
93
+ # Standard practice is to have a train split for datasets
94
+ dataset_dict = {"train": dataset}
95
+
96
+ # Check if the repository already exists
97
  try:
98
+ repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
99
+ if repo_exists:
100
+ print(f"Repository {repo_id} already exists")
101
+ else:
102
+ # Create repo if it doesn't exist
103
+ hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
104
+ print(f"Repository {repo_id} created successfully")
105
  except Exception as repo_error:
106
+ print(f"Repository check/creation error: {str(repo_error)}")
107
+ # Continue anyway as push_to_hub might create the repo
108
 
109
+ # Push dataset to the Hub with appropriate parameters
110
+ print(f"Pushing dataset to {repo_id}")
 
 
 
 
111
 
112
+ # Create URL for monitoring - we'll show this to the user so they can check progress
113
+ dataset_url = f"https://huggingface.co/datasets/{repo_id}"
114
+ print(f"Dataset URL will be: {dataset_url}")
115
 
116
+ # Push with careful parameter selection
117
  dataset.push_to_hub(
118
  repo_id=repo_id,
119
+ token=api_key,
120
+ split="train", # Use a proper split name
121
+ commit_message=f"Upload dataset: {dataset_name}"
122
  )
123
 
 
124
  print(f"Dataset successfully pushed to: {dataset_url}")
125
+ return f"Successfully created dataset at {dataset_url} - please check this URL to verify your dataset is visible"
126
  except Exception as e:
127
  import traceback
128
  error_trace = traceback.format_exc()
129
  print(f"Dataset creation error: {str(e)}\n{error_trace}")
130
+ return f"Error creating dataset: {str(e)}\n\nTo troubleshoot:\n1. Verify API key is valid\n2. Try with a different dataset name\n3. Check if you have write permissions for the Misfits-and-Machines organization"
131
 
132
  @tool
133
  def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
134
  """A tool that posts a new dataset of the current conversation to Hugging Face.
135
 
136
  Args:
137
+ dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
138
  conversation_data: String content to save to the dataset (no JSON conversion needed)
139
+
140
+ Returns:
141
+ Link to the created dataset or error message with troubleshooting steps
142
  """
143
  try:
144
  print(f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data")
145
+ print(f"Dataset will be created at Misfits-and-Machines/{dataset_name.replace(' ', '_').lower()}")
146
  result = Dataset_Creator_Function(dataset_name, conversation_data)
147
+ print(f"Dataset creation result: {result}")
148
  return result
149
  except Exception as e:
150
  import traceback
151
  error_trace = traceback.format_exc()
152
+ return f"Error using Dataset Creator tool: {str(e)}\n{error_trace}\n\nPlease try with a simpler dataset name using only letters, numbers and underscores."
153
 
154
 
155
  @tool