SolshineMisfit committed on
Commit
d8d214d
·
verified ·
1 Parent(s): 13b9d4a

Add tool to validate datasets on HF Hub

Browse files
Files changed (1) hide show
  1. app.py +73 -8
app.py CHANGED
@@ -79,34 +79,74 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
79
 
80
  print(f"Creating dataset: {repo_id}")
81
 
82
- # Create a simple dataset from a dictionary
83
  data = {
84
  "text": [conversation_data],
85
  "timestamp": [datetime.datetime.now().isoformat()],
86
  "id": [str(uuid.uuid4())]
87
  }
88
 
89
- # Create the dataset directly
 
 
 
 
 
 
 
90
  dataset = Dataset.from_dict(data)
91
 
92
- # Push to Hugging Face Hub using the simpler method from documentation
93
  dataset.push_to_hub(
94
- repo_id=repo_id, # Include username in repo_id
95
- token=api_key, # Pass token explicitly
96
- private=False # Make it public
97
  )
98
 
99
  # Generate the URL for the dataset
100
  dataset_url = f"https://huggingface.co/datasets/{repo_id}"
101
  print(f"Dataset successfully pushed to: {dataset_url}")
102
 
103
- return f"Successfully created dataset at {dataset_url}"
 
 
 
 
 
 
104
  except Exception as e:
105
  import traceback
106
  error_trace = traceback.format_exc()
107
  print(f"Dataset creation error: {str(e)}\n{error_trace}")
108
  return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores\n3. Check your permissions for the Misfits-and-Machines organization"
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  @tool
111
  def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
112
  """A tool that posts a new dataset of the current conversation to Hugging Face.
@@ -129,6 +169,30 @@ def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
129
  error_trace = traceback.format_exc()
130
  return f"Error using Dataset Creator tool: {str(e)}\n{error_trace}"
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  @tool
134
  def get_current_time_in_timezone(timezone: str) -> str:
@@ -169,7 +233,8 @@ agent = CodeAgent(
169
  Sonar_Web_Search_Tool,
170
  get_current_time_in_timezone,
171
  image_generation_tool,
172
- Dataset_Creator_Tool
 
173
  ],
174
  max_steps=6,
175
  verbosity_level=1,
 
79
 
80
  print(f"Creating dataset: {repo_id}")
81
 
82
+ # Create a simple dataset from a dictionary with a train split
83
  data = {
84
  "text": [conversation_data],
85
  "timestamp": [datetime.datetime.now().isoformat()],
86
  "id": [str(uuid.uuid4())]
87
  }
88
 
89
+ # Explicitly ensure the repository exists
90
+ try:
91
+ hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
92
+ print(f"Repository {repo_id} created or already exists")
93
+ except Exception as repo_error:
94
+ print(f"Note about repository creation: {str(repo_error)}")
95
+
96
+ # Create the dataset and push to hub
97
  dataset = Dataset.from_dict(data)
98
 
99
+ # Push to Hugging Face Hub using the standard method
100
  dataset.push_to_hub(
101
+ repo_id=repo_id,
102
+ token=api_key,
103
+ private=False
104
  )
105
 
106
  # Generate the URL for the dataset
107
  dataset_url = f"https://huggingface.co/datasets/{repo_id}"
108
  print(f"Dataset successfully pushed to: {dataset_url}")
109
 
110
+ # Verify dataset exists by checking the API
111
+ verify_result = verify_dataset_exists(repo_id)
112
+ if verify_result["exists"]:
113
+ return f"Successfully created dataset at {dataset_url}"
114
+ else:
115
+ return f"Dataset was uploaded, but it may take a few minutes to appear at {dataset_url}. Error: {verify_result['message']}"
116
+
117
  except Exception as e:
118
  import traceback
119
  error_trace = traceback.format_exc()
120
  print(f"Dataset creation error: {str(e)}\n{error_trace}")
121
  return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores\n3. Check your permissions for the Misfits-and-Machines organization"
122
 
123
def verify_dataset_exists(repo_id: str, timeout: float = 10.0) -> dict:
    """Verify that a dataset exists and is valid on the Hugging Face Hub.

    Queries the datasets-server ``is-valid`` endpoint and reports the dataset
    as existing when the response marks either ``viewer`` or ``preview`` as
    truthy. This function never raises: every failure is reported through the
    returned dict so callers can embed the message in user-facing text.

    Args:
        repo_id: Full repository ID in format "username/dataset_name"
        timeout: Seconds to wait for the datasets-server before giving up.
            Without a timeout a stalled connection would block the caller
            indefinitely.

    Returns:
        Dict with "exists" boolean and "message" string
    """
    # Reject blank or non-string IDs up front instead of spending a network
    # round trip on a request that cannot succeed.
    if not isinstance(repo_id, str) or not repo_id.strip():
        return {"exists": False, "message": "No dataset repository ID provided"}

    try:
        # Pass the ID through `params` so it is URL-encoded rather than
        # spliced raw into the query string.
        response = requests.get(
            "https://datasets-server.huggingface.co/is-valid",
            params={"dataset": repo_id.strip()},
            timeout=timeout,
        )

        if response.status_code != 200:
            return {"exists": False, "message": f"API returned status code {response.status_code}"}

        data = response.json()
        # If any of these are True, the dataset exists in some form
        if data.get("viewer", False) or data.get("preview", False):
            return {"exists": True, "message": "Dataset is valid and accessible"}
        return {"exists": False, "message": "Dataset exists but may not be fully processed yet"}
    except Exception as e:
        # Deliberately broad: network errors, timeouts and malformed JSON are
        # all reported as "not verified" rather than propagated.
        return {"exists": False, "message": f"Error verifying dataset: {str(e)}"}
149
+
150
  @tool
151
  def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
152
  """A tool that posts a new dataset of the current conversation to Hugging Face.
 
169
  error_trace = traceback.format_exc()
170
  return f"Error using Dataset Creator tool: {str(e)}\n{error_trace}"
171
 
172
@tool
def Check_Dataset_Validity(dataset_name: str) -> str:
    """A tool that checks if a dataset exists and is valid on Hugging Face.

    Args:
        dataset_name: Name of the dataset to check (with or without organization prefix)

    Returns:
        Status message about the dataset validity
    """
    # NOTE(review): the @tool decorator presumably exposes the docstring above
    # to the agent, so its wording is kept as-is - confirm before editing it.
    try:
        # A bare name (no "/") is normalised (spaces -> underscores, lower
        # case) and placed under the Misfits-and-Machines organization;
        # names that already carry a prefix are used untouched.
        if "/" not in dataset_name:
            slug = dataset_name.replace(' ', '_').lower()
            dataset_name = f"Misfits-and-Machines/{slug}"

        verification = verify_dataset_exists(dataset_name)

        # Guard clause: report the unverified case first, success falls through.
        if not verification["exists"]:
            return f"Dataset '{dataset_name}' could not be verified: {verification['message']}. It may still be processing or may not exist."
        return f"Dataset '{dataset_name}' exists and is valid. You can access it at https://huggingface.co/datasets/{dataset_name}"
    except Exception as e:
        # Any unexpected failure is turned into a message instead of raising.
        return f"Error checking dataset validity: {str(e)}"
196
 
197
  @tool
198
  def get_current_time_in_timezone(timezone: str) -> str:
 
233
  Sonar_Web_Search_Tool,
234
  get_current_time_in_timezone,
235
  image_generation_tool,
236
+ Dataset_Creator_Tool,
237
+ Check_Dataset_Validity
238
  ],
239
  max_steps=6,
240
  verbosity_level=1,