Spaces:
Runtime error
Runtime error
The agent can now use this tool with clearer feedback on what is happening, and users are directed to the URL where they can verify that the dataset appeared on Hugging Face.
Browse files
1. Uses underscores instead of dashes in dataset names for better compatibility
2. Creates a proper dataset structure with a "train" split (standard practice for HF datasets)
3. Sets split="train" in push_to_hub to ensure proper organization
4. Provides more verbose logging throughout the process
5. Returns a clear URL for the user to check their dataset
6. Offers troubleshooting steps when errors occur
7. Properly handles repository existence checks
8. Uses a more descriptive commit message
app.py
CHANGED
|
@@ -65,7 +65,7 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 65 |
# Get API key from environment variables
|
| 66 |
api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY", "")
|
| 67 |
if not api_key:
|
| 68 |
-
return "Error: No Hugging Face API key found in environment variables"
|
| 69 |
|
| 70 |
# Force the username to be the known value
|
| 71 |
username = "Misfits-and-Machines"
|
|
@@ -73,61 +73,83 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 73 |
# Initialize Hugging Face API
|
| 74 |
hf_api = HfApi(token=api_key)
|
| 75 |
|
| 76 |
-
# Sanitize dataset name
|
| 77 |
-
safe_dataset_name = dataset_name.replace(" ", "
|
| 78 |
repo_id = f"{username}/{safe_dataset_name}"
|
| 79 |
|
| 80 |
print(f"Creating dataset repository: {repo_id}")
|
| 81 |
|
| 82 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
try:
|
| 84 |
-
hf_api.
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
except Exception as repo_error:
|
| 87 |
-
print(f"Repository creation
|
|
|
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
"text": conversation_data,
|
| 92 |
-
"timestamp": datetime.datetime.now().isoformat(),
|
| 93 |
-
"dataset_id": str(uuid.uuid4())
|
| 94 |
-
}
|
| 95 |
|
| 96 |
-
# Create
|
| 97 |
-
|
|
|
|
| 98 |
|
| 99 |
-
# Push
|
| 100 |
dataset.push_to_hub(
|
| 101 |
repo_id=repo_id,
|
| 102 |
-
token=api_key
|
|
|
|
|
|
|
| 103 |
)
|
| 104 |
|
| 105 |
-
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|
| 106 |
print(f"Dataset successfully pushed to: {dataset_url}")
|
| 107 |
-
return f"Successfully created dataset
|
| 108 |
except Exception as e:
|
| 109 |
import traceback
|
| 110 |
error_trace = traceback.format_exc()
|
| 111 |
print(f"Dataset creation error: {str(e)}\n{error_trace}")
|
| 112 |
-
return f"Error creating dataset: {str(e)}"
|
| 113 |
|
| 114 |
@tool
|
| 115 |
def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
|
| 116 |
"""A tool that posts a new dataset of the current conversation to Hugging Face.
|
| 117 |
|
| 118 |
Args:
|
| 119 |
-
dataset_name: Name for the dataset (will be prefixed with
|
| 120 |
conversation_data: String content to save to the dataset (no JSON conversion needed)
|
|
|
|
|
|
|
|
|
|
| 121 |
"""
|
| 122 |
try:
|
| 123 |
print(f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data")
|
|
|
|
| 124 |
result = Dataset_Creator_Function(dataset_name, conversation_data)
|
| 125 |
-
print(f"
|
| 126 |
return result
|
| 127 |
except Exception as e:
|
| 128 |
import traceback
|
| 129 |
error_trace = traceback.format_exc()
|
| 130 |
-
return f"Error using Dataset Creator tool: {str(e)}\n{error_trace}"
|
| 131 |
|
| 132 |
|
| 133 |
@tool
|
|
|
|
| 65 |
# Get API key from environment variables
|
| 66 |
api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY", "")
|
| 67 |
if not api_key:
|
| 68 |
+
return "Error: No Hugging Face API key found in environment variables. Please set HF_API_KEY or HUGGINGFACE_API_KEY."
|
| 69 |
|
| 70 |
# Force the username to be the known value
|
| 71 |
username = "Misfits-and-Machines"
|
|
|
|
| 73 |
# Initialize Hugging Face API
|
| 74 |
hf_api = HfApi(token=api_key)
|
| 75 |
|
| 76 |
+
# Sanitize dataset name - use underscores instead of dashes for better compatibility
|
| 77 |
+
safe_dataset_name = dataset_name.replace(" ", "_").lower()
|
| 78 |
repo_id = f"{username}/{safe_dataset_name}"
|
| 79 |
|
| 80 |
print(f"Creating dataset repository: {repo_id}")
|
| 81 |
|
| 82 |
+
# Prepare dataset with appropriate structure
|
| 83 |
+
# First, ensure we have a proper train split with necessary fields
|
| 84 |
+
dataset_dict = {
|
| 85 |
+
"text": [conversation_data],
|
| 86 |
+
"timestamp": [datetime.datetime.now().isoformat()],
|
| 87 |
+
"dataset_id": [str(uuid.uuid4())]
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
# Create a Hugging Face dataset
|
| 91 |
+
dataset = Dataset.from_dict(dataset_dict)
|
| 92 |
+
|
| 93 |
+
# Standard practice is to have a train split for datasets
|
| 94 |
+
dataset_dict = {"train": dataset}
|
| 95 |
+
|
| 96 |
+
# Check if the repository already exists
|
| 97 |
try:
|
| 98 |
+
repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
|
| 99 |
+
if repo_exists:
|
| 100 |
+
print(f"Repository {repo_id} already exists")
|
| 101 |
+
else:
|
| 102 |
+
# Create repo if it doesn't exist
|
| 103 |
+
hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
|
| 104 |
+
print(f"Repository {repo_id} created successfully")
|
| 105 |
except Exception as repo_error:
|
| 106 |
+
print(f"Repository check/creation error: {str(repo_error)}")
|
| 107 |
+
# Continue anyway as push_to_hub might create the repo
|
| 108 |
|
| 109 |
+
# Push dataset to the Hub with appropriate parameters
|
| 110 |
+
print(f"Pushing dataset to {repo_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
+
# Create URL for monitoring - we'll show this to the user so they can check progress
|
| 113 |
+
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|
| 114 |
+
print(f"Dataset URL will be: {dataset_url}")
|
| 115 |
|
| 116 |
+
# Push with careful parameter selection
|
| 117 |
dataset.push_to_hub(
|
| 118 |
repo_id=repo_id,
|
| 119 |
+
token=api_key,
|
| 120 |
+
split="train", # Use a proper split name
|
| 121 |
+
commit_message=f"Upload dataset: {dataset_name}"
|
| 122 |
)
|
| 123 |
|
|
|
|
| 124 |
print(f"Dataset successfully pushed to: {dataset_url}")
|
| 125 |
+
return f"Successfully created dataset at {dataset_url} - please check this URL to verify your dataset is visible"
|
| 126 |
except Exception as e:
|
| 127 |
import traceback
|
| 128 |
error_trace = traceback.format_exc()
|
| 129 |
print(f"Dataset creation error: {str(e)}\n{error_trace}")
|
| 130 |
+
return f"Error creating dataset: {str(e)}\n\nTo troubleshoot:\n1. Verify API key is valid\n2. Try with a different dataset name\n3. Check if you have write permissions for the Misfits-and-Machines organization"
|
| 131 |
|
| 132 |
@tool
|
| 133 |
def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
|
| 134 |
"""A tool that posts a new dataset of the current conversation to Hugging Face.
|
| 135 |
|
| 136 |
Args:
|
| 137 |
+
dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
|
| 138 |
conversation_data: String content to save to the dataset (no JSON conversion needed)
|
| 139 |
+
|
| 140 |
+
Returns:
|
| 141 |
+
Link to the created dataset or error message with troubleshooting steps
|
| 142 |
"""
|
| 143 |
try:
|
| 144 |
print(f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data")
|
| 145 |
+
print(f"Dataset will be created at Misfits-and-Machines/{dataset_name.replace(' ', '_').lower()}")
|
| 146 |
result = Dataset_Creator_Function(dataset_name, conversation_data)
|
| 147 |
+
print(f"Dataset creation result: {result}")
|
| 148 |
return result
|
| 149 |
except Exception as e:
|
| 150 |
import traceback
|
| 151 |
error_trace = traceback.format_exc()
|
| 152 |
+
return f"Error using Dataset Creator tool: {str(e)}\n{error_trace}\n\nPlease try with a simpler dataset name using only letters, numbers and underscores."
|
| 153 |
|
| 154 |
|
| 155 |
@tool
|