Spaces:
Runtime error
Runtime error
json dataframe updates
Browse files
In this updated solution, the JSON input is converted into a pandas DataFrame, wrapped in a Dataset object, and then placed in a DatasetDict under the "train" split before being pushed to the Hub. This ensures that every entry in a JSON array becomes its own row in the "train" split, allowing for multiple updates or separate entries as desired.
app.py
CHANGED
|
@@ -61,9 +61,9 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 61 |
Args:
|
| 62 |
dataset_name: Name for the dataset (will be prefixed with username)
|
| 63 |
conversation_data: String representing the conversation data. Can be:
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
|
| 68 |
Returns:
|
| 69 |
URL of the created dataset or error message
|
|
@@ -72,30 +72,25 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 72 |
# Required imports
|
| 73 |
import json
|
| 74 |
import pandas as pd
|
| 75 |
-
from datasets import Dataset
|
| 76 |
from huggingface_hub import HfApi
|
| 77 |
|
| 78 |
-
# Get API key
|
| 79 |
api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
|
| 80 |
if not api_key:
|
| 81 |
return "Error: No Hugging Face API key found in environment variables"
|
| 82 |
|
| 83 |
-
# Set fixed username
|
| 84 |
username = "Misfits-and-Machines"
|
| 85 |
-
|
| 86 |
-
# Initialize Hugging Face API
|
| 87 |
-
hf_api = HfApi(token=api_key)
|
| 88 |
-
|
| 89 |
-
# Sanitize dataset name
|
| 90 |
safe_dataset_name = dataset_name.replace(" ", "_").lower()
|
| 91 |
repo_id = f"{username}/{safe_dataset_name}"
|
| 92 |
|
| 93 |
print(f"Creating dataset: {repo_id}")
|
| 94 |
|
| 95 |
-
#
|
|
|
|
| 96 |
try:
|
| 97 |
-
|
| 98 |
-
if not repo_exists:
|
| 99 |
hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
|
| 100 |
print(f"Created repository: {repo_id}")
|
| 101 |
else:
|
|
@@ -103,36 +98,30 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 103 |
except Exception as e:
|
| 104 |
print(f"Note when checking/creating repository: {str(e)}")
|
| 105 |
|
| 106 |
-
# Process
|
|
|
|
| 107 |
try:
|
| 108 |
-
# Try parsing as JSON first
|
| 109 |
json_data = json.loads(conversation_data)
|
| 110 |
-
|
| 111 |
if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
|
| 112 |
-
# Process JSON array of objects (preferred format)
|
| 113 |
print(f"Processing JSON array with {len(json_data)} items")
|
| 114 |
df = pd.DataFrame(json_data)
|
| 115 |
-
|
|
|
|
| 116 |
elif isinstance(json_data, dict):
|
| 117 |
-
# Single JSON object
|
| 118 |
print("Processing single JSON object")
|
| 119 |
df = pd.DataFrame([json_data])
|
| 120 |
-
|
|
|
|
| 121 |
else:
|
| 122 |
-
raise ValueError("JSON
|
| 123 |
except (json.JSONDecodeError, ValueError) as e:
|
| 124 |
-
# Not valid JSON or not in expected format
|
| 125 |
print(f"Not processing as JSON: {str(e)}")
|
| 126 |
-
|
| 127 |
-
# Check if pipe-separated format
|
| 128 |
lines = conversation_data.strip().split('\n')
|
| 129 |
if '|' in conversation_data and len(lines) > 1:
|
| 130 |
print("Processing as pipe-separated data")
|
| 131 |
-
|
| 132 |
-
# Parse headers and data rows
|
| 133 |
headers = [h.strip() for h in lines[0].split('|')]
|
| 134 |
data = []
|
| 135 |
-
|
| 136 |
for i, line in enumerate(lines[1:], 1):
|
| 137 |
if not line.strip():
|
| 138 |
continue
|
|
@@ -141,35 +130,33 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 141 |
data.append(dict(zip(headers, values)))
|
| 142 |
else:
|
| 143 |
print(f"Warning: Skipping row {i} (column count mismatch)")
|
| 144 |
-
|
| 145 |
if data:
|
| 146 |
df = pd.DataFrame(data)
|
| 147 |
-
|
|
|
|
| 148 |
else:
|
| 149 |
-
|
| 150 |
-
dataset = Dataset.from_dict({"text": [conversation_data]})
|
| 151 |
else:
|
| 152 |
-
#
|
| 153 |
print("Processing as plain text")
|
| 154 |
-
|
| 155 |
|
| 156 |
-
# Push
|
| 157 |
print(f"Pushing dataset to {repo_id}")
|
| 158 |
-
|
| 159 |
repo_id=repo_id,
|
| 160 |
token=api_key,
|
| 161 |
-
|
| 162 |
)
|
| 163 |
|
| 164 |
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|
| 165 |
print(f"Dataset successfully pushed to: {dataset_url}")
|
| 166 |
-
|
| 167 |
return f"Successfully created dataset at {dataset_url}"
|
| 168 |
except Exception as e:
|
| 169 |
import traceback
|
| 170 |
error_trace = traceback.format_exc()
|
| 171 |
print(f"Dataset creation error: {str(e)}\n{error_trace}")
|
| 172 |
-
return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2.
|
| 173 |
|
| 174 |
@tool
|
| 175 |
def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
|
|
@@ -266,13 +253,14 @@ def get_current_time_in_timezone(timezone: str) -> str:
|
|
| 266 |
return f"Error fetching time for timezone '{timezone}': {str(e)}"
|
| 267 |
|
| 268 |
|
| 269 |
-
# Update the model configuration to use Qwen2.5-Coder
|
| 270 |
final_answer = FinalAnswerTool()
|
|
|
|
|
|
|
| 271 |
model = HfApiModel(
|
| 272 |
max_tokens=2096,
|
| 273 |
temperature=0.5,
|
| 274 |
-
model_id='
|
| 275 |
-
custom_role_conversions=None
|
| 276 |
)
|
| 277 |
|
| 278 |
# Import tool from Hub
|
|
@@ -281,18 +269,18 @@ image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_co
|
|
| 281 |
with open("prompts.yaml", 'r') as stream:
|
| 282 |
prompt_templates = yaml.safe_load(stream)
|
| 283 |
|
| 284 |
-
# Updated CodeAgent with tools
|
| 285 |
agent = CodeAgent(
|
| 286 |
model=model,
|
| 287 |
tools=[
|
| 288 |
-
final_answer,
|
| 289 |
Sonar_Web_Search_Tool,
|
| 290 |
-
ddg_search_tool,
|
|
|
|
| 291 |
get_current_time_in_timezone,
|
| 292 |
image_generation_tool,
|
| 293 |
Dataset_Creator_Tool,
|
| 294 |
Check_Dataset_Validity
|
| 295 |
-
],
|
| 296 |
max_steps=6,
|
| 297 |
verbosity_level=1,
|
| 298 |
grammar=None,
|
|
@@ -302,7 +290,8 @@ agent = CodeAgent(
|
|
| 302 |
prompt_templates=prompt_templates
|
| 303 |
)
|
| 304 |
|
| 305 |
-
#
|
|
|
|
| 306 |
try:
|
| 307 |
GradioUI(agent).launch()
|
| 308 |
except TypeError as e:
|
|
|
|
| 61 |
Args:
|
| 62 |
dataset_name: Name for the dataset (will be prefixed with username)
|
| 63 |
conversation_data: String representing the conversation data. Can be:
|
| 64 |
+
- JSON array of objects (each object becomes a row)
|
| 65 |
+
- Pipe-separated values (first row as headers, subsequent rows as values)
|
| 66 |
+
- Plain text (stored in a single 'text' column)
|
| 67 |
|
| 68 |
Returns:
|
| 69 |
URL of the created dataset or error message
|
|
|
|
| 72 |
# Required imports
|
| 73 |
import json
|
| 74 |
import pandas as pd
|
| 75 |
+
from datasets import Dataset, DatasetDict
|
| 76 |
from huggingface_hub import HfApi
|
| 77 |
|
| 78 |
+
# Get API key
|
| 79 |
api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
|
| 80 |
if not api_key:
|
| 81 |
return "Error: No Hugging Face API key found in environment variables"
|
| 82 |
|
| 83 |
+
# Set fixed username
|
| 84 |
username = "Misfits-and-Machines"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
safe_dataset_name = dataset_name.replace(" ", "_").lower()
|
| 86 |
repo_id = f"{username}/{safe_dataset_name}"
|
| 87 |
|
| 88 |
print(f"Creating dataset: {repo_id}")
|
| 89 |
|
| 90 |
+
# Ensure repository exists
|
| 91 |
+
hf_api = HfApi(token=api_key)
|
| 92 |
try:
|
| 93 |
+
if not hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
|
|
|
|
| 94 |
hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
|
| 95 |
print(f"Created repository: {repo_id}")
|
| 96 |
else:
|
|
|
|
| 98 |
except Exception as e:
|
| 99 |
print(f"Note when checking/creating repository: {str(e)}")
|
| 100 |
|
| 101 |
+
# Process input data
|
| 102 |
+
created_ds = None
|
| 103 |
try:
|
|
|
|
| 104 |
json_data = json.loads(conversation_data)
|
|
|
|
| 105 |
if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
|
|
|
|
| 106 |
print(f"Processing JSON array with {len(json_data)} items")
|
| 107 |
df = pd.DataFrame(json_data)
|
| 108 |
+
ds = Dataset.from_pandas(df)
|
| 109 |
+
created_ds = DatasetDict({"train": ds})
|
| 110 |
elif isinstance(json_data, dict):
|
|
|
|
| 111 |
print("Processing single JSON object")
|
| 112 |
df = pd.DataFrame([json_data])
|
| 113 |
+
ds = Dataset.from_pandas(df)
|
| 114 |
+
created_ds = DatasetDict({"train": ds})
|
| 115 |
else:
|
| 116 |
+
raise ValueError("JSON not recognized as array or single object")
|
| 117 |
except (json.JSONDecodeError, ValueError) as e:
|
|
|
|
| 118 |
print(f"Not processing as JSON: {str(e)}")
|
| 119 |
+
# Try pipe-separated format
|
|
|
|
| 120 |
lines = conversation_data.strip().split('\n')
|
| 121 |
if '|' in conversation_data and len(lines) > 1:
|
| 122 |
print("Processing as pipe-separated data")
|
|
|
|
|
|
|
| 123 |
headers = [h.strip() for h in lines[0].split('|')]
|
| 124 |
data = []
|
|
|
|
| 125 |
for i, line in enumerate(lines[1:], 1):
|
| 126 |
if not line.strip():
|
| 127 |
continue
|
|
|
|
| 130 |
data.append(dict(zip(headers, values)))
|
| 131 |
else:
|
| 132 |
print(f"Warning: Skipping row {i} (column count mismatch)")
|
|
|
|
| 133 |
if data:
|
| 134 |
df = pd.DataFrame(data)
|
| 135 |
+
ds = Dataset.from_pandas(df)
|
| 136 |
+
created_ds = DatasetDict({"train": ds})
|
| 137 |
else:
|
| 138 |
+
created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
|
|
|
|
| 139 |
else:
|
| 140 |
+
# Fallback for plain text
|
| 141 |
print("Processing as plain text")
|
| 142 |
+
created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
|
| 143 |
|
| 144 |
+
# Push using the DatasetDict push_to_hub method.
|
| 145 |
print(f"Pushing dataset to {repo_id}")
|
| 146 |
+
created_ds.push_to_hub(
|
| 147 |
repo_id=repo_id,
|
| 148 |
token=api_key,
|
| 149 |
+
commit_message=f"Upload dataset: {dataset_name}"
|
| 150 |
)
|
| 151 |
|
| 152 |
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|
| 153 |
print(f"Dataset successfully pushed to: {dataset_url}")
|
|
|
|
| 154 |
return f"Successfully created dataset at {dataset_url}"
|
| 155 |
except Exception as e:
|
| 156 |
import traceback
|
| 157 |
error_trace = traceback.format_exc()
|
| 158 |
print(f"Dataset creation error: {str(e)}\n{error_trace}")
|
| 159 |
+
return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Use a simpler dataset name (letters and underscores only)"
|
| 160 |
|
| 161 |
@tool
|
| 162 |
def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
|
|
|
|
| 253 |
return f"Error fetching time for timezone '{timezone}': {str(e)}"
|
| 254 |
|
| 255 |
|
|
|
|
| 256 |
final_answer = FinalAnswerTool()
|
| 257 |
+
|
| 258 |
+
# Remove the huggingface_api_key parameter - it's not supported
|
| 259 |
model = HfApiModel(
|
| 260 |
max_tokens=2096,
|
| 261 |
temperature=0.5,
|
| 262 |
+
model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # Using the backup endpoint
|
| 263 |
+
custom_role_conversions=None
|
| 264 |
)
|
| 265 |
|
| 266 |
# Import tool from Hub
|
|
|
|
| 269 |
with open("prompts.yaml", 'r') as stream:
|
| 270 |
prompt_templates = yaml.safe_load(stream)
|
| 271 |
|
|
|
|
| 272 |
agent = CodeAgent(
|
| 273 |
model=model,
|
| 274 |
tools=[
|
| 275 |
+
final_answer,
|
| 276 |
Sonar_Web_Search_Tool,
|
| 277 |
+
ddg_search_tool, # Added DuckDuckGo search tool
|
| 278 |
+
# google_search_tool, # Added Google search tool
|
| 279 |
get_current_time_in_timezone,
|
| 280 |
image_generation_tool,
|
| 281 |
Dataset_Creator_Tool,
|
| 282 |
Check_Dataset_Validity
|
| 283 |
+
],
|
| 284 |
max_steps=6,
|
| 285 |
verbosity_level=1,
|
| 286 |
grammar=None,
|
|
|
|
| 290 |
prompt_templates=prompt_templates
|
| 291 |
)
|
| 292 |
|
| 293 |
+
# To fix the TypeError in Gradio_UI.py, you would need to modify that file
|
| 294 |
+
# For now, we'll just use the agent directly
|
| 295 |
try:
|
| 296 |
GradioUI(agent).launch()
|
| 297 |
except TypeError as e:
|