SolshineMisfit committed
Commit 9e2fccb · verified · 1 Parent(s): ad13edc

json dataframe updates

In this updated solution, the JSON input is converted into a pandas DataFrame, wrapped in a Dataset object, and placed in a DatasetDict under the "train" split before being pushed to the Hub. This ensures that every entry in a JSON array becomes its own row in the "train" split, allowing multiple updates or separate entries as desired.
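For reference, a minimal sketch of that flow with the datasets library (the sample rows and the "example_dataset" repo name below are illustrative, not taken from the commit; the token is read from HF_API_KEY as in app.py):

import json, os
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

# Illustrative input: a JSON array of objects, one object per intended row
conversation_data = '[{"role": "user", "text": "hi"}, {"role": "assistant", "text": "hello"}]'

records = json.loads(conversation_data)      # parse the JSON array
df = pd.DataFrame(records)                   # each object becomes one DataFrame row
ds = Dataset.from_pandas(df)                 # wrap the DataFrame in a Dataset
created_ds = DatasetDict({"train": ds})      # register it as the "train" split

# Push the whole DatasetDict so the split name is preserved on the Hub
created_ds.push_to_hub("Misfits-and-Machines/example_dataset", token=os.getenv("HF_API_KEY"))

# Reading it back should yield one row per JSON object
reloaded = load_dataset("Misfits-and-Machines/example_dataset", split="train")
print(len(reloaded), reloaded.column_names)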

Files changed (1)
  1. app.py +36 -47
app.py CHANGED
@@ -61,9 +61,9 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
     Args:
         dataset_name: Name for the dataset (will be prefixed with username)
         conversation_data: String representing the conversation data. Can be:
-            - JSON array of objects (each object becomes a row)
-            - Pipe-separated values (col1 | col2 | col3) for tabular data
-            - Plain text (stored in a 'text' column)
+            - JSON array of objects (each object becomes a row)
+            - Pipe-separated values (first row as headers, subsequent rows as values)
+            - Plain text (stored in a single 'text' column)
 
     Returns:
         URL of the created dataset or error message
@@ -72,30 +72,25 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         # Required imports
         import json
         import pandas as pd
-        from datasets import Dataset
+        from datasets import Dataset, DatasetDict
         from huggingface_hub import HfApi
 
-        # Get API key from environment variables
+        # Get API key
         api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
         if not api_key:
             return "Error: No Hugging Face API key found in environment variables"
 
-        # Set fixed username for dataset organization
+        # Set fixed username
         username = "Misfits-and-Machines"
-
-        # Initialize Hugging Face API
-        hf_api = HfApi(token=api_key)
-
-        # Sanitize dataset name
         safe_dataset_name = dataset_name.replace(" ", "_").lower()
         repo_id = f"{username}/{safe_dataset_name}"
 
         print(f"Creating dataset: {repo_id}")
 
-        # First ensure the repository exists
+        # Ensure repository exists
+        hf_api = HfApi(token=api_key)
         try:
-            repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
-            if not repo_exists:
+            if not hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
                 hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
                 print(f"Created repository: {repo_id}")
             else:
@@ -103,36 +98,30 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         except Exception as e:
             print(f"Note when checking/creating repository: {str(e)}")
 
-        # Process the data based on format
+        # Process input data
+        created_ds = None
         try:
-            # Try parsing as JSON first
             json_data = json.loads(conversation_data)
-
             if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
-                # Process JSON array of objects (preferred format)
                 print(f"Processing JSON array with {len(json_data)} items")
                 df = pd.DataFrame(json_data)
-                dataset = Dataset.from_pandas(df)
+                ds = Dataset.from_pandas(df)
+                created_ds = DatasetDict({"train": ds})
             elif isinstance(json_data, dict):
-                # Single JSON object
                 print("Processing single JSON object")
                 df = pd.DataFrame([json_data])
-                dataset = Dataset.from_pandas(df)
+                ds = Dataset.from_pandas(df)
+                created_ds = DatasetDict({"train": ds})
             else:
-                raise ValueError("JSON format not recognized as array of objects or single object")
+                raise ValueError("JSON not recognized as array or single object")
         except (json.JSONDecodeError, ValueError) as e:
-            # Not valid JSON or not in expected format
            print(f"Not processing as JSON: {str(e)}")
-
-            # Check if pipe-separated format
+            # Try pipe-separated format
            lines = conversation_data.strip().split('\n')
            if '|' in conversation_data and len(lines) > 1:
                print("Processing as pipe-separated data")
-
-                # Parse headers and data rows
                headers = [h.strip() for h in lines[0].split('|')]
                data = []
-
                for i, line in enumerate(lines[1:], 1):
                    if not line.strip():
                        continue
@@ -141,35 +130,33 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
                        data.append(dict(zip(headers, values)))
                    else:
                        print(f"Warning: Skipping row {i} (column count mismatch)")
-
                if data:
                    df = pd.DataFrame(data)
-                    dataset = Dataset.from_pandas(df)
+                    ds = Dataset.from_pandas(df)
+                    created_ds = DatasetDict({"train": ds})
                else:
-                    # Fallback to text if no valid rows
-                    dataset = Dataset.from_dict({"text": [conversation_data]})
+                    created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
            else:
-                # Plain text
+                # Fallback for plain text
                print("Processing as plain text")
-                dataset = Dataset.from_dict({"text": [conversation_data]})
+                created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
 
-        # Push to Hugging Face Hub
+        # Push using the DatasetDict push_to_hub method.
        print(f"Pushing dataset to {repo_id}")
-        dataset.push_to_hub(
+        created_ds.push_to_hub(
            repo_id=repo_id,
            token=api_key,
-            split="train"
+            commit_message=f"Upload dataset: {dataset_name}"
        )
 
        dataset_url = f"https://huggingface.co/datasets/{repo_id}"
        print(f"Dataset successfully pushed to: {dataset_url}")
-
        return f"Successfully created dataset at {dataset_url}"
    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()
        print(f"Dataset creation error: {str(e)}\n{error_trace}")
-        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores"
+        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Use a simpler dataset name (letters and underscores only)"
 
 @tool
 def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
@@ -266,13 +253,14 @@ def get_current_time_in_timezone(timezone: str) -> str:
         return f"Error fetching time for timezone '{timezone}': {str(e)}"
 
 
-# Update the model configuration to use Qwen2.5-Coder
 final_answer = FinalAnswerTool()
+
+# Remove the huggingface_api_key parameter - it's not supported
 model = HfApiModel(
     max_tokens=2096,
     temperature=0.5,
-    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # Changed to Qwen model
-    custom_role_conversions=None,
+    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',  # Using the backup endpoint
+    custom_role_conversions=None
 )
 
 # Import tool from Hub
@@ -281,18 +269,18 @@ image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_co
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
 
-# Updated CodeAgent with tools
 agent = CodeAgent(
     model=model,
     tools=[
-        final_answer,
+        final_answer,
         Sonar_Web_Search_Tool,
-        ddg_search_tool,
+        ddg_search_tool,  # Added DuckDuckGo search tool
+        # google_search_tool,  # Added Google search tool
         get_current_time_in_timezone,
         image_generation_tool,
         Dataset_Creator_Tool,
         Check_Dataset_Validity
-    ],
+    ],
     max_steps=6,
     verbosity_level=1,
     grammar=None,
@@ -302,7 +290,8 @@ agent = CodeAgent(
     prompt_templates=prompt_templates
 )
 
-# Launch the Gradio UI
+# To fix the TypeError in Gradio_UI.py, you would need to modify that file
+# For now, we'll just use the agent directly
 try:
     GradioUI(agent).launch()
 except TypeError as e: