SolshineMisfit committed on
Commit
6220e54
·
verified ·
1 Parent(s): dfafa93

Changed the method for the Hugging Face data push to docs again and tried to integrate JSON structuring in between

Browse files
Files changed (1) hide show
  1. app.py +52 -104
app.py CHANGED
@@ -69,6 +69,12 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
69
  URL of the created dataset or error message
70
  """
71
  try:
 
 
 
 
 
 
72
  # Get API key from environment variables
73
  api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
74
  if not api_key:
@@ -86,7 +92,7 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
86
 
87
  print(f"Creating dataset: {repo_id}")
88
 
89
- # Check if the repository exists or create it
90
  try:
91
  repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
92
  if not repo_exists:
@@ -97,127 +103,64 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
97
  except Exception as e:
98
  print(f"Note when checking/creating repository: {str(e)}")
99
 
100
- # Check if data is JSON first (preferred format)
101
- is_json = False
102
  try:
103
- # Try to parse as JSON
104
  json_data = json.loads(conversation_data)
105
 
106
- # Check if it's an array of objects (preferred structure)
107
- if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data) and len(json_data) > 0:
108
- print(f"Processing as JSON array with {len(json_data)} items")
109
-
110
- # Extract all keys to ensure consistent columns
111
- all_keys = set()
112
- for item in json_data:
113
- all_keys.update(item.keys())
114
- all_keys = sorted(list(all_keys)) # Sort keys for consistent order
115
-
116
- print(f"Detected columns: {', '.join(all_keys)}")
117
-
118
- # Create dataset with proper structure
119
- rows = []
120
- for item in json_data:
121
- row = {key: item.get(key, "") for key in all_keys}
122
- rows.append(row)
123
-
124
- # Convert to pandas DataFrame for better control
125
- import pandas as pd
126
- df = pd.DataFrame(rows)
127
- print(df.head()) # Print first few rows for verification
128
-
129
- # Create dataset from pandas DataFrame
130
- from datasets import Dataset
131
  dataset = Dataset.from_pandas(df)
132
-
133
- # Push to Hugging Face Hub with the train split name
134
- dataset.push_to_hub(
135
- repo_id=repo_id,
136
- token=api_key,
137
- split="train",
138
- commit_message=f"Upload JSON dataset: {dataset_name}"
139
- )
140
-
141
- print(f"Successfully pushed JSON dataset with {len(json_data)} rows")
142
- is_json = True
143
-
144
  elif isinstance(json_data, dict):
145
- # Single object - convert to dataset
146
- print("Processing as single JSON object")
147
- import pandas as pd
148
  df = pd.DataFrame([json_data])
149
  dataset = Dataset.from_pandas(df)
150
-
151
- # Push to Hugging Face Hub
152
- dataset.push_to_hub(
153
- repo_id=repo_id,
154
- token=api_key,
155
- split="train",
156
- commit_message=f"Upload single JSON object: {dataset_name}"
157
- )
158
- is_json = True
159
-
160
- except json.JSONDecodeError:
161
- # Not valid JSON, will try other formats
162
- print("Not valid JSON, checking other formats...")
163
-
164
- # If not JSON, check if data is structured with pipe separators
165
- if not is_json:
166
- lines = conversation_data.strip().split('\n')
167
- is_structured = '|' in conversation_data and len(lines) > 1
168
 
169
- if is_structured:
170
- print("Detected pipe-separated structured data")
171
-
172
- # Parse the header row for column names
173
- header = lines[0].strip()
174
- headers = [col.strip() for col in header.split('|')]
175
 
176
- # Create structured data
177
- import pandas as pd
178
- rows = []
179
 
180
- # Process each data row
181
  for i, line in enumerate(lines[1:], 1):
182
  if not line.strip():
183
  continue
184
-
185
  values = [val.strip() for val in line.split('|')]
186
-
187
- # Ensure we have the right number of values
188
  if len(values) == len(headers):
189
- row = {headers[j]: values[j] for j in range(len(headers))}
190
- rows.append(row)
191
  else:
192
- print(f"Warning: Skipping row {i} due to mismatch in column count")
193
 
194
- # Create dataset from pandas DataFrame
195
- df = pd.DataFrame(rows)
196
- dataset = Dataset.from_pandas(df)
197
-
198
- # Push to Hugging Face Hub
199
- dataset.push_to_hub(
200
- repo_id=repo_id,
201
- token=api_key,
202
- split="train",
203
- commit_message=f"Upload structured data: {dataset_name}"
204
- )
205
-
206
- print(f"Successfully pushed structured dataset with {len(rows)} rows")
207
  else:
208
- # Handle as regular text data (single row)
209
- print("Processing as regular text data")
210
  dataset = Dataset.from_dict({"text": [conversation_data]})
211
-
212
- # Push to Hugging Face Hub
213
- dataset.push_to_hub(
214
- repo_id=repo_id,
215
- token=api_key,
216
- split="train",
217
- commit_message=f"Upload text data: {dataset_name}"
218
- )
219
 
220
- # Generate the URL for the dataset
 
 
 
 
 
 
 
221
  dataset_url = f"https://huggingface.co/datasets/{repo_id}"
222
  print(f"Dataset successfully pushed to: {dataset_url}")
223
 
@@ -230,11 +173,16 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
230
 
231
  @tool
232
  def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
233
- """A tool that posts a new dataset of the current conversation to Hugging Face.
234
 
235
  Args:
236
  dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
237
- conversation_data: String content to save to the dataset
 
 
 
 
 
238
 
239
  Returns:
240
  Link to the created dataset or error message with troubleshooting steps
 
69
  URL of the created dataset or error message
70
  """
71
  try:
72
+ # Required imports
73
+ import json
74
+ import pandas as pd
75
+ from datasets import Dataset
76
+ from huggingface_hub import HfApi
77
+
78
  # Get API key from environment variables
79
  api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
80
  if not api_key:
 
92
 
93
  print(f"Creating dataset: {repo_id}")
94
 
95
+ # First ensure the repository exists
96
  try:
97
  repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
98
  if not repo_exists:
 
103
  except Exception as e:
104
  print(f"Note when checking/creating repository: {str(e)}")
105
 
106
+ # Process the data based on format
 
107
  try:
108
+ # Try parsing as JSON first
109
  json_data = json.loads(conversation_data)
110
 
111
+ if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
112
+ # Process JSON array of objects (preferred format)
113
+ print(f"Processing JSON array with {len(json_data)} items")
114
+ df = pd.DataFrame(json_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  dataset = Dataset.from_pandas(df)
 
 
 
 
 
 
 
 
 
 
 
 
116
  elif isinstance(json_data, dict):
117
+ # Single JSON object
118
+ print("Processing single JSON object")
 
119
  df = pd.DataFrame([json_data])
120
  dataset = Dataset.from_pandas(df)
121
+ else:
122
+ raise ValueError("JSON format not recognized as array of objects or single object")
123
+ except (json.JSONDecodeError, ValueError) as e:
124
+ # Not valid JSON or not in expected format
125
+ print(f"Not processing as JSON: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
+ # Check if pipe-separated format
128
+ lines = conversation_data.strip().split('\n')
129
+ if '|' in conversation_data and len(lines) > 1:
130
+ print("Processing as pipe-separated data")
 
 
131
 
132
+ # Parse headers and data rows
133
+ headers = [h.strip() for h in lines[0].split('|')]
134
+ data = []
135
 
 
136
  for i, line in enumerate(lines[1:], 1):
137
  if not line.strip():
138
  continue
 
139
  values = [val.strip() for val in line.split('|')]
 
 
140
  if len(values) == len(headers):
141
+ data.append(dict(zip(headers, values)))
 
142
  else:
143
+ print(f"Warning: Skipping row {i} (column count mismatch)")
144
 
145
+ if data:
146
+ df = pd.DataFrame(data)
147
+ dataset = Dataset.from_pandas(df)
148
+ else:
149
+ # Fallback to text if no valid rows
150
+ dataset = Dataset.from_dict({"text": [conversation_data]})
 
 
 
 
 
 
 
151
  else:
152
+ # Plain text
153
+ print("Processing as plain text")
154
  dataset = Dataset.from_dict({"text": [conversation_data]})
 
 
 
 
 
 
 
 
155
 
156
+ # Push to Hugging Face Hub
157
+ print(f"Pushing dataset to {repo_id}")
158
+ dataset.push_to_hub(
159
+ repo_id=repo_id,
160
+ token=api_key,
161
+ split="train"
162
+ )
163
+
164
  dataset_url = f"https://huggingface.co/datasets/{repo_id}"
165
  print(f"Dataset successfully pushed to: {dataset_url}")
166
 
 
173
 
174
  @tool
175
  def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
176
+ """A tool that creates and pushes a dataset to Hugging Face.
177
 
178
  Args:
179
  dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
180
+ conversation_data: Data content to save in the dataset. Can be formatted in three ways:
181
+ 1. JSON array of objects - Each object becomes a row in the dataset with its properties as columns:
182
+ Example: [{"name": "Product A", "brand": "Company X"}, {"name": "Product B", "brand": "Company Y"}]
183
+ 2. Pipe-separated values - First row as headers, subsequent rows as values:
184
+ Example: "name | brand\nProduct A | Company X\nProduct B | Company Y"
185
+ 3. Plain text - Will be stored in a single 'text' column
186
 
187
  Returns:
188
  Link to the created dataset or error message with troubleshooting steps