SolshineMisfit committed on
Commit
dfafa93
·
verified ·
1 Parent(s): 40e5f48

Push dataset to hub now to handle data restructure and upload in chunks

Browse files

Uses pandas DataFrames for better data handling before creating datasets
Adds better debug logging to see what's happening with the data
Pushes with a "train" split name which is standard practice on Hugging Face
Handles JSON arrays, single JSON objects, structured data, and plain text properly
Creates the repository first, then pushes data to it
Provides more detailed feedback about the process

Files changed (1) hide show
  1. app.py +80 -37
app.py CHANGED
@@ -86,6 +86,17 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
86
 
87
  print(f"Creating dataset: {repo_id}")
88
 
 
 
 
 
 
 
 
 
 
 
 
89
  # Check if data is JSON first (preferred format)
90
  is_json = False
91
  try:
@@ -100,20 +111,52 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
100
  all_keys = set()
101
  for item in json_data:
102
  all_keys.update(item.keys())
 
103
 
104
- # Initialize the data dictionary with empty lists for each key
105
- data_dict = {key: [] for key in all_keys}
106
 
107
- # Process each item in the array
 
108
  for item in json_data:
109
- for key in all_keys:
110
- # Use the value if present, otherwise empty string
111
- data_dict[key].append(item.get(key, ""))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- # Create dataset from JSON data
114
- dataset = Dataset.from_dict(data_dict)
115
- print(f"Created dataset with {len(json_data)} rows and {len(all_keys)} columns")
 
 
 
 
116
  is_json = True
 
117
  except json.JSONDecodeError:
118
  # Not valid JSON, will try other formats
119
  print("Not valid JSON, checking other formats...")
@@ -130,11 +173,12 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
130
  header = lines[0].strip()
131
  headers = [col.strip() for col in header.split('|')]
132
 
133
- # Create dataset dict for structured data
134
- data_dict = {header: [] for header in headers}
 
135
 
136
  # Process each data row
137
- for i, line in enumerate(lines[1:]):
138
  if not line.strip():
139
  continue
140
 
@@ -142,37 +186,36 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
142
 
143
  # Ensure we have the right number of values
144
  if len(values) == len(headers):
145
- for j, header in enumerate(headers):
146
- data_dict[header].append(values[j])
147
  else:
148
- print(f"Warning: Skipping row {i+1} due to mismatch in column count")
 
 
 
 
149
 
150
- # Create dataset from structured data
151
- dataset = Dataset.from_dict(data_dict)
152
- print(f"Created structured dataset with {len(data_dict[headers[0]])} rows and {len(headers)} columns")
 
 
 
 
 
 
153
  else:
154
  # Handle as regular text data (single row)
155
  print("Processing as regular text data")
156
  dataset = Dataset.from_dict({"text": [conversation_data]})
157
-
158
- # First ensure the repository exists
159
- try:
160
- repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
161
- if not repo_exists:
162
- hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
163
- print(f"Created repository: {repo_id}")
164
- else:
165
- print(f"Repository already exists: {repo_id}")
166
- except Exception as e:
167
- print(f"Note when checking/creating repository: {str(e)}")
168
-
169
- # Push to Hugging Face Hub with simplified parameters
170
- print(f"Pushing dataset to {repo_id}")
171
- dataset.push_to_hub(
172
- repo_id=repo_id,
173
- token=api_key,
174
- commit_message=f"Upload dataset: {dataset_name}"
175
- )
176
 
177
  # Generate the URL for the dataset
178
  dataset_url = f"https://huggingface.co/datasets/{repo_id}"
 
86
 
87
  print(f"Creating dataset: {repo_id}")
88
 
89
+ # Check if the repository exists or create it
90
+ try:
91
+ repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
92
+ if not repo_exists:
93
+ hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
94
+ print(f"Created repository: {repo_id}")
95
+ else:
96
+ print(f"Repository already exists: {repo_id}")
97
+ except Exception as e:
98
+ print(f"Note when checking/creating repository: {str(e)}")
99
+
100
  # Check if data is JSON first (preferred format)
101
  is_json = False
102
  try:
 
111
  all_keys = set()
112
  for item in json_data:
113
  all_keys.update(item.keys())
114
+ all_keys = sorted(list(all_keys)) # Sort keys for consistent order
115
 
116
+ print(f"Detected columns: {', '.join(all_keys)}")
 
117
 
118
+ # Create dataset with proper structure
119
+ rows = []
120
  for item in json_data:
121
+ row = {key: item.get(key, "") for key in all_keys}
122
+ rows.append(row)
123
+
124
+ # Convert to pandas DataFrame for better control
125
+ import pandas as pd
126
+ df = pd.DataFrame(rows)
127
+ print(df.head()) # Print first few rows for verification
128
+
129
+ # Create dataset from pandas DataFrame
130
+ from datasets import Dataset
131
+ dataset = Dataset.from_pandas(df)
132
+
133
+ # Push to Hugging Face Hub with the train split name
134
+ dataset.push_to_hub(
135
+ repo_id=repo_id,
136
+ token=api_key,
137
+ split="train",
138
+ commit_message=f"Upload JSON dataset: {dataset_name}"
139
+ )
140
+
141
+ print(f"Successfully pushed JSON dataset with {len(json_data)} rows")
142
+ is_json = True
143
+
144
+ elif isinstance(json_data, dict):
145
+ # Single object - convert to dataset
146
+ print("Processing as single JSON object")
147
+ import pandas as pd
148
+ df = pd.DataFrame([json_data])
149
+ dataset = Dataset.from_pandas(df)
150
 
151
+ # Push to Hugging Face Hub
152
+ dataset.push_to_hub(
153
+ repo_id=repo_id,
154
+ token=api_key,
155
+ split="train",
156
+ commit_message=f"Upload single JSON object: {dataset_name}"
157
+ )
158
  is_json = True
159
+
160
  except json.JSONDecodeError:
161
  # Not valid JSON, will try other formats
162
  print("Not valid JSON, checking other formats...")
 
173
  header = lines[0].strip()
174
  headers = [col.strip() for col in header.split('|')]
175
 
176
+ # Create structured data
177
+ import pandas as pd
178
+ rows = []
179
 
180
  # Process each data row
181
+ for i, line in enumerate(lines[1:], 1):
182
  if not line.strip():
183
  continue
184
 
 
186
 
187
  # Ensure we have the right number of values
188
  if len(values) == len(headers):
189
+ row = {headers[j]: values[j] for j in range(len(headers))}
190
+ rows.append(row)
191
  else:
192
+ print(f"Warning: Skipping row {i} due to mismatch in column count")
193
+
194
+ # Create dataset from pandas DataFrame
195
+ df = pd.DataFrame(rows)
196
+ dataset = Dataset.from_pandas(df)
197
 
198
+ # Push to Hugging Face Hub
199
+ dataset.push_to_hub(
200
+ repo_id=repo_id,
201
+ token=api_key,
202
+ split="train",
203
+ commit_message=f"Upload structured data: {dataset_name}"
204
+ )
205
+
206
+ print(f"Successfully pushed structured dataset with {len(rows)} rows")
207
  else:
208
  # Handle as regular text data (single row)
209
  print("Processing as regular text data")
210
  dataset = Dataset.from_dict({"text": [conversation_data]})
211
+
212
+ # Push to Hugging Face Hub
213
+ dataset.push_to_hub(
214
+ repo_id=repo_id,
215
+ token=api_key,
216
+ split="train",
217
+ commit_message=f"Upload text data: {dataset_name}"
218
+ )
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  # Generate the URL for the dataset
221
  dataset_url = f"https://huggingface.co/datasets/{repo_id}"