Spaces:
Runtime error
Runtime error
Push dataset to hub now to handle data restructuring and upload in chunks
Uses pandas DataFrames for better data handling before creating datasets
Adds better debug logging to see what's happening with the data
Pushes with a "train" split name, which is standard practice on Hugging Face
Handles JSON arrays, single JSON objects, structured data, and plain text properly
Creates the repository first, then pushes data to it
Provides more detailed feedback about the process
app.py
CHANGED
|
@@ -86,6 +86,17 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 86 |
|
| 87 |
print(f"Creating dataset: {repo_id}")
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
# Check if data is JSON first (preferred format)
|
| 90 |
is_json = False
|
| 91 |
try:
|
|
@@ -100,20 +111,52 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 100 |
all_keys = set()
|
| 101 |
for item in json_data:
|
| 102 |
all_keys.update(item.keys())
|
|
|
|
| 103 |
|
| 104 |
-
|
| 105 |
-
data_dict = {key: [] for key in all_keys}
|
| 106 |
|
| 107 |
-
#
|
|
|
|
| 108 |
for item in json_data:
|
| 109 |
-
for key in all_keys
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
#
|
| 114 |
-
dataset
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
is_json = True
|
|
|
|
| 117 |
except json.JSONDecodeError:
|
| 118 |
# Not valid JSON, will try other formats
|
| 119 |
print("Not valid JSON, checking other formats...")
|
|
@@ -130,11 +173,12 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 130 |
header = lines[0].strip()
|
| 131 |
headers = [col.strip() for col in header.split('|')]
|
| 132 |
|
| 133 |
-
# Create
|
| 134 |
-
|
|
|
|
| 135 |
|
| 136 |
# Process each data row
|
| 137 |
-
for i, line in enumerate(lines[1:]):
|
| 138 |
if not line.strip():
|
| 139 |
continue
|
| 140 |
|
|
@@ -142,37 +186,36 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 142 |
|
| 143 |
# Ensure we have the right number of values
|
| 144 |
if len(values) == len(headers):
|
| 145 |
-
for j
|
| 146 |
-
|
| 147 |
else:
|
| 148 |
-
print(f"Warning: Skipping row {i
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
#
|
| 151 |
-
dataset
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
else:
|
| 154 |
# Handle as regular text data (single row)
|
| 155 |
print("Processing as regular text data")
|
| 156 |
dataset = Dataset.from_dict({"text": [conversation_data]})
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
print(f"Repository already exists: {repo_id}")
|
| 166 |
-
except Exception as e:
|
| 167 |
-
print(f"Note when checking/creating repository: {str(e)}")
|
| 168 |
-
|
| 169 |
-
# Push to Hugging Face Hub with simplified parameters
|
| 170 |
-
print(f"Pushing dataset to {repo_id}")
|
| 171 |
-
dataset.push_to_hub(
|
| 172 |
-
repo_id=repo_id,
|
| 173 |
-
token=api_key,
|
| 174 |
-
commit_message=f"Upload dataset: {dataset_name}"
|
| 175 |
-
)
|
| 176 |
|
| 177 |
# Generate the URL for the dataset
|
| 178 |
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|
|
|
|
| 86 |
|
| 87 |
print(f"Creating dataset: {repo_id}")
|
| 88 |
|
| 89 |
+
# Check if the repository exists or create it
|
| 90 |
+
try:
|
| 91 |
+
repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
|
| 92 |
+
if not repo_exists:
|
| 93 |
+
hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
|
| 94 |
+
print(f"Created repository: {repo_id}")
|
| 95 |
+
else:
|
| 96 |
+
print(f"Repository already exists: {repo_id}")
|
| 97 |
+
except Exception as e:
|
| 98 |
+
print(f"Note when checking/creating repository: {str(e)}")
|
| 99 |
+
|
| 100 |
# Check if data is JSON first (preferred format)
|
| 101 |
is_json = False
|
| 102 |
try:
|
|
|
|
| 111 |
all_keys = set()
|
| 112 |
for item in json_data:
|
| 113 |
all_keys.update(item.keys())
|
| 114 |
+
all_keys = sorted(list(all_keys)) # Sort keys for consistent order
|
| 115 |
|
| 116 |
+
print(f"Detected columns: {', '.join(all_keys)}")
|
|
|
|
| 117 |
|
| 118 |
+
# Create dataset with proper structure
|
| 119 |
+
rows = []
|
| 120 |
for item in json_data:
|
| 121 |
+
row = {key: item.get(key, "") for key in all_keys}
|
| 122 |
+
rows.append(row)
|
| 123 |
+
|
| 124 |
+
# Convert to pandas DataFrame for better control
|
| 125 |
+
import pandas as pd
|
| 126 |
+
df = pd.DataFrame(rows)
|
| 127 |
+
print(df.head()) # Print first few rows for verification
|
| 128 |
+
|
| 129 |
+
# Create dataset from pandas DataFrame
|
| 130 |
+
from datasets import Dataset
|
| 131 |
+
dataset = Dataset.from_pandas(df)
|
| 132 |
+
|
| 133 |
+
# Push to Hugging Face Hub with the train split name
|
| 134 |
+
dataset.push_to_hub(
|
| 135 |
+
repo_id=repo_id,
|
| 136 |
+
token=api_key,
|
| 137 |
+
split="train",
|
| 138 |
+
commit_message=f"Upload JSON dataset: {dataset_name}"
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
print(f"Successfully pushed JSON dataset with {len(json_data)} rows")
|
| 142 |
+
is_json = True
|
| 143 |
+
|
| 144 |
+
elif isinstance(json_data, dict):
|
| 145 |
+
# Single object - convert to dataset
|
| 146 |
+
print("Processing as single JSON object")
|
| 147 |
+
import pandas as pd
|
| 148 |
+
df = pd.DataFrame([json_data])
|
| 149 |
+
dataset = Dataset.from_pandas(df)
|
| 150 |
|
| 151 |
+
# Push to Hugging Face Hub
|
| 152 |
+
dataset.push_to_hub(
|
| 153 |
+
repo_id=repo_id,
|
| 154 |
+
token=api_key,
|
| 155 |
+
split="train",
|
| 156 |
+
commit_message=f"Upload single JSON object: {dataset_name}"
|
| 157 |
+
)
|
| 158 |
is_json = True
|
| 159 |
+
|
| 160 |
except json.JSONDecodeError:
|
| 161 |
# Not valid JSON, will try other formats
|
| 162 |
print("Not valid JSON, checking other formats...")
|
|
|
|
| 173 |
header = lines[0].strip()
|
| 174 |
headers = [col.strip() for col in header.split('|')]
|
| 175 |
|
| 176 |
+
# Create structured data
|
| 177 |
+
import pandas as pd
|
| 178 |
+
rows = []
|
| 179 |
|
| 180 |
# Process each data row
|
| 181 |
+
for i, line in enumerate(lines[1:], 1):
|
| 182 |
if not line.strip():
|
| 183 |
continue
|
| 184 |
|
|
|
|
| 186 |
|
| 187 |
# Ensure we have the right number of values
|
| 188 |
if len(values) == len(headers):
|
| 189 |
+
row = {headers[j]: values[j] for j in range(len(headers))}
|
| 190 |
+
rows.append(row)
|
| 191 |
else:
|
| 192 |
+
print(f"Warning: Skipping row {i} due to mismatch in column count")
|
| 193 |
+
|
| 194 |
+
# Create dataset from pandas DataFrame
|
| 195 |
+
df = pd.DataFrame(rows)
|
| 196 |
+
dataset = Dataset.from_pandas(df)
|
| 197 |
|
| 198 |
+
# Push to Hugging Face Hub
|
| 199 |
+
dataset.push_to_hub(
|
| 200 |
+
repo_id=repo_id,
|
| 201 |
+
token=api_key,
|
| 202 |
+
split="train",
|
| 203 |
+
commit_message=f"Upload structured data: {dataset_name}"
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
print(f"Successfully pushed structured dataset with {len(rows)} rows")
|
| 207 |
else:
|
| 208 |
# Handle as regular text data (single row)
|
| 209 |
print("Processing as regular text data")
|
| 210 |
dataset = Dataset.from_dict({"text": [conversation_data]})
|
| 211 |
+
|
| 212 |
+
# Push to Hugging Face Hub
|
| 213 |
+
dataset.push_to_hub(
|
| 214 |
+
repo_id=repo_id,
|
| 215 |
+
token=api_key,
|
| 216 |
+
split="train",
|
| 217 |
+
commit_message=f"Upload text data: {dataset_name}"
|
| 218 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
# Generate the URL for the dataset
|
| 221 |
dataset_url = f"https://huggingface.co/datasets/{repo_id}"
|