Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -123,13 +123,36 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 123 |
# Process based on data structure
|
| 124 |
if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
|
| 125 |
log_text += f"Processing JSON array with {len(json_data)} items\n"
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
created_ds = DatasetDict({"train": ds})
|
| 129 |
elif isinstance(json_data, dict):
|
| 130 |
log_text += "Processing single JSON object\n"
|
| 131 |
-
|
| 132 |
-
|
|
|
|
| 133 |
created_ds = DatasetDict({"train": ds})
|
| 134 |
else:
|
| 135 |
raise ValueError("JSON not recognized as array or single object")
|
|
@@ -142,20 +165,29 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
|
|
| 142 |
if '|' in conversation_data and len(lines) > 1:
|
| 143 |
log_text += "Processing as pipe-separated data\n"
|
| 144 |
headers = [h.strip() for h in lines[0].split('|')]
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
for i, line in enumerate(lines[1:], 1):
|
| 147 |
if not line.strip():
|
| 148 |
continue
|
| 149 |
values = [val.strip() for val in line.split('|')]
|
| 150 |
if len(values) == len(headers):
|
| 151 |
-
|
|
|
|
| 152 |
else:
|
| 153 |
log_text += f"Warning: Skipping row {i} (column count mismatch)\n"
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
| 157 |
created_ds = DatasetDict({"train": ds})
|
| 158 |
else:
|
|
|
|
| 159 |
created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
|
| 160 |
else:
|
| 161 |
# Fallback for plain text
|
|
|
|
| 123 |
# Process based on data structure
|
| 124 |
if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
|
| 125 |
log_text += f"Processing JSON array with {len(json_data)} items\n"
|
| 126 |
+
|
| 127 |
+
# Create a dataset with columns for all keys in the first item
|
| 128 |
+
# This ensures the dataset structure is consistent
|
| 129 |
+
first_item = json_data[0]
|
| 130 |
+
columns = list(first_item.keys())
|
| 131 |
+
log_text += f"Detected columns: {columns}\n"
|
| 132 |
+
|
| 133 |
+
# Initialize data dictionary with empty lists for each column
|
| 134 |
+
data_dict = {col: [] for col in columns}
|
| 135 |
+
|
| 136 |
+
# Process each item
|
| 137 |
+
for item in json_data:
|
| 138 |
+
for col in columns:
|
| 139 |
+
# Get the value for this column, or empty string if missing
|
| 140 |
+
value = item.get(col, "")
|
| 141 |
+
data_dict[col].append(value)
|
| 142 |
+
|
| 143 |
+
# Debug output to verify data structure
|
| 144 |
+
for col in columns:
|
| 145 |
+
log_text += f"Column '{col}' has {len(data_dict[col])} entries\n"
|
| 146 |
+
|
| 147 |
+
# Create dataset from dictionary
|
| 148 |
+
ds = Dataset.from_dict(data_dict)
|
| 149 |
+
log_text += f"Created dataset with {len(ds)} rows\n"
|
| 150 |
created_ds = DatasetDict({"train": ds})
|
| 151 |
elif isinstance(json_data, dict):
|
| 152 |
log_text += "Processing single JSON object\n"
|
| 153 |
+
# For a single object, create a dataset with one row
|
| 154 |
+
data_dict = {k: [v] for k, v in json_data.items()}
|
| 155 |
+
ds = Dataset.from_dict(data_dict)
|
| 156 |
created_ds = DatasetDict({"train": ds})
|
| 157 |
else:
|
| 158 |
raise ValueError("JSON not recognized as array or single object")
|
|
|
|
| 165 |
if '|' in conversation_data and len(lines) > 1:
|
| 166 |
log_text += "Processing as pipe-separated data\n"
|
| 167 |
headers = [h.strip() for h in lines[0].split('|')]
|
| 168 |
+
log_text += f"Detected headers: {headers}\n"
|
| 169 |
+
|
| 170 |
+
# Initialize data dictionary
|
| 171 |
+
data_dict = {header: [] for header in headers}
|
| 172 |
+
|
| 173 |
+
# Process each data row
|
| 174 |
for i, line in enumerate(lines[1:], 1):
|
| 175 |
if not line.strip():
|
| 176 |
continue
|
| 177 |
values = [val.strip() for val in line.split('|')]
|
| 178 |
if len(values) == len(headers):
|
| 179 |
+
for j, header in enumerate(headers):
|
| 180 |
+
data_dict[header].append(values[j])
|
| 181 |
else:
|
| 182 |
log_text += f"Warning: Skipping row {i} (column count mismatch)\n"
|
| 183 |
+
|
| 184 |
+
# Create dataset from dictionary
|
| 185 |
+
if all(len(values) > 0 for values in data_dict.values()):
|
| 186 |
+
ds = Dataset.from_dict(data_dict)
|
| 187 |
+
log_text += f"Created dataset with {len(ds)} rows\n"
|
| 188 |
created_ds = DatasetDict({"train": ds})
|
| 189 |
else:
|
| 190 |
+
log_text += "No valid rows found in pipe-separated data\n"
|
| 191 |
created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
|
| 192 |
else:
|
| 193 |
# Fallback for plain text
|