SolshineMisfit committed on
Commit
8010a87
·
verified ·
1 Parent(s): 116bda5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -9
app.py CHANGED
@@ -123,13 +123,36 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
123
  # Process based on data structure
124
  if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
125
  log_text += f"Processing JSON array with {len(json_data)} items\n"
126
- df = pd.DataFrame(json_data)
127
- ds = Dataset.from_pandas(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  created_ds = DatasetDict({"train": ds})
129
  elif isinstance(json_data, dict):
130
  log_text += "Processing single JSON object\n"
131
- df = pd.DataFrame([json_data])
132
- ds = Dataset.from_pandas(df)
 
133
  created_ds = DatasetDict({"train": ds})
134
  else:
135
  raise ValueError("JSON not recognized as array or single object")
@@ -142,20 +165,29 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
142
  if '|' in conversation_data and len(lines) > 1:
143
  log_text += "Processing as pipe-separated data\n"
144
  headers = [h.strip() for h in lines[0].split('|')]
145
- data = []
 
 
 
 
 
146
  for i, line in enumerate(lines[1:], 1):
147
  if not line.strip():
148
  continue
149
  values = [val.strip() for val in line.split('|')]
150
  if len(values) == len(headers):
151
- data.append(dict(zip(headers, values)))
 
152
  else:
153
  log_text += f"Warning: Skipping row {i} (column count mismatch)\n"
154
- if data:
155
- df = pd.DataFrame(data)
156
- ds = Dataset.from_pandas(df)
 
 
157
  created_ds = DatasetDict({"train": ds})
158
  else:
 
159
  created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
160
  else:
161
  # Fallback for plain text
 
123
  # Process based on data structure
124
  if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
125
  log_text += f"Processing JSON array with {len(json_data)} items\n"
126
+
127
+ # Create a dataset with columns for all keys in the first item
128
+ # This ensures the dataset structure is consistent
129
+ first_item = json_data[0]
130
+ columns = list(first_item.keys())
131
+ log_text += f"Detected columns: {columns}\n"
132
+
133
+ # Initialize data dictionary with empty lists for each column
134
+ data_dict = {col: [] for col in columns}
135
+
136
+ # Process each item
137
+ for item in json_data:
138
+ for col in columns:
139
+ # Get the value for this column, or empty string if missing
140
+ value = item.get(col, "")
141
+ data_dict[col].append(value)
142
+
143
+ # Debug output to verify data structure
144
+ for col in columns:
145
+ log_text += f"Column '{col}' has {len(data_dict[col])} entries\n"
146
+
147
+ # Create dataset from dictionary
148
+ ds = Dataset.from_dict(data_dict)
149
+ log_text += f"Created dataset with {len(ds)} rows\n"
150
  created_ds = DatasetDict({"train": ds})
151
  elif isinstance(json_data, dict):
152
  log_text += "Processing single JSON object\n"
153
+ # For a single object, create a dataset with one row
154
+ data_dict = {k: [v] for k, v in json_data.items()}
155
+ ds = Dataset.from_dict(data_dict)
156
  created_ds = DatasetDict({"train": ds})
157
  else:
158
  raise ValueError("JSON not recognized as array or single object")
 
165
  if '|' in conversation_data and len(lines) > 1:
166
  log_text += "Processing as pipe-separated data\n"
167
  headers = [h.strip() for h in lines[0].split('|')]
168
+ log_text += f"Detected headers: {headers}\n"
169
+
170
+ # Initialize data dictionary
171
+ data_dict = {header: [] for header in headers}
172
+
173
+ # Process each data row
174
  for i, line in enumerate(lines[1:], 1):
175
  if not line.strip():
176
  continue
177
  values = [val.strip() for val in line.split('|')]
178
  if len(values) == len(headers):
179
+ for j, header in enumerate(headers):
180
+ data_dict[header].append(values[j])
181
  else:
182
  log_text += f"Warning: Skipping row {i} (column count mismatch)\n"
183
+
184
+ # Create dataset from dictionary
185
+ if all(len(values) > 0 for values in data_dict.values()):
186
+ ds = Dataset.from_dict(data_dict)
187
+ log_text += f"Created dataset with {len(ds)} rows\n"
188
  created_ds = DatasetDict({"train": ds})
189
  else:
190
+ log_text += "No valid rows found in pipe-separated data\n"
191
  created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
192
  else:
193
  # Fallback for plain text