SolshineMisfit committed (verified)
Commit 40e5f48 · 1 Parent(s): bdb213e

Update app.py

Files changed (1): app.py (+83 -49)

app.py CHANGED
@@ -60,8 +60,10 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
 
     Args:
         dataset_name: Name for the dataset (will be prefixed with username)
-        conversation_data: String representing the conversation data, can be structured
-            with pipe-separated values (col1 | col2 | col3) for tabular data
+        conversation_data: String representing the conversation data. Can be:
+            - JSON array of objects (each object becomes a row)
+            - Pipe-separated values (col1 | col2 | col3) for tabular data
+            - Plain text (stored in a 'text' column)
 
     Returns:
         URL of the created dataset or error message
@@ -84,60 +86,92 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
 
     print(f"Creating dataset: {repo_id}")
 
-    # Check if data is structured (contains pipe separators and multiple lines)
-    lines = conversation_data.strip().split('\n')
-    is_structured = '|' in conversation_data and len(lines) > 1
-
-    if is_structured:
-        print("Detected structured data with multiple rows")
-
-        # Parse the header row for column names
-        header = lines[0].strip()
-        headers = [col.strip() for col in header.split('|')]
-
-        # Parse the data rows
-        data_dict = {header: [] for header in headers}
-
-        # Add a timestamp and id column
-        data_dict['timestamp'] = []
-        data_dict['id'] = []
-
-        # Process each data row
-        for i, line in enumerate(lines[1:]):
-            if not line.strip():
-                continue
-
-            values = [val.strip() for val in line.split('|')]
-
-            # Ensure we have the right number of values
-            if len(values) == len(headers):
-                for j, header in enumerate(headers):
-                    data_dict[header].append(values[j])
-
-                # Add timestamp and ID for each row
-                data_dict['timestamp'].append(datetime.datetime.now().isoformat())
-                data_dict['id'].append(str(uuid.uuid4()))
-            else:
-                print(f"Warning: Skipping row {i+1} due to mismatch in column count")
-
-        # Create dataset from structured data
-        dataset = Dataset.from_dict(data_dict)
-        print(f"Created structured dataset with {len(data_dict[headers[0]])} rows and {len(data_dict)} columns")
-    else:
-        # Handle as regular text data (single row)
-        print("Processing as regular text data")
-        data = {
-            "text": [conversation_data],
-            "timestamp": [datetime.datetime.now().isoformat()],
-            "id": [str(uuid.uuid4())]
-        }
-        dataset = Dataset.from_dict(data)
-
-    # Push to Hugging Face Hub
+    # Check if data is JSON first (preferred format)
+    is_json = False
+    try:
+        # Try to parse as JSON
+        json_data = json.loads(conversation_data)
+
+        # Check if it's an array of objects (preferred structure)
+        if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data) and len(json_data) > 0:
+            print(f"Processing as JSON array with {len(json_data)} items")
+
+            # Extract all keys to ensure consistent columns
+            all_keys = set()
+            for item in json_data:
+                all_keys.update(item.keys())
+
+            # Initialize the data dictionary with empty lists for each key
+            data_dict = {key: [] for key in all_keys}
+
+            # Process each item in the array
+            for item in json_data:
+                for key in all_keys:
+                    # Use the value if present, otherwise empty string
+                    data_dict[key].append(item.get(key, ""))
+
+            # Create dataset from JSON data
+            dataset = Dataset.from_dict(data_dict)
+            print(f"Created dataset with {len(json_data)} rows and {len(all_keys)} columns")
+            is_json = True
+    except json.JSONDecodeError:
+        # Not valid JSON, will try other formats
+        print("Not valid JSON, checking other formats...")
+
+    # If not JSON, check if data is structured with pipe separators
+    if not is_json:
+        lines = conversation_data.strip().split('\n')
+        is_structured = '|' in conversation_data and len(lines) > 1
+
+        if is_structured:
+            print("Detected pipe-separated structured data")
+
+            # Parse the header row for column names
+            header = lines[0].strip()
+            headers = [col.strip() for col in header.split('|')]
+
+            # Create dataset dict for structured data
+            data_dict = {header: [] for header in headers}
+
+            # Process each data row
+            for i, line in enumerate(lines[1:]):
+                if not line.strip():
+                    continue
+
+                values = [val.strip() for val in line.split('|')]
+
+                # Ensure we have the right number of values
+                if len(values) == len(headers):
+                    for j, header in enumerate(headers):
+                        data_dict[header].append(values[j])
+                else:
+                    print(f"Warning: Skipping row {i+1} due to mismatch in column count")
+
+            # Create dataset from structured data
+            dataset = Dataset.from_dict(data_dict)
+            print(f"Created structured dataset with {len(data_dict[headers[0]])} rows and {len(headers)} columns")
+        else:
+            # Handle as regular text data (single row)
+            print("Processing as regular text data")
+            dataset = Dataset.from_dict({"text": [conversation_data]})
+
+    # First ensure the repository exists
+    try:
+        repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
+        if not repo_exists:
+            hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
+            print(f"Created repository: {repo_id}")
+        else:
+            print(f"Repository already exists: {repo_id}")
+    except Exception as e:
+        print(f"Note when checking/creating repository: {str(e)}")
+
+    # Push to Hugging Face Hub with simplified parameters
+    print(f"Pushing dataset to {repo_id}")
     dataset.push_to_hub(
         repo_id=repo_id,
         token=api_key,
-        private=False
+        commit_message=f"Upload dataset: {dataset_name}"
     )
 
     # Generate the URL for the dataset
@@ -149,7 +183,7 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         import traceback
         error_trace = traceback.format_exc()
         print(f"Dataset creation error: {str(e)}\n{error_trace}")
-        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores\n3. Check your permissions for the Misfits-and-Machines organization"
+        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores"
 
 @tool
 def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
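
The rewritten function accepts three input formats, trying JSON first. A minimal sketch of what each conversation_data payload might look like from a caller's side (the dataset name is illustrative; only Dataset_Creator_Function itself comes from this file):

import json

# 1. JSON array of objects (preferred): each object becomes a row.
json_payload = json.dumps([
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there!"},
])

# 2. Pipe-separated values: the first line is treated as the header row.
pipe_payload = "role | content\nuser | Hello\nassistant | Hi there!"

# 3. Anything else is stored as a single row in a 'text' column.
text_payload = "Free-form transcript with no particular structure."

for payload in (json_payload, pipe_payload, text_payload):
    print(Dataset_Creator_Function("my_test_dataset", payload))  # hypothetical call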
 
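Objects in the JSON array need not share identical keys: the new branch takes the union of keys across all objects and backfills missing values with empty strings. A standalone sketch of just that normalization step, with made-up sample data:

json_data = [
    {"question": "2+2?", "answer": "4"},
    {"question": "Capital of France?", "answer": "Paris", "source": "geo"},
]

# Union of keys across all rows defines the columns.
all_keys = set()
for item in json_data:
    all_keys.update(item.keys())

# item.get(key, "") backfills rows that lack a key.
data_dict = {key: [item.get(key, "") for item in json_data] for key in all_keys}
print(data_dict)
# e.g. {'answer': ['4', 'Paris'], 'question': ['2+2?', 'Capital of France?'],
#       'source': ['', 'geo']}

Note that all_keys is a set, so the resulting column order is not deterministic; Dataset.from_dict accepts the dict either way.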
 
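The push path now creates the dataset repository explicitly before uploading (push_to_hub can also create a missing repo on its own, so the check is defensive) and sets a commit message rather than forcing private=False. A minimal standalone sketch of that flow, assuming HF_API_KEY is set in the environment and using an illustrative repo id:

import os

from datasets import Dataset
from huggingface_hub import HfApi

api_key = os.environ["HF_API_KEY"]  # same variable the error message points at
hf_api = HfApi()
repo_id = "Misfits-and-Machines/my_test_dataset"  # illustrative

# Ensure the dataset repo exists before pushing.
if not hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
    hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=api_key)

# Push a one-row dataset with an explicit commit message.
Dataset.from_dict({"text": ["hello"]}).push_to_hub(
    repo_id=repo_id,
    token=api_key,
    commit_message="Upload dataset: my_test_dataset",
)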