ALFHAzero committed on
Commit
56bfde5
·
verified ·
1 Parent(s): 4af0e1f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +765 -0
app.py ADDED
@@ -0,0 +1,765 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import json
4
+ import os
5
+ from huggingface_hub import HfApi, snapshot_download
6
+ import threading
7
+ import sys
8
+
9
+ # Add basic logging
10
def log_message(message):
    """Emit a diagnostic line, tagged with ``[APP_LOG]``, on standard error.

    Standard error is used so the output is visible in notebook (Colab) cells.
    """
    tagged_line = f"[APP_LOG] {message}"
    print(tagged_line, file=sys.stderr)
12
+
13
+ # Function to handle saving dataset
14
def save_dataset(dataset_entries, filename):
    """Save the dataset entries to a local JSONL file.

    Every dictionary entry is written as one JSON object per line (UTF-8,
    non-ASCII characters kept verbatim). Entries that are not dictionaries
    are skipped and a warning is logged.

    Args:
        dataset_entries: List of entries to serialize.
        filename: Path of the file to write; it is opened in "w" mode, so an
            existing file is overwritten.

    Returns:
        A status string describing success or the error that occurred.
    """
    log_message(f"Attempting to save dataset to local file: {filename}")
    if not dataset_entries:
        log_message("No entries in dataset_entries to save.")
        return "No entries to save."

    try:
        # Collect lines in a list instead of growing a string with `+=`.
        jsonl_lines = []
        for entry in dataset_entries:
            # Make sure the entry is a dictionary before dumping it.
            if isinstance(entry, dict):
                jsonl_lines.append(json.dumps(entry, ensure_ascii=False) + "\n")
            else:
                log_message(f"Warning: Skipping non-dictionary entry during local save: {entry}")

        with open(filename, "w", encoding="utf-8") as f:
            f.writelines(jsonl_lines)
        log_message(f"Dataset successfully saved to local file: {filename}")
        return f"Dataset saved successfully to {filename}"
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        log_message(f"Error saving local file {filename}: {e}")
        return f"Error saving file: {e}"
39
+
40
+ # Function to handle saving to Hugging Face Hub
41
def save_to_hf(dataset_entries, hf_token, hf_repo_id, hf_file_path):
    """Upload the dataset entries as a JSONL file to a Hugging Face dataset repo.

    Dictionary entries are serialized one JSON object per line; other entries
    are skipped with a logged warning. The data is staged in a uniquely named
    temporary file, which is always removed afterwards, including when the
    upload fails.

    Args:
        dataset_entries: List of entries to upload.
        hf_token: Hugging Face API token used to authenticate.
        hf_repo_id: Target repository ID.
        hf_file_path: Destination path of the file inside the repository.

    Returns:
        A status string describing success or the error that occurred.
    """
    import tempfile

    log_message(f"Attempting to save dataset to Hugging Face Hub: {hf_repo_id}/{hf_file_path}")
    if not dataset_entries:
        log_message("No dataset entries to save to Hugging Face Hub.")
        return "No dataset entries to save to Hugging Face Hub."
    if not hf_token or not hf_repo_id or not hf_file_path:
        log_message("Missing HF token, repo ID, or file path for saving.")
        return "Please provide Hugging Face API Token, Repository Name, and file path."

    temp_file_path = None
    try:
        api = HfApi(token=hf_token)
        log_message("HfApi initialized.")

        jsonl_lines = []
        for entry in dataset_entries:
            # Make sure the entry is a dictionary before dumping it.
            if isinstance(entry, dict):
                jsonl_lines.append(json.dumps(entry, ensure_ascii=False) + "\n")
            else:
                log_message(f"Warning: Skipping non-dictionary entry during HF save: {entry}")

        # A unique temp file avoids collisions between concurrent saves that
        # a fixed filename in the working directory would cause.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
        ) as f:
            temp_file_path = f.name
            log_message(f"Saving to temporary file for upload: {temp_file_path}")
            f.writelines(jsonl_lines)
        log_message("Temporary file created.")

        log_message(f"Uploading file to HF Hub: repo_id={hf_repo_id}, path_in_repo={hf_file_path}")
        upload_info = api.upload_file(
            path_or_fileobj=temp_file_path,
            path_in_repo=hf_file_path,
            repo_id=hf_repo_id,
            repo_type="dataset",  # Must match the repo type used when loading
            commit_message="Add or update dataset via Gradio app",
        )
        log_message(f"Upload successful. Info: {upload_info}")

        # NOTE(review): the return type of upload_file differs between
        # huggingface_hub versions; fall back gracefully if `.url` is absent
        # so a successful upload is never reported as a failure.
        uploaded_url = (
            getattr(upload_info, "url", None)
            or getattr(upload_info, "commit_url", None)
            or str(upload_info)
        )
        return f"Dataset saved successfully to Hugging Face Hub: {uploaded_url}"

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        log_message(f"HF Save Error: {e}")
        error_text = str(e)
        if "Repository not found" in error_text:
            return f"Error: Repository '{hf_repo_id}' not found. Please check the repository ID. Original error: {e}"
        if "Authentication required" in error_text or "Invalid token" in error_text:
            return f"Error: Authentication failed. Please check your Hugging Face API Token or ensure the repository is public. Original error: {e}"
        return f"Error saving to Hugging Face Hub: {e}"

    finally:
        # Clean up the temporary file on success and on failure alike.
        if temp_file_path is not None and os.path.exists(temp_file_path):
            log_message(f"Removing temporary file: {temp_file_path}")
            try:
                os.remove(temp_file_path)
            except OSError as cleanup_error:
                log_message(f"Could not remove temporary file {temp_file_path}: {cleanup_error}")
100
+
101
+ # Function to handle loading dataset from a file
102
def load_dataset_from_file(file_obj, local_file_path):
    """Load dataset entries from an uploaded file or a local JSONL path.

    The upload takes priority over ``local_file_path``. Each non-empty line is
    parsed as JSON; lines that fail to parse are logged and skipped.

    Args:
        file_obj: Uploaded file. Either an object with ``read()`` (returning
            bytes or text) or a filesystem path string.
        local_file_path: Path to a local JSONL file, used when no upload is
            given.

    Returns:
        A 4-tuple ``(entries, index, status_message, filename)``. On failure
        ``entries`` is empty and ``filename`` is ``""``; ``index`` is always 0.
    """
    log_message("Attempting to load dataset from uploaded file or local path.")
    log_message(f"Received file_obj type: {type(file_obj)}")
    log_message(f"Received local_file_path type: {type(local_file_path)}")
    log_message(f"Received local_file_path value: {local_file_path}")

    loaded_entries = []
    filename = ""

    try:
        if file_obj is not None and hasattr(file_obj, 'read'):
            # File-like upload. Not every file object has a `.name`.
            source_name = str(getattr(file_obj, "name", ""))
            log_message(f"Loading from uploaded file object: {source_name}")
            raw_data = file_obj.read()
            # read() yields bytes in binary mode and str in text mode.
            if isinstance(raw_data, (bytes, bytearray)):
                jsonl_data = raw_data.decode("utf-8")
            else:
                jsonl_data = raw_data
            filename = os.path.basename(source_name)
            log_message(f"Read {len(jsonl_data)} characters from uploaded file object: {filename}")
        else:
            # NOTE(review): newer Gradio File components appear to pass the
            # upload as a path string rather than a file object; confirm
            # against the installed Gradio version.
            if isinstance(file_obj, str) and file_obj.strip():
                file_path = file_obj.strip()
            elif isinstance(local_file_path, str) and local_file_path.strip():
                file_path = local_file_path.strip()
            else:
                log_message("No file uploaded or local path provided.")
                return [], 0, "Please upload a JSONL file or provide a local path.", ""

            log_message(f"Loading from local file path: {file_path}")
            if not os.path.exists(file_path):
                log_message(f"Local file not found: {file_path}")
                return [], 0, f"Error loading file: Local file not found at {file_path}", ""

            with open(file_path, "r", encoding="utf-8") as f:
                jsonl_data = f.read()
            filename = os.path.basename(file_path)
            log_message(f"Read {len(jsonl_data)} characters from local file path: {filename}")

        # Split on "\n" only: str.splitlines() would also break on characters
        # such as U+2028 that may legitimately occur inside JSON strings.
        for line_number, line in enumerate(jsonl_data.strip().split('\n'), start=1):
            stripped_line = line.strip()
            if not stripped_line:
                log_message(f"Skipping empty line {line_number} in uploaded file.")
                continue
            try:
                loaded_entries.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Keep processing the remaining lines even if one is invalid.
                log_message(f"Error decoding JSON on line {line_number}: {stripped_line} - {e}")

        log_message(f"Successfully loaded {len(loaded_entries)} entries from file: {filename}")
        return loaded_entries, 0, f"Successfully loaded {len(loaded_entries)} entries.", filename

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        log_message(f"Error loading file: {e}")
        return [], 0, f"Error loading file: {e}", ""
155
+
156
+ # Function to handle loading from Hugging Face Hub
157
def load_from_hf(hf_token, hf_repo_id, hf_file_path):
    """Load dataset entries from a JSONL file in a Hugging Face dataset repo.

    Each non-empty line of the downloaded file is parsed as JSON; lines that
    fail to parse are logged and skipped.

    Args:
        hf_token: Hugging Face API token; a falsy value means an anonymous
            download is attempted.
        hf_repo_id: Repository ID to download from.
        hf_file_path: Path of the JSONL file inside the repository.

    Returns:
        A 4-tuple ``(entries, index, status_message, filename)``. On failure
        ``entries`` is empty and ``filename`` is ``""``; ``index`` is always 0.
    """
    log_message(f"Attempting to load dataset from Hugging Face Hub: {hf_repo_id}/{hf_file_path}")
    if not hf_repo_id or not hf_file_path:
        log_message("Missing HF repo ID or file path for loading.")
        return [], 0, "Please provide Hugging Face Repository ID and file path.", ""

    loaded_entries = []
    try:
        log_message(f"Downloading file from HF Hub: repo_id={hf_repo_id}, path_in_repo={hf_file_path}")
        # repo_type must be "dataset" to match where save_to_hf uploads;
        # without it the Hub is searched for a model repository instead.
        downloaded_folder = snapshot_download(
            repo_id=hf_repo_id,
            repo_type="dataset",
            allow_patterns=hf_file_path,
            token=hf_token if hf_token else None,
        )
        downloaded_file_path = os.path.join(downloaded_folder, hf_file_path)
        log_message(f"File downloaded to temporary path: {downloaded_file_path}")

        if not os.path.exists(downloaded_file_path):
            # The pattern matched nothing, so the file is not in the repo.
            log_message(f"Downloaded file not found at expected path: {downloaded_file_path}")
            return [], 0, f"Error: File '{hf_file_path}' not found in repository '{hf_repo_id}'. Please check the file path.", ""

        with open(downloaded_file_path, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                stripped_line = line.strip()
                if not stripped_line:
                    log_message(f"Skipping empty line {line_number} in HF file.")
                    continue
                try:
                    loaded_entries.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Keep processing the remaining lines even if one is invalid.
                    log_message(f"Error decoding JSON on line {line_number} in HF file: {stripped_line} - {e}")

        # The base name is returned so the caller can reuse it when saving.
        filename_for_save = os.path.basename(hf_file_path)
        log_message(f"Successfully loaded {len(loaded_entries)} entries from Hugging Face Hub file: {filename_for_save}")
        return loaded_entries, 0, f"Successfully loaded {len(loaded_entries)} entri dari Hugging Face Hub.", filename_for_save

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        log_message(f"HF Load Error: {e}")
        error_text = str(e)
        if "Repository not found" in error_text:
            error_message = f"Error: Repository '{hf_repo_id}' not found. Please check the repository ID. Original error: {e}"
        elif "Authentication required" in error_text or "Invalid token" in error_text:
            error_message = f"Error: Authentication failed. Please check your Hugging Face API Token or ensure the repository is public. Original error: {e}"
        elif "allow_patterns" in error_text:
            error_message = f"Error: File path '{hf_file_path}' not found in repository '{hf_repo_id}' or pattern matching failed. Original error: {e}"
        else:
            error_message = f"Error loading from Hugging Face Hub: {e}"
        return [], 0, error_message, ""
214
+
215
+
216
+ # Function to add a user/assistant turn
217
def add_turn(messages, user_input, assistant_response):
    """Append one user/assistant exchange to the in-progress message list.

    Both texts must be non-blank. On success the pair is appended to
    ``messages`` (in place, whitespace-trimmed) and the two input fields are
    cleared; otherwise everything is returned unchanged with a prompt.

    Returns:
        ``(messages, user_input_value, assistant_response_value, status)``.
    """
    log_message("Attempting to add user/assistant turn.")

    both_filled = user_input.strip() and assistant_response.strip()
    if not both_filled:
        log_message("User input or assistant response is empty, not adding turn.")
        return (
            messages,
            user_input,
            assistant_response,
            "Please provide both User Input and Assistant Response.",
        )

    messages.extend(
        [
            {"role": "user", "content": user_input.strip()},
            {"role": "assistant", "content": assistant_response.strip()},
        ]
    )
    log_message("User/assistant turn added.")
    # Empty strings reset the two input textboxes.
    return messages, "", "", "Turn added successfully."
230
+
231
+
232
+ # Function to clear turns
233
def clear_turns():
    """Reset the in-progress turns.

    Returns:
        ``(messages, status)``: a fresh empty message list and an empty
        status string.
    """
    log_message("Clearing current turns.")
    fresh_messages = []
    blank_status = ""
    return fresh_messages, blank_status
237
+
238
+ # Function to add an entry to the dataset
239
def add_entry_to_dataset(dataset_entries, system_message, messages):
    """Bundle the system message and current turns into a new dataset entry.

    A non-blank system message (trimmed) becomes the first message, followed
    by the accumulated turns. If the result is empty, nothing is added.

    Returns:
        ``(dataset_entries, system_message_value, messages_value, status,
        size_label)``. On success the system message and turns are cleared;
        on failure they are returned unchanged.
    """
    log_message("Attempting to add entry to dataset.")

    trimmed_system = system_message.strip()
    entry_messages = []
    if trimmed_system:
        entry_messages.append({"role": "system", "content": trimmed_system})
        log_message("System message added to new entry.")
    entry_messages += messages
    log_message(f"New entry messages: {entry_messages}")

    if not entry_messages:
        log_message("No messages to add as an entry.")
        size_label = f"Number of entries: {len(dataset_entries)}"
        return (
            dataset_entries,
            system_message,
            messages,
            "Cannot add empty entry. Add system message or user/assistant turns.",
            size_label,
        )

    dataset_entries.append({"messages": entry_messages})
    log_message(f"Entry added to dataset. New dataset size: {len(dataset_entries)}")
    # The size label reflects the dataset after the append.
    size_label = f"Number of entries: {len(dataset_entries)}"
    return dataset_entries, "", [], "Entry added to dataset!", size_label
259
+
260
+
261
+ # Function to display current entry
262
def display_entry(dataset_entries, current_index):
    """Build the view/edit UI updates for the entry at ``current_index``.

    An out-of-range index is clamped to the nearest valid entry for display
    purposes only. A leading ``system`` message is routed to the system
    textbox; the remaining messages are rendered as Markdown and the first 10
    are placed in the editable textboxes.

    Args:
        dataset_entries: List of dataset entries (dicts with a ``messages`` list).
        current_index: Zero-based index of the entry to display.

    Returns:
        A 15-item tuple: display text, system-textbox update, 10 message
        textbox updates, save-button update, delete-button update, and an
        (empty) edit status string.
    """
    max_textboxes = 10

    log_message(f"Attempting to display entry at index: {current_index}")
    log_message(f"Current dataset_entries size in display_entry: {len(dataset_entries) if dataset_entries is not None else 0}")

    # Outputs for the "nothing to show" states: everything hidden and blank,
    # including the system textbox so no stale editor stays visible.
    hidden_system = gr.update(value="", visible=False)
    hidden_textboxes = [gr.update(value="", visible=False) for _ in range(max_textboxes)]
    hide_buttons = gr.update(visible=False)

    if not dataset_entries:
        log_message("dataset_entries is empty, cannot display.")
        return "No entries to display yet.", hidden_system, *hidden_textboxes, hide_buttons, hide_buttons, ""

    total_entries = len(dataset_entries)
    # The index can be stale after operations such as deletion.
    if not (0 <= current_index < total_entries):
        log_message(f"Current index {current_index} out of bounds for dataset size {total_entries}. Adjusting.")
        current_index = max(0, min(current_index, total_entries - 1))
        log_message(f"Adjusted index: {current_index}")

    entry = dataset_entries[current_index]
    log_message(f"Displaying entry {current_index + 1} of {total_entries}. Entry content sample: {str(entry)[:100]}...")

    # Entries come from user-supplied JSONL, so validate the shape first.
    entry_messages = entry.get('messages') if isinstance(entry, dict) else None
    is_valid = isinstance(entry_messages, list) and all(
        isinstance(msg, dict) for msg in entry_messages
    )
    if not is_valid:
        log_message(f"Warning: Invalid entry format or missing messages key at index {current_index}: {entry}")
        return f"Error displaying entry {current_index + 1}: Invalid format.", hidden_system, *hidden_textboxes, hide_buttons, hide_buttons, ""

    # Separate a leading system message from the conversational messages.
    system_message_content = ""
    messages_content = entry_messages
    if not entry_messages:
        log_message("Entry has empty messages list.")
    elif entry_messages[0].get('role') == 'system':
        system_message_content = entry_messages[0].get('content', '')
        messages_content = entry_messages[1:]
        log_message("Found system message and user/assistant messages.")
    else:
        log_message("No system message found, displaying all as user/assistant.")

    display_parts = [f"Viewing Entry {current_index + 1} of {total_entries}\n\n"]
    for msg in messages_content:
        role_label = str(msg.get('role', 'unknown')).capitalize()
        display_parts.append(f"**{role_label}:** {msg.get('content', '')}\n\n")
    display_text = "".join(display_parts)

    # Only the first `max_textboxes` messages get an editable textbox.
    editable = messages_content[:max_textboxes]
    if len(messages_content) > max_textboxes:
        log_message(f"Entry has {len(messages_content)} messages; only the first {max_textboxes} are editable.")
    textbox_updates = [
        gr.update(value=editable[i].get('content', ''), visible=True)
        if i < len(editable)
        else gr.update(value="", visible=False)
        for i in range(max_textboxes)
    ]

    log_message("Successfully prepared display text and textbox updates.")
    # Show the system editor and the save/delete buttons; clear edit status.
    return display_text, gr.update(value=system_message_content, visible=True), *textbox_updates, gr.update(visible=True), gr.update(visible=True), ""
345
+
346
+
347
+ # Function to navigate to the previous entry
348
def prev_entry(current_index, dataset_entries):
    """Return the index one step back, never going below 0.

    ``dataset_entries`` is accepted for signature parity with the other
    navigation handlers and is not used.
    """
    log_message(f"Navigating to previous entry from index {current_index}")
    if not (current_index > 0):
        log_message("Already at the beginning (index 0). Staying at 0.")
        return 0

    target_index = current_index - 1
    log_message(f"New index: {target_index}")
    return target_index
357
+
358
+ # Function to navigate to the next entry
359
def next_entry(current_index, dataset_entries):
    """Return the index one step forward, capped at the last entry.

    Returns 0 when the dataset is empty.
    """
    log_message(f"Navigating to next entry from index {current_index}")
    last_index = len(dataset_entries) - 1

    if last_index < 0:
        log_message("Dataset is empty. Staying at index 0.")
        return 0

    if current_index < last_index:
        target_index = current_index + 1
        log_message(f"New index: {target_index}")
        return target_index

    log_message("Already at the end. Staying at last index.")
    return last_index
371
+
372
+ # Function to go to a specific entry number
373
def go_to_entry(entry_number, dataset_entries):
    """Translate a 1-based entry number into a 0-based index.

    Args:
        entry_number: The number entered by the user (1-based).
        dataset_entries: The current list of dataset entries.

    Returns:
        ``(index, status)``. For a valid number, the matching 0-based index
        and an empty status. Otherwise index 0 and an error message.
    """
    log_message(f"Attempting to go to entry number: {entry_number}")
    total_entries = len(dataset_entries)
    default_index = 0  # Fallback for invalid or out-of-range input

    try:
        index = int(entry_number) - 1
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinite floats, which int() cannot convert.
        log_message(f"Invalid input for entry number: {entry_number}")
        return default_index, f"Error: Invalid input '{entry_number}'. Please enter a valid integer number."

    if 0 <= index < total_entries:
        log_message(f"Valid index calculated: {index}")
        return index, ""

    log_message(f"Calculated index {index} is out of bounds (0 to {total_entries-1 if total_entries > 0 else 0}).")
    return default_index, f"Error: Entry number {entry_number} is out of bounds. Please enter a number between 1 and {total_entries if total_entries > 0 else 1}."
395
+
396
+
397
+ # Function to update messages in the current entry
398
def update_entry_messages(dataset_entries, current_index, edited_system_message, *edited_contents):
    """Replace the messages of the current entry with the edited textbox contents.

    Textbox ``i`` corresponds to the ``i``-th message after an optional
    leading system message (the same layout ``display_entry`` presents).
    A non-blank textbox keeps the role of the original message at that
    position; a blank one deletes that message. Text entered in a textbox
    beyond the original messages becomes a new message whose role alternates
    from the previous one, starting with ``user``. Original messages beyond
    the 10 editable textboxes are kept unchanged.

    Args:
        dataset_entries: List of dataset entries; the current one is mutated.
        current_index: Zero-based index of the entry to update.
        edited_system_message: New system message text (blank removes it).
        *edited_contents: Textbox contents, one per editable message slot.

    Returns:
        ``(dataset_entries, status_message)``.
    """
    max_textboxes = 10

    log_message(f"Attempting to update entry at index: {current_index}")
    if not dataset_entries or not (0 <= current_index < len(dataset_entries)):
        log_message("Cannot update entry: dataset_entries empty or index out of bounds.")
        return dataset_entries, "Error: Cannot update entry. Dataset is empty or index is out of bounds."

    entry = dataset_entries[current_index]
    if not isinstance(entry, dict):
        log_message(f"Cannot update entry: invalid entry format at index {current_index}.")
        return dataset_entries, "Error: Cannot update entry. Invalid entry format."

    def _role_of(message):
        """Return the message's role, or None for a malformed message."""
        return message.get('role') if isinstance(message, dict) else None

    original_messages = entry.get('messages', [])
    if not isinstance(original_messages, list):
        original_messages = []

    # Mirror display_entry: a leading system message has its own textbox, so
    # the numbered textboxes map onto everything after it.
    if original_messages and _role_of(original_messages[0]) == 'system':
        editable_originals = original_messages[1:]
    else:
        editable_originals = original_messages
    original_count = len(editable_originals)

    updated_messages = []
    system_text = (edited_system_message or "").strip()
    if system_text:
        updated_messages.append({"role": "system", "content": system_text})
        log_message("Updated system message added.")

    for i in range(max_textboxes):
        # Tolerate fewer than `max_textboxes` values and None values.
        raw_content = edited_contents[i] if i < len(edited_contents) else ""
        edited_content = (raw_content or "").strip()

        if not edited_content:
            if i < original_count:
                log_message(f"Original message {i+1} was cleared, effectively deleting it.")
            continue

        if i < original_count:
            # Existing message: keep its original role.
            role = _role_of(editable_originals[i]) or 'user'
            log_message(f"Updated original message {i+1} with role {role}.")
        else:
            # New message: alternate from the previous role. After a system
            # message (or nothing) the conversation starts with the user.
            last_role = updated_messages[-1]['role'] if updated_messages else None
            role = 'assistant' if last_role == 'user' else 'user'
            log_message(f"Added new message {i+1} with inferred role {role}.")
        updated_messages.append({"role": role, "content": edited_content})

    # Messages beyond the editable textboxes were never shown for editing;
    # carry them over so saving does not silently truncate the entry.
    overflow_messages = editable_originals[max_textboxes:]
    if overflow_messages:
        updated_messages.extend(overflow_messages)
        log_message(f"Preserved {len(overflow_messages)} messages beyond the {max_textboxes} editable textboxes.")

    if not updated_messages:
        # An empty result is only allowed when the entry consisted solely of
        # a system message that has now been cleared.
        only_system_cleared = (
            len(original_messages) == 1
            and _role_of(original_messages[0]) == 'system'
            and not system_text
        )
        if not only_system_cleared:
            log_message("Attempted to save an empty entry. Preventing save.")
            return dataset_entries, "Error: Cannot save an empty entry. Add system message or user/assistant turns."

    entry['messages'] = updated_messages
    log_message(f"Entry {current_index + 1} updated successfully. New message count: {len(updated_messages)}")
    return dataset_entries, f"Changes saved for Entry {current_index + 1}."
465
+
466
+
467
+ # Function to delete the current entry
468
def delete_entry(dataset_entries, current_index):
    """Remove the entry at ``current_index`` from the dataset (in place).

    Returns:
        ``(dataset_entries, new_index, status)``. When nothing can be deleted
        the index is returned unchanged with an error message. Otherwise the
        new index points at the entry that took the deleted one's place, the
        last entry if the tail was deleted, or 0 if the dataset is now empty.
    """
    log_message(f"Attempting to delete entry at index: {current_index}")

    index_is_valid = bool(dataset_entries) and 0 <= current_index < len(dataset_entries)
    if not index_is_valid:
        log_message("Cannot delete entry: dataset_entries empty or index out of bounds.")
        return dataset_entries, current_index, "Error: Cannot delete entry. Dataset is empty or index is out of bounds."

    removed_position = current_index
    log_message(f"Deleting entry at index {removed_position}.")
    dataset_entries.pop(removed_position)

    remaining = len(dataset_entries)
    if remaining == 0:
        next_index = 0
        log_message("Dataset is empty after deletion. Resetting index to 0.")
    elif removed_position >= remaining:
        # The tail entry was removed; fall back to the new last entry.
        next_index = remaining - 1
        log_message(f"Adjusting index after deletion to last entry: {next_index}")
    else:
        next_index = removed_position
        log_message(f"Index remains {next_index} after deletion.")

    return dataset_entries, next_index, f"Entry {removed_position + 1} deleted."
495
+
496
+
497
+ # Define the Gradio Interface
498
+ with gr.Blocks() as demo:
499
+ dataset_entries = gr.State([]) # Use Gradio State to maintain dataset entries
500
+ current_messages = gr.State([]) # Use Gradio State to maintain current messages for creation
501
+ current_entry_index = gr.State(0) # Use Gradio State for current viewing index
502
+ current_loaded_filename = gr.State("") # State to hold the name of the currently loaded file
503
+
504
+
505
+ gr.Markdown("## LLM Dataset Creator")
506
+
507
+ with gr.Tabs() as tabs:
508
+ with gr.TabItem("Create Entry", id=0):
509
+ gr.Markdown("### Create a new entry")
510
+ system_message_input = gr.Textbox(label="System Message", lines=5, placeholder="Instruksi peran yang sangat kuat (misalnya: Kamu adalah Yui Airi, teman yang santai...)")
511
+
512
+ gr.Markdown("### User and Assistant Messages")
513
+ user_input = gr.Textbox(label="User Input", lines=3)
514
+ assistant_response = gr.Textbox(label="Assistant Response", lines=3)
515
+
516
+ with gr.Row():
517
+ add_turn_btn = gr.Button("Add User/Assistant Turn")
518
+ clear_turns_btn = gr.Button("Clear Turns")
519
+
520
+ current_turns_output = gr.Markdown("Current Turns:")
521
+ # Add a dedicated status textbox for this tab
522
+ create_status_output = gr.Textbox(label="Status", interactive=False)
523
+
524
+
525
+ add_entry_btn = gr.Button("Add Entry to Dataset")
526
+
527
+ gr.Markdown("### Dataset Entries")
528
+ dataset_size_output = gr.Markdown("Number of entries: 0") # Define dataset_size_output here
529
+
530
+
531
+ # Link add_turn_btn to the add_turn function
532
+ add_turn_btn.click(
533
+ add_turn,
534
+ inputs=[current_messages, user_input, assistant_response],
535
+ outputs=[current_messages, user_input, assistant_response, create_status_output] # Update status output
536
+ ).then( # Chain another event to update the displayed turns and clear status
537
+ lambda messages: ("Current Turns:\n" + "\n".join([f"**{msg['role'].capitalize()}:** {msg['content']}" for msg in messages])),
538
+ inputs=[current_messages],
539
+ outputs=[current_turns_output]
540
+ )
541
+
542
+ # Link clear_turns_btn to the clear_turns function
543
+ clear_turns_btn.click(
544
+ clear_turns,
545
+ inputs=[],
546
+ outputs=[current_messages, create_status_output] # Clear messages and status output
547
+ ).then( # Chain another event to clear the displayed turns
548
+ lambda: "Current Turns:",
549
+ inputs=[],
550
+ outputs=[current_turns_output]
551
+ )
552
+
553
+ # Link add_entry_btn to the add_entry_to_dataset function
554
+ add_entry_btn.click(
555
+ add_entry_to_dataset,
556
+ inputs=[dataset_entries, system_message_input, current_messages],
557
+ outputs=[dataset_entries, system_message_input, current_messages, create_status_output, dataset_size_output] # Update status output
558
+ ).then( # Chain another event to clear turns output
559
+ lambda: "Current Turns:",
560
+ inputs=[],
561
+ outputs=[current_turns_output]
562
+ )
563
+
564
+
565
+ with gr.TabItem("View/Edit Entries", id=1):
566
+ gr.Markdown("### View Dataset Entries")
567
+ entry_display = gr.Markdown("No entries to display yet.") # Define entry_display here
568
+
569
+
570
+ # Components for navigation
571
+ with gr.Row():
572
+ prev_btn = gr.Button("Previous")
573
+ next_btn = gr.Button("Next")
574
+ go_to_input = gr.Number(label="Go to Entry #", value=1, precision=0)
575
+
576
+
577
+ # Textbox for editing system message
578
+ edited_system_message_input = gr.Textbox(label="System Message", lines=5, visible=False) # Define edited_system_message_input here
579
+
580
+
581
+ # Placeholder textboxes for editing user/assistant messages (assuming max 10 messages for simplicity)
582
+ # We need 10 output components for the textboxes
583
+ edited_message_inputs = [gr.Textbox(label=f"Message {i+1}", lines=3, visible=False) for i in range(10)] # Define edited_message_inputs here
584
+
585
+
586
+ save_changes_btn = gr.Button("Save Changes", visible=False) # Define save_changes_btn here
587
+ delete_entry_btn = gr.Button("Delete Entry", visible=False) # Define delete_entry_btn here
588
+
589
+
590
+ edit_status_output = gr.Textbox(label="Edit Status", interactive=False) # Define edit_status_output here, already visible
591
+
592
+
593
+ # Link navigation buttons and go_to_input to update the current_entry_index and display
594
+             # Each chained display_entry call must list every component it updates: entry_display, the 11 edit textboxes (1 system + 10 messages), both buttons, and the status box
595
+ prev_btn.click(
596
+ prev_entry,
597
+ inputs=[current_entry_index, dataset_entries],
598
+ outputs=[current_entry_index]
599
+ ).then( # Chain to display the updated entry
600
+ display_entry,
601
+ inputs=[dataset_entries, current_entry_index],
602
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Ensure all outputs are listed
603
+ )
604
+
605
+ next_btn.click(
606
+ next_entry,
607
+ inputs=[current_entry_index, dataset_entries],
608
+ outputs=[current_entry_index]
609
+ ).then( # Chain to display the updated entry
610
+ display_entry,
611
+ inputs=[dataset_entries, current_entry_index],
612
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Ensure all outputs are listed
613
+ )
614
+
615
+ go_to_input.submit( # Use submit event for number input
616
+ go_to_entry,
617
+ inputs=[go_to_input, dataset_entries],
618
+ outputs=[current_entry_index, edit_status_output] # Output to index and status
619
+ ).then( # Chain to display the updated entry (or the default if invalid)
620
+ display_entry,
621
+ inputs=[dataset_entries, current_entry_index],
622
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Ensure all outputs are listed
623
+ )
624
+
625
+             # Also run the same go_to_entry -> display_entry chain on the 'change' event of go_to_input (in addition to 'submit' above)
626
+ go_to_input.change( # Trigger on change as well
627
+ go_to_entry,
628
+ inputs=[go_to_input, dataset_entries],
629
+ outputs=[current_entry_index, edit_status_output] # Output to index and status
630
+ ).then( # Chain to display the updated entry (or the default if invalid)
631
+ display_entry,
632
+ inputs=[dataset_entries, current_entry_index],
633
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Ensure all outputs are listed
634
+ )
635
+
636
+
637
+ # Link save_changes_btn to the update_entry_messages function
638
+ save_changes_btn.click(
639
+ update_entry_messages,
640
+ inputs=[dataset_entries, current_entry_index, edited_system_message_input] + edited_message_inputs,
641
+ outputs=[dataset_entries, edit_status_output]
642
+ ).then( # Chain to re-display the entry after saving
643
+ display_entry,
644
+ inputs=[dataset_entries, current_entry_index],
645
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Ensure all outputs are listed
646
+ )
647
+
648
+ # Link delete_entry_btn to the delete_entry function
649
+ delete_entry_btn.click(
650
+ delete_entry,
651
+ inputs=[dataset_entries, current_entry_index], # Pass State objects as inputs to delete_entry
652
+ outputs=[dataset_entries, current_entry_index, edit_status_output] # delete_entry returns updated list, new index, and status
653
+ ).then( # First chained event: display the new current entry
654
+ fn=display_entry,
655
+                 # delete_entry has just written dataset_entries and current_entry_index (plus the status box),
655
+                 # so this step reads those two State components back as its inputs.
656
+                 # display_entry expects (dataset_entries, current_index)
658
+ inputs=[dataset_entries, current_entry_index],
659
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]
660
+ ).then( # Second chained event: update dataset size
661
+ lambda entries: f"Number of entries: {len(entries)}",
662
+ inputs=[dataset_entries],
663
+ outputs=[dataset_size_output]
664
+ )
665
+
666
+
667
+ with gr.TabItem("Save/Load Dataset", id=2):
668
+ gr.Markdown("### Save Dataset")
669
+             # Defaults to "dataset.jsonl"; the load handlers below overwrite this textbox with current_loaded_filename
670
+ filename_to_save = gr.Textbox(label="Enter filename to save", value="dataset.jsonl", key="filename_to_save") # Added key
671
+ with gr.Row():
672
+ save_local_btn = gr.Button("Save to File") # Changed button label
673
+ hf_save_btn = gr.Button("Save to Hugging Face Hub")
674
+
675
+ save_output = gr.Textbox(label="Save Status", interactive=False) # Already visible
676
+
677
+
678
+ with gr.Accordion("Hugging Face Hub (Save)", open=False):
679
+ hf_token_save = gr.Textbox(label="HF API Token", type="password")
680
+ hf_repo_id_save = gr.Textbox(label="HF Repo Name", placeholder="user/repo")
681
+ hf_file_path_save = gr.Textbox(label="File Path in Repo", value="dataset.jsonl")
682
+
683
+
684
+ # Link save buttons to their respective functions
685
+ save_local_btn.click(
686
+ save_dataset,
687
+ inputs=[dataset_entries, filename_to_save],
688
+ outputs=[save_output]
689
+ )
690
+
691
+ hf_save_btn.click(
692
+ save_to_hf,
693
+ inputs=[dataset_entries, hf_token_save, hf_repo_id_save, hf_file_path_save],
694
+ outputs=[save_output]
695
+ )
696
+
697
+
698
+ gr.Markdown("---")
699
+ gr.Markdown("### Load Dataset")
700
+
701
+ # Local File Load - Simplified to directly show upload and path input
702
+ gr.Markdown("#### Load from Local File")
703
+ uploaded_file = gr.File(label="Upload a JSONL file", file_types=[".jsonl"]) # Specify file type
704
+ local_file_path_input = gr.Textbox(label="Or load from local path", placeholder="/path/to/your/dataset.jsonl") # New path input
705
+ load_local_btn = gr.Button("Load Local File") # Changed button label
706
+
707
+
708
+ # Hugging Face Hub Load
709
+ gr.Markdown("#### Load from Hugging Face Hub")
710
+ with gr.Column():
711
+ hf_token_load = gr.Textbox(label="HF API Token (optional for public repos)", type="password")
712
+ hf_repo_id_load = gr.Textbox(label="HF Repository ID (e.g., your_username/your_repo)")
713
+ hf_file_path_load = gr.Textbox(label="Path file JSONL in repository (e.g., dataset.jsonl)")
714
+ load_hf_btn = gr.Button("Muat dari Hugging Face Hub")
715
+
716
+ load_output = gr.Textbox(label="Load Status", interactive=False) # Already visible
717
+
718
+
719
+ # Removed Logic to show/hide load columns based on radio button
720
+
721
+
722
+ # Link load buttons to their respective functions
723
+ # Modified load_local_btn to handle both upload and path input
724
+ load_local_btn.click(
725
+ load_dataset_from_file,
726
+ inputs=[uploaded_file, local_file_path_input], # Pass both file object and path input
727
+ outputs=[dataset_entries, current_entry_index, load_output, current_loaded_filename] # Update state variables and status
728
+ ).then( # Chain to update dataset size and display the first entry
729
+ display_entry, # Call display_entry first
730
+ inputs=[dataset_entries, current_entry_index],
731
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Update UI components
732
+ ).then( # Then update dataset size and filename
733
+ lambda entries, loaded_filename: (f"Number of entries: {len(entries)}", loaded_filename),
734
+ inputs=[dataset_entries, current_loaded_filename],
735
+ outputs=[dataset_size_output, filename_to_save] # Update filename_to_save
736
+ )
737
+
738
+
739
+ load_hf_btn.click(
740
+ load_from_hf,
741
+ inputs=[hf_token_load, hf_repo_id_load, hf_file_path_load],
742
+ outputs=[dataset_entries, current_entry_index, load_output, current_loaded_filename] # Update state variables and status
743
+ ).then( # Chain to update dataset size and display the first entry
744
+ display_entry, # Call display_entry first
745
+ inputs=[dataset_entries, current_entry_index],
746
+ outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Update UI components
747
+ ).then( # Then update dataset size and filename
748
+ lambda entries, loaded_filename: (f"Number of entries: {len(entries)}", loaded_filename),
749
+ inputs=[dataset_entries, current_loaded_filename],
750
+ outputs=[dataset_size_output, filename_to_save] # Update filename_to_save
751
+ )
752
+
753
+
754
+     # On app load, show the dataset size and render entry 0
755
+     # (the load buttons above refresh these same outputs through their own .then() chains)
756
+ demo.load(
757
+ fn=lambda entries: (f"Number of entries: {len(entries)}",) + display_entry(entries, 0), # Also display the first entry
758
+ inputs=[dataset_entries],
759
+ outputs=[dataset_size_output, entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]
760
+ )
761
+
762
+ # share=True requests a public link, which is needed to reach the app when it runs in Colab
763
+ # demo.launch(share=True)
764
+
765
+ demo.launch(share=True)