File size: 41,318 Bytes
56bfde5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 |
import gradio as gr
import json
import os
from huggingface_hub import HfApi, snapshot_download
import threading
import sys
# Basic logging helper
def log_message(message):
    """Emit an app log line to stderr so it is visible in Colab output."""
    sys.stderr.write(f"[APP_LOG] {message}\n")
# Function to handle saving dataset
def save_dataset(dataset_entries, filename):
    """Save the dataset entries to a local JSONL file.

    Args:
        dataset_entries: List of dict entries; non-dict items are skipped
            with a logged warning.
        filename: Path of the JSONL file to write.

    Returns:
        A human-readable status message.
    """
    log_message(f"Attempting to save dataset to local file: {filename}")
    if not dataset_entries:
        log_message("No entries in dataset_entries to save.")
        return "No entries to save."
    try:
        # Make sure each entry is a dictionary before dumping; collect lines
        # and join once instead of quadratic string concatenation.
        lines = []
        for entry in dataset_entries:
            if isinstance(entry, dict):
                lines.append(json.dumps(entry, ensure_ascii=False) + "\n")
            else:
                log_message(f"Warning: Skipping non-dictionary entry during local save: {entry}")
        with open(filename, "w", encoding="utf-8") as f:
            f.writelines(lines)
        log_message(f"Dataset successfully saved to local file: {filename}")
        return f"Dataset saved successfully to {filename}"
    except Exception as e:
        log_message(f"Error saving local file {filename}: {e}")
        # Include the specific exception 'e' in the error message
        return f"Error saving file: {e}"
# Function to handle saving to Hugging Face Hub
def save_to_hf(dataset_entries, hf_token, hf_repo_id, hf_file_path):
    """Save the dataset entries as a JSONL file in a Hugging Face dataset repo.

    Args:
        dataset_entries: List of dict entries; non-dict items are skipped
            with a logged warning.
        hf_token: Hugging Face API token with write access.
        hf_repo_id: Target repository ID, e.g. "user/my-dataset".
        hf_file_path: Path of the file inside the repository.

    Returns:
        A human-readable status message.
    """
    log_message(f"Attempting to save dataset to Hugging Face Hub: {hf_repo_id}/{hf_file_path}")
    if not dataset_entries:
        log_message("No dataset entries to save to Hugging Face Hub.")
        return "No dataset entries to save to Hugging Face Hub."
    elif not hf_token or not hf_repo_id or not hf_file_path:
        log_message("Missing HF token, repo ID, or file path for saving.")
        return "Please provide Hugging Face API Token, Repository Name, and file path."
    temp_file_path = "temp_dataset.jsonl"
    try:
        api = HfApi(token=hf_token)
        log_message("HfApi initialized.")
        # Make sure each entry is a dictionary before dumping; collect lines
        # and write once instead of quadratic string concatenation.
        lines = []
        for entry in dataset_entries:
            if isinstance(entry, dict):
                lines.append(json.dumps(entry, ensure_ascii=False) + "\n")
            else:
                log_message(f"Warning: Skipping non-dictionary entry during HF save: {entry}")
        # Save the data to a temporary file to upload
        log_message(f"Saving to temporary file for upload: {temp_file_path}")
        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.writelines(lines)
        log_message("Temporary file created.")
        # Upload the file
        log_message(f"Uploading file to HF Hub: repo_id={hf_repo_id}, path_in_repo={hf_file_path}")
        upload_info = api.upload_file(
            path_or_fileobj=temp_file_path,
            path_in_repo=hf_file_path,
            repo_id=hf_repo_id,
            repo_type="dataset",  # Specify repo type as dataset
            commit_message="Add or update dataset via Gradio app",
        )
        log_message(f"Upload successful. Info: {upload_info}")
        return f"Dataset saved successfully to Hugging Face Hub: {upload_info.url}"
    except Exception as e:
        log_message(f"HF Save Error: {e}")
        # Enhance specific error messages
        if "Repository not found" in str(e):
            return f"Error: Repository '{hf_repo_id}' not found. Please check the repository ID. Original error: {e}"
        elif "Authentication required" in str(e) or "Invalid token" in str(e):
            return f"Error: Authentication failed. Please check your Hugging Face API Token or ensure the repository is public. Original error: {e}"
        # Include the specific exception 'e' for other errors
        return f"Error saving to Hugging Face Hub: {e}"
    finally:
        # Always clean up the temporary file, even when the upload fails
        # (the original only removed it on the success path, leaking it on error).
        if os.path.exists(temp_file_path):
            log_message(f"Removing temporary file: {temp_file_path}")
            os.remove(temp_file_path)
# Function to handle loading dataset from a file
def load_dataset_from_file(file_obj, local_file_path):
    """Load dataset entries from an uploaded file object or a local file path.

    Args:
        file_obj: Uploaded file-like object (takes precedence), or None.
        local_file_path: Path to a local JSONL file, or None/empty string.

    Returns:
        Tuple of (entries, start_index, status_message, filename).
    """
    log_message("Attempting to load dataset from uploaded file or local path.")
    log_message(f"Received file_obj type: {type(file_obj)}")
    log_message(f"Received local_file_path type: {type(local_file_path)}")
    log_message(f"Received local_file_path value: {local_file_path}")
    loaded_entries = []
    filename = ""
    try:
        if file_obj is not None and hasattr(file_obj, 'read'):
            # An uploaded file object takes precedence over a typed-in path.
            log_message(f"Loading from uploaded file object: {file_obj.name}")
            jsonl_data = file_obj.read().decode("utf-8")
            filename = os.path.basename(file_obj.name)
            log_message(f"Read {len(jsonl_data)} characters from uploaded file object: {filename}")
        elif local_file_path is not None and isinstance(local_file_path, str) and local_file_path.strip():
            file_path = local_file_path.strip()
            log_message(f"Loading from local file path: {file_path}")
            if not os.path.exists(file_path):
                log_message(f"Local file not found: {file_path}")
                return [], 0, f"Error loading file: Local file not found at {file_path}", ""
            with open(file_path, "r", encoding="utf-8") as f:
                jsonl_data = f.read()
            filename = os.path.basename(file_path)
            log_message(f"Read {len(jsonl_data)} characters from local file path: {filename}")
        else:
            log_message("No file uploaded or local path provided.")
            # Return empty data, index 0, a user-facing message, and no filename
            return [], 0, "Please upload a JSONL file or provide a local path.", ""
        for i, line in enumerate(jsonl_data.strip().split('\n')):
            if line.strip():
                try:
                    loaded_entries.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Skip malformed lines but keep processing the rest.
                    log_message(f"Error decoding JSON on line {i+1}: {line.strip()} - {e}")
            else:
                log_message(f"Skipping empty line {i+1} in uploaded file.")
        log_message(f"Successfully loaded {len(loaded_entries)} entries from file: {filename}")
        # Return loaded entries, set index to 0, success message, and filename
        return loaded_entries, 0, f"Successfully loaded {len(loaded_entries)} entries.", filename
    except Exception as e:
        log_message(f"Error loading file: {e}")
        # Include the specific exception 'e' in the error message
        return [], 0, f"Error loading file: {e}", ""
# Function to handle loading from Hugging Face Hub
def load_from_hf(hf_token, hf_repo_id, hf_file_path):
    """Load dataset entries from a JSONL file in a Hugging Face dataset repo.

    Args:
        hf_token: Optional API token; anonymous download is used when empty.
        hf_repo_id: Repository ID, e.g. "user/my-dataset".
        hf_file_path: Path of the JSONL file inside the repository.

    Returns:
        Tuple of (entries, start_index, status_message, filename_for_save).
    """
    log_message(f"Attempting to load dataset from Hugging Face Hub: {hf_repo_id}/{hf_file_path}")
    if not hf_repo_id or not hf_file_path:
        log_message("Missing HF repo ID or file path for loading.")
        return [], 0, "Please provide Hugging Face Repository ID and file path.", ""
    loaded_entries = []
    try:
        # Download only the requested file from the Hub. Pass the token if
        # provided, otherwise allow anonymous download for public repos.
        log_message(f"Downloading file from HF Hub: repo_id={hf_repo_id}, path_in_repo={hf_file_path}")
        downloaded_folder = snapshot_download(repo_id=hf_repo_id, allow_patterns=hf_file_path, token=hf_token if hf_token else None)
        downloaded_file_path = os.path.join(downloaded_folder, hf_file_path)
        log_message(f"File downloaded to temporary path: {downloaded_file_path}")
        if not os.path.exists(downloaded_file_path):
            log_message(f"Downloaded file not found at expected path: {downloaded_file_path}")
            # Provide a specific message if the file is not found in the repo
            return [], 0, f"Error: File '{hf_file_path}' not found in repository '{hf_repo_id}'. Please check the file path.", ""
        with open(downloaded_file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if line.strip():
                    try:
                        loaded_entries.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        # Skip malformed lines but keep processing the rest.
                        log_message(f"Error decoding JSON on line {i+1} in HF file: {line.strip()} - {e}")
                else:
                    log_message(f"Skipping empty line {i+1} in HF file.")
        # Extract filename for saving
        filename_for_save = os.path.basename(hf_file_path)
        log_message(f"Successfully loaded {len(loaded_entries)} entries from Hugging Face Hub file: {filename_for_save}")
        # Return loaded entries, set index to 0, success message, and filename
        return loaded_entries, 0, f"Successfully loaded {len(loaded_entries)} entries from Hugging Face Hub.", filename_for_save
    except Exception as e:
        log_message(f"HF Load Error: {e}")
        # Map common failures to actionable messages (the original also assigned
        # a default message before this chain, which was always overwritten).
        if "Repository not found" in str(e):
            error_message = f"Error: Repository '{hf_repo_id}' not found. Please check the repository ID. Original error: {e}"
        elif "Authentication required" in str(e) or "Invalid token" in str(e):
            error_message = f"Error: Authentication failed. Please check your Hugging Face API Token or ensure the repository is public. Original error: {e}"
        elif "allow_patterns" in str(e):
            # Handle download errors related to the file-pattern filter
            error_message = f"Error: File path '{hf_file_path}' not found in repository '{hf_repo_id}' or pattern matching failed. Original error: {e}"
        else:
            error_message = f"Error loading from Hugging Face Hub: {e}"
        return [], 0, error_message, ""
# Function to add a user/assistant turn
def add_turn(messages, user_input, assistant_response):
    """Append one user/assistant exchange to the running message list.

    Returns the (possibly unchanged) messages, the new textbox values,
    and a status string for the UI.
    """
    log_message("Attempting to add user/assistant turn.")
    user_text = user_input.strip()
    assistant_text = assistant_response.strip()
    if not (user_text and assistant_text):
        # Reject the turn when either side is blank; keep the inputs as-is.
        log_message("User input or assistant response is empty, not adding turn.")
        return messages, user_input, assistant_response, "Please provide both User Input and Assistant Response."
    messages.extend([
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": assistant_text},
    ])
    log_message("User/assistant turn added.")
    # Clear both input fields and report success.
    return messages, "", "", "Turn added successfully."
# Function to clear turns
def clear_turns():
    """Discard the in-progress conversation turns and clear the status box."""
    log_message("Clearing current turns.")
    empty_messages, empty_status = [], ""
    return empty_messages, empty_status
# Function to add an entry to the dataset
def add_entry_to_dataset(dataset_entries, system_message, messages):
    """Append the current system message plus turns to the dataset as one entry.

    Returns the dataset, the new input values, a status string, and the
    dataset-size label text.
    """
    log_message("Attempting to add entry to dataset.")
    new_entry_messages = []
    system_text = system_message.strip()
    if system_text:
        new_entry_messages.append({"role": "system", "content": system_text})
        log_message("System message added to new entry.")
    new_entry_messages.extend(messages)
    log_message(f"New entry messages: {new_entry_messages}")
    if not new_entry_messages:
        # Nothing to store: keep the inputs intact and tell the user why.
        log_message("No messages to add as an entry.")
        return dataset_entries, system_message, messages, "Cannot add empty entry. Add system message or user/assistant turns.", f"Number of entries: {len(dataset_entries)}"
    dataset_entries.append({"messages": new_entry_messages})
    log_message(f"Entry added to dataset. New dataset size: {len(dataset_entries)}")
    # Clear the inputs and refresh the dataset-size display.
    return dataset_entries, "", [], "Entry added to dataset!", f"Number of entries: {len(dataset_entries)}"
# Function to display current entry
def display_entry(dataset_entries, current_index):
    """Displays the messages of the current dataset entry and provides editable textboxes.

    Returns, in order: the markdown display text, a gr.update for the system-message
    textbox, 10 gr.updates for the message textboxes, gr.updates for the save and
    delete buttons, and the edit-status string. The wiring in the UI depends on
    this exact output order.
    """
    log_message(f"Attempting to display entry at index: {current_index}")
    log_message(f"Current dataset_entries size in display_entry: {len(dataset_entries) if dataset_entries is not None else 0}")
    # Prepare default outputs for an empty dataset or invalid index.
    empty_display_text = "No entries to display yet."
    empty_system_message = ""
    # One gr.update per message textbox, cleared and hidden (10 total).
    hidden_textboxes = [gr.update(value="", visible=False) for _ in range(10)]
    hide_buttons = gr.update(visible=False)
    clear_status = ""
    if not dataset_entries:
        log_message("dataset_entries is empty, cannot display.")
        # *hidden_textboxes unpacks the list into the 10 textbox output slots.
        return empty_display_text, empty_system_message, *hidden_textboxes, hide_buttons, hide_buttons, clear_status
    total_entries = len(dataset_entries)
    # Ensure current_index is within bounds after operations like deletion.
    if not (0 <= current_index < total_entries):
        log_message(f"Current index {current_index} out of bounds for dataset size {total_entries}. Adjusting.")
        # Clamp to the last entry when too high, or 0 when the dataset is empty.
        current_index = max(0, min(current_index, total_entries - 1)) if total_entries > 0 else 0
        log_message(f"Adjusted index: {current_index}")
        # Re-check in case the dataset became empty.
        if not (0 <= current_index < total_entries):
            log_message("Dataset is empty after index adjustment.")
            return empty_display_text, empty_system_message, *hidden_textboxes, hide_buttons, hide_buttons, clear_status
    # Proceed with displaying the valid entry.
    entry = dataset_entries[current_index]
    log_message(f"Displaying entry {current_index + 1} of {total_entries}. Entry content sample: {str(entry)[:100]}...")
    display_text = f"Viewing Entry {current_index + 1} of {total_entries}\n\n"
    system_message_content = ""
    messages_content = []
    # Separate a leading system message from the user/assistant messages.
    if entry and 'messages' in entry and isinstance(entry['messages'], list) and entry['messages']:
        if entry['messages'][0]['role'] == 'system':
            system_message_content = entry['messages'][0]['content']
            messages_content = entry['messages'][1:]
            log_message("Found system message and user/assistant messages.")
        else:
            # No system message: treat everything as user/assistant turns.
            messages_content = entry['messages']
            log_message("No system message found, displaying all as user/assistant.")
    elif entry and 'messages' in entry and isinstance(entry['messages'], list) and not entry['messages']:
        log_message("Entry has empty messages list.")
        # messages_content stays empty.
    else:
        # Invalid entry format or missing 'messages' key: show an error for
        # this entry and hide the editing components.
        log_message(f"Warning: Invalid entry format or missing messages key at index {current_index}: {entry}")
        return f"Error displaying entry {current_index + 1}: Invalid format.", "", *hidden_textboxes, hide_buttons, hide_buttons, ""
    # Format the markdown view of the user/assistant messages.
    for msg in messages_content:
        display_text += f"**{msg['role'].capitalize()}:** {msg['content']}\n\n"
    # Values for the editable textboxes.
    editable_system_message = system_message_content
    # Take at most 10 message contents and pad with empty strings up to 10;
    # non-dict messages are skipped defensively.
    editable_messages = [msg.get('content', '') for msg in messages_content[:10] if isinstance(msg, dict)] + [""] * (10 - len(messages_content[:10]))
    # Show only as many textboxes as there are messages (max 10).
    textbox_updates = [gr.update(value=editable_messages[i], visible=(i < len(messages_content) and i < 10)) for i in range(10)]
    log_message("Successfully prepared display text and textbox updates.")
    # Show the edit/delete buttons and clear the edit status.
    return display_text, gr.update(value=editable_system_message, visible=True), *textbox_updates, gr.update(visible=True), gr.update(visible=True), ""
# Function to navigate to the previous entry
def prev_entry(current_index, dataset_entries):
    """Return the index of the previous entry, clamped at 0."""
    log_message(f"Navigating to previous entry from index {current_index}")
    if current_index <= 0:
        # Already at (or before) the first entry.
        log_message("Already at the beginning (index 0). Staying at 0.")
        return 0
    new_index = current_index - 1
    log_message(f"New index: {new_index}")
    return new_index
# Function to navigate to the next entry
def next_entry(current_index, dataset_entries):
    """Return the index of the next entry, clamped to the last valid index."""
    log_message(f"Navigating to next entry from index {current_index}")
    last_index = len(dataset_entries) - 1
    if last_index < 0:
        # No entries at all.
        log_message("Dataset is empty. Staying at index 0.")
        return 0
    if current_index < last_index:
        new_index = current_index + 1
        log_message(f"New index: {new_index}")
        return new_index
    # Already at (or past) the final entry.
    log_message("Already at the end. Staying at last index.")
    return last_index
# Function to go to a specific entry number
def go_to_entry(entry_number, dataset_entries):
    """Navigate to a specific 1-based entry number.

    Args:
        entry_number: 1-based entry number typed by the user (any type).
        dataset_entries: Full list of dataset entries.

    Returns:
        Tuple of (0-based index, status_message); falls back to index 0 with
        an error message on invalid or out-of-range input.
    """
    log_message(f"Attempting to go to entry number: {entry_number}")
    total_entries = len(dataset_entries)
    # Fallback index for bad input (the original computed this with a no-op
    # conditional that yielded 0 in both branches).
    default_index = 0
    try:
        # Convert the 1-based user input to a 0-based index.
        index = int(entry_number) - 1
        if 0 <= index < total_entries:
            log_message(f"Valid index calculated: {index}")
            # Valid index: clear the status message.
            return index, ""
        log_message(f"Calculated index {index} is out of bounds (0 to {total_entries-1 if total_entries > 0 else 0}).")
        return default_index, f"Error: Entry number {entry_number} is out of bounds. Please enter a number between 1 and {total_entries if total_entries > 0 else 1}."
    except (ValueError, TypeError):
        log_message(f"Invalid input for entry number: {entry_number}")
        return default_index, f"Error: Invalid input '{entry_number}'. Please enter a valid integer number."
# Function to update messages in the current entry
def update_entry_messages(dataset_entries, current_index, edited_system_message, *edited_contents):
    """Updates the messages of the current entry with edited textbox content.

    Args:
        dataset_entries: Full list of dataset entries (mutated in place).
        current_index: Index of the entry being edited.
        edited_system_message: Text of the (possibly empty) system-message box.
        *edited_contents: Contents of the 10 editable message textboxes, in order.

    Returns:
        Tuple of (dataset_entries, status_message).
    """
    log_message(f"Attempting to update entry at index: {current_index}")
    if not dataset_entries or not (0 <= current_index < len(dataset_entries)):
        log_message("Cannot update entry: dataset_entries empty or index out of bounds.")
        # Return current state and a specific error message.
        return dataset_entries, "Error: Cannot update entry. Dataset is empty or index is out of bounds."
    updated_messages = []
    # Keep the system message only when it is non-empty after stripping.
    if edited_system_message.strip():
        updated_messages.append({"role": "system", "content": edited_system_message.strip()})
        log_message("Updated system message added.")
    # Collect the original user/assistant messages (system excluded) so each
    # edited textbox can be matched back to its original role by position.
    original_messages_in_entry = dataset_entries[current_index].get('messages', [])
    original_user_assistant_messages = [msg for msg in original_messages_in_entry if msg.get('role') in ['user', 'assistant']]
    original_user_assistant_count = len(original_user_assistant_messages)
    # Walk the edited contents from the textboxes; a maximum of 10 editable
    # message textboxes is assumed by the UI.
    for i in range(10):
        edited_content = edited_contents[i]
        if edited_content.strip():
            if i < original_user_assistant_count:
                # Existing message: keep its original role.
                updated_messages.append({"role": original_user_assistant_messages[i].get('role', 'user'), "content": edited_content.strip()})
                log_message(f"Updated original message {i+1} with role {original_user_assistant_messages[i].get('role', 'user')}.")
            else:
                # New message beyond the original count (but within the 10
                # textboxes): infer the role by alternating user/assistant.
                if len(updated_messages) > 0:
                    last_role = updated_messages[-1]['role']
                    # Assumes the sequence alternates user, assistant, user, ...
                    new_role = 'user' if last_role == 'assistant' else 'assistant'
                else:
                    # No messages yet (only a system message or initially
                    # empty): the first new message is 'user'.
                    new_role = 'user'
                updated_messages.append({"role": new_role, "content": edited_content.strip()})
                log_message(f"Added new message {i+1} with inferred role {new_role}.")
        # An emptied textbox that corresponded to an original message deletes
        # that message (it is simply not added to updated_messages).
        elif i < original_user_assistant_count:
            log_message(f"Original message {i+1} was cleared, effectively deleting it.")
    # Check whether the updated entry still has any messages at all.
    if not updated_messages:
        # Prevent saving an empty entry, except when the original entry was a
        # single system message that the user deliberately cleared.
        if not (len(original_messages_in_entry) == 1 and original_messages_in_entry[0]['role'] == 'system' and not edited_system_message.strip()):
            log_message("Attempted to save an empty entry. Preventing save.")
            # Return current state and a specific error message.
            return dataset_entries, "Error: Cannot save an empty entry. Add system message or user/assistant turns."
    # Write the updated messages back into the dataset.
    if 0 <= current_index < len(dataset_entries):
        dataset_entries[current_index]['messages'] = updated_messages
        log_message(f"Entry {current_index + 1} updated successfully. New message count: {len(updated_messages)}")
        return dataset_entries, f"Changes saved for Entry {current_index + 1}."
    else:
        log_message(f"Error updating entry: index {current_index} out of bounds.")
        # Return current state and a specific error message.
        return dataset_entries, "Error: Cannot update entry. Index out of bounds."
# Function to delete the current entry
def delete_entry(dataset_entries, current_index):
    """Remove the entry at current_index and return a clamped new index.

    Returns the dataset, the adjusted index, and a status string.
    """
    log_message(f"Attempting to delete entry at index: {current_index}")
    index_is_valid = bool(dataset_entries) and 0 <= current_index < len(dataset_entries)
    if not index_is_valid:
        # Nothing was deleted, so the index is returned unchanged.
        log_message("Cannot delete entry: dataset_entries empty or index out of bounds.")
        return dataset_entries, current_index, "Error: Cannot delete entry. Dataset is empty or index is out of bounds."
    log_message(f"Deleting entry at index {current_index}.")
    dataset_entries.pop(current_index)
    remaining = len(dataset_entries)
    # Clamp the index into the shrunken dataset.
    if remaining == 0:
        new_index = 0
        log_message("Dataset is empty after deletion. Resetting index to 0.")
    elif current_index >= remaining:
        new_index = remaining - 1
        log_message(f"Adjusting index after deletion to last entry: {new_index}")
    else:
        new_index = current_index
        log_message(f"Index remains {new_index} after deletion.")
    return dataset_entries, new_index, f"Entry {current_index + 1} deleted."
# Define the Gradio Interface
with gr.Blocks() as demo:
dataset_entries = gr.State([]) # Use Gradio State to maintain dataset entries
current_messages = gr.State([]) # Use Gradio State to maintain current messages for creation
current_entry_index = gr.State(0) # Use Gradio State for current viewing index
current_loaded_filename = gr.State("") # State to hold the name of the currently loaded file
gr.Markdown("## LLM Dataset Creator")
with gr.Tabs() as tabs:
with gr.TabItem("Create Entry", id=0):
gr.Markdown("### Create a new entry")
system_message_input = gr.Textbox(label="System Message", lines=5, placeholder="Instruksi peran yang sangat kuat (misalnya: Kamu adalah Yui Airi, teman yang santai...)")
gr.Markdown("### User and Assistant Messages")
user_input = gr.Textbox(label="User Input", lines=3)
assistant_response = gr.Textbox(label="Assistant Response", lines=3)
with gr.Row():
add_turn_btn = gr.Button("Add User/Assistant Turn")
clear_turns_btn = gr.Button("Clear Turns")
current_turns_output = gr.Markdown("Current Turns:")
# Add a dedicated status textbox for this tab
create_status_output = gr.Textbox(label="Status", interactive=False)
add_entry_btn = gr.Button("Add Entry to Dataset")
gr.Markdown("### Dataset Entries")
dataset_size_output = gr.Markdown("Number of entries: 0") # Define dataset_size_output here
# Link add_turn_btn to the add_turn function
add_turn_btn.click(
add_turn,
inputs=[current_messages, user_input, assistant_response],
outputs=[current_messages, user_input, assistant_response, create_status_output] # Update status output
).then( # Chain another event to update the displayed turns and clear status
lambda messages: ("Current Turns:\n" + "\n".join([f"**{msg['role'].capitalize()}:** {msg['content']}" for msg in messages])),
inputs=[current_messages],
outputs=[current_turns_output]
)
# Link clear_turns_btn to the clear_turns function
clear_turns_btn.click(
clear_turns,
inputs=[],
outputs=[current_messages, create_status_output] # Clear messages and status output
).then( # Chain another event to clear the displayed turns
lambda: "Current Turns:",
inputs=[],
outputs=[current_turns_output]
)
# Link add_entry_btn to the add_entry_to_dataset function
add_entry_btn.click(
add_entry_to_dataset,
inputs=[dataset_entries, system_message_input, current_messages],
outputs=[dataset_entries, system_message_input, current_messages, create_status_output, dataset_size_output] # Update status output
).then( # Chain another event to clear turns output
lambda: "Current Turns:",
inputs=[],
outputs=[current_turns_output]
)
with gr.TabItem("View/Edit Entries", id=1):
gr.Markdown("### View Dataset Entries")
entry_display = gr.Markdown("No entries to display yet.") # Define entry_display here
# Components for navigation
with gr.Row():
prev_btn = gr.Button("Previous")
next_btn = gr.Button("Next")
go_to_input = gr.Number(label="Go to Entry #", value=1, precision=0)
# Textbox for editing system message
edited_system_message_input = gr.Textbox(label="System Message", lines=5, visible=False) # Define edited_system_message_input here
# Placeholder textboxes for editing user/assistant messages (assuming max 10 messages for simplicity)
# We need 10 output components for the textboxes
edited_message_inputs = [gr.Textbox(label=f"Message {i+1}", lines=3, visible=False) for i in range(10)] # Define edited_message_inputs here
save_changes_btn = gr.Button("Save Changes", visible=False) # Define save_changes_btn here
delete_entry_btn = gr.Button("Delete Entry", visible=False) # Define delete_entry_btn here
edit_status_output = gr.Textbox(label="Edit Status", interactive=False) # Define edit_status_output here, already visible
# --- Navigation wiring ---
# Each navigation event first writes the new index into current_entry_index,
# then chains display_entry to refresh every edit-tab component (entry view,
# system-message box, the 10 message boxes, both buttons' visibility, status).
prev_btn.click(
prev_entry,
inputs=[current_entry_index, dataset_entries],
outputs=[current_entry_index]
).then(  # re-render the entry at the updated index
display_entry,
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]  # full edit-tab output set
)
next_btn.click(
next_entry,
inputs=[current_entry_index, dataset_entries],
outputs=[current_entry_index]
).then(  # re-render the entry at the updated index
display_entry,
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]  # full edit-tab output set
)
go_to_input.submit(  # fires when the user presses Enter in the number box
go_to_entry,
inputs=[go_to_input, dataset_entries],
outputs=[current_entry_index, edit_status_output]  # go_to_entry reports invalid indices via the status box
).then(  # re-render the entry at the (possibly unchanged) index
display_entry,
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]  # full edit-tab output set
)
# NOTE(review): this `.change` listener duplicates the `.submit` one above, so
# navigation is triggered on every keystroke in the box, not only on Enter —
# presumably intentional for live navigation, but confirm it is desired.
go_to_input.change(
go_to_entry,
inputs=[go_to_input, dataset_entries],
outputs=[current_entry_index, edit_status_output]  # go_to_entry reports invalid indices via the status box
).then(  # re-render the entry at the (possibly unchanged) index
display_entry,
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]  # full edit-tab output set
)
# --- Save / delete wiring ---
# Save: write the edited system message + 10 message boxes back into the
# entry list held in state, then re-render so the view reflects the save.
save_changes_btn.click(
update_entry_messages,
inputs=[dataset_entries, current_entry_index, edited_system_message_input] + edited_message_inputs,
outputs=[dataset_entries, edit_status_output]
).then(  # re-render the entry so the read-only view matches the saved data
display_entry,
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]  # full edit-tab output set
)
# Delete: remove the current entry, clamp the index, then refresh the view
# and the size readout.  delete_entry returns (updated_entries, new_index,
# status_message), which land in the three State/Textbox outputs below.
delete_entry_btn.click(
delete_entry,
inputs=[dataset_entries, current_entry_index],
outputs=[dataset_entries, current_entry_index, edit_status_output]
).then(  # show whichever entry is now at the (possibly clamped) index
fn=display_entry,
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]
).then(  # finally refresh the dataset-size readout
lambda entries: f"Number of entries: {len(entries)}",
inputs=[dataset_entries],
outputs=[dataset_size_output]
)
with gr.TabItem("Save/Load Dataset", id=2):
gr.Markdown("### Save Dataset")
# Use the state variable for the filename input's value
filename_to_save = gr.Textbox(label="Enter filename to save", value="dataset.jsonl", key="filename_to_save") # Added key
with gr.Row():
save_local_btn = gr.Button("Save to File") # Changed button label
hf_save_btn = gr.Button("Save to Hugging Face Hub")
save_output = gr.Textbox(label="Save Status", interactive=False) # Already visible
with gr.Accordion("Hugging Face Hub (Save)", open=False):
hf_token_save = gr.Textbox(label="HF API Token", type="password")
hf_repo_id_save = gr.Textbox(label="HF Repo Name", placeholder="user/repo")
hf_file_path_save = gr.Textbox(label="File Path in Repo", value="dataset.jsonl")
# Link save buttons to their respective functions
save_local_btn.click(
save_dataset,
inputs=[dataset_entries, filename_to_save],
outputs=[save_output]
)
hf_save_btn.click(
save_to_hf,
inputs=[dataset_entries, hf_token_save, hf_repo_id_save, hf_file_path_save],
outputs=[save_output]
)
gr.Markdown("---")
gr.Markdown("### Load Dataset")
# Local File Load - Simplified to directly show upload and path input
gr.Markdown("#### Load from Local File")
uploaded_file = gr.File(label="Upload a JSONL file", file_types=[".jsonl"]) # Specify file type
local_file_path_input = gr.Textbox(label="Or load from local path", placeholder="/path/to/your/dataset.jsonl") # New path input
load_local_btn = gr.Button("Load Local File") # Changed button label
# Hugging Face Hub Load
gr.Markdown("#### Load from Hugging Face Hub")
with gr.Column():
hf_token_load = gr.Textbox(label="HF API Token (optional for public repos)", type="password")
hf_repo_id_load = gr.Textbox(label="HF Repository ID (e.g., your_username/your_repo)")
hf_file_path_load = gr.Textbox(label="Path file JSONL in repository (e.g., dataset.jsonl)")
load_hf_btn = gr.Button("Muat dari Hugging Face Hub")
load_output = gr.Textbox(label="Load Status", interactive=False) # Already visible
# Removed Logic to show/hide load columns based on radio button
# Link load buttons to their respective functions
# Modified load_local_btn to handle both upload and path input
load_local_btn.click(
load_dataset_from_file,
inputs=[uploaded_file, local_file_path_input], # Pass both file object and path input
outputs=[dataset_entries, current_entry_index, load_output, current_loaded_filename] # Update state variables and status
).then( # Chain to update dataset size and display the first entry
display_entry, # Call display_entry first
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Update UI components
).then( # Then update dataset size and filename
lambda entries, loaded_filename: (f"Number of entries: {len(entries)}", loaded_filename),
inputs=[dataset_entries, current_loaded_filename],
outputs=[dataset_size_output, filename_to_save] # Update filename_to_save
)
load_hf_btn.click(
load_from_hf,
inputs=[hf_token_load, hf_repo_id_load, hf_file_path_load],
outputs=[dataset_entries, current_entry_index, load_output, current_loaded_filename] # Update state variables and status
).then( # Chain to update dataset size and display the first entry
display_entry, # Call display_entry first
inputs=[dataset_entries, current_entry_index],
outputs=[entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output] # Update UI components
).then( # Then update dataset size and filename
lambda entries, loaded_filename: (f"Number of entries: {len(entries)}", loaded_filename),
inputs=[dataset_entries, current_loaded_filename],
outputs=[dataset_size_output, filename_to_save] # Update filename_to_save
)
# --- Startup ---
# On app load, show the dataset size and render entry 0 so the edit tab is
# populated immediately.  The lambda prepends the size string to whatever
# tuple display_entry returns (assumes display_entry returns a tuple matching
# the component list after dataset_size_output — TODO confirm).
demo.load(
fn=lambda entries: (f"Number of entries: {len(entries)}",) + display_entry(entries, 0),
inputs=[dataset_entries],
outputs=[dataset_size_output, entry_display, edited_system_message_input, *edited_message_inputs, save_changes_btn, delete_entry_btn, edit_status_output]
)
# share=True exposes a public Gradio URL (required when running in Colab,
# where localhost is not reachable from the browser).
demo.launch(share=True)
|