Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,765 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from huggingface_hub import HfApi, snapshot_download
|
| 6 |
+
import threading
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
# Add basic logging
|
| 10 |
+
def log_message(message):
    """Write *message* to standard error, prefixed with ``[APP_LOG]``."""
    # stderr is used so the line appears in Colab output.
    tagged_line = f"[APP_LOG] {message}"
    print(tagged_line, file=sys.stderr)
|
| 12 |
+
|
| 13 |
+
# Function to handle saving dataset
|
| 14 |
+
def save_dataset(dataset_entries, filename):
    """Save the dataset entries to a local JSONL file.

    Every dictionary entry is serialized as one JSON object per line.
    Entries that are not dictionaries are skipped and logged.

    Args:
        dataset_entries: List of entries; only ``dict`` items are written.
        filename: Path of the file to write (opened in ``"w"`` mode).

    Returns:
        A status message: success, "nothing to save", or the error text.
    """
    log_message(f"Attempting to save dataset to local file: {filename}")
    if not dataset_entries:
        log_message("No entries in dataset_entries to save.")
        return "No entries to save."

    try:
        # Serialize everything before opening the file so that a
        # serialization failure never leaves a half-written file behind.
        lines = []
        for entry in dataset_entries:
            # Make sure the entry is a dictionary before dumping it.
            if isinstance(entry, dict):
                lines.append(json.dumps(entry, ensure_ascii=False) + "\n")
            else:
                log_message(f"Warning: Skipping non-dictionary entry during local save: {entry}")

        with open(filename, "w", encoding="utf-8") as f:
            f.writelines(lines)
        log_message(f"Dataset successfully saved to local file: {filename}")
        return f"Dataset saved successfully to {filename}"
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        log_message(f"Error saving local file {filename}: {e}")
        return f"Error saving file: {e}"
|
| 39 |
+
|
| 40 |
+
# Function to handle saving to Hugging Face Hub
|
| 41 |
+
def save_to_hf(dataset_entries, hf_token, hf_repo_id, hf_file_path):
    """Upload the dataset entries as a JSONL file to a Hugging Face dataset repo.

    Dictionary entries are serialized one JSON object per line; other
    entries are skipped and logged. The payload is uploaded from memory, so
    no temporary file is left behind when the upload fails.

    Args:
        dataset_entries: List of entries; only ``dict`` items are uploaded.
        hf_token: Hugging Face API token used to authenticate the upload.
        hf_repo_id: Target repository ID.
        hf_file_path: Destination path of the file inside the repository.

    Returns:
        A status message describing success or the error encountered.
    """
    log_message(f"Attempting to save dataset to Hugging Face Hub: {hf_repo_id}/{hf_file_path}")
    if not dataset_entries:
        log_message("No dataset entries to save to Hugging Face Hub.")
        return "No dataset entries to save to Hugging Face Hub."
    if not hf_token or not hf_repo_id or not hf_file_path:
        log_message("Missing HF token, repo ID, or file path for saving.")
        return "Please provide Hugging Face API Token, Repository Name, and file path."

    try:
        api = HfApi(token=hf_token)
        log_message("HfApi initialized.")

        lines = []
        for entry in dataset_entries:
            # Make sure the entry is a dictionary before dumping it.
            if isinstance(entry, dict):
                lines.append(json.dumps(entry, ensure_ascii=False) + "\n")
            else:
                log_message(f"Warning: Skipping non-dictionary entry during HF save: {entry}")
        payload = "".join(lines).encode("utf-8")

        # upload_file accepts raw bytes, which avoids a fixed-name temporary
        # file that could leak on failure or collide between sessions.
        log_message(f"Uploading file to HF Hub: repo_id={hf_repo_id}, path_in_repo={hf_file_path}")
        upload_info = api.upload_file(
            path_or_fileobj=payload,
            path_in_repo=hf_file_path,
            repo_id=hf_repo_id,
            repo_type="dataset",  # Specify repo type as dataset
            commit_message="Add or update dataset via Gradio app",
        )
        log_message(f"Upload successful. Info: {upload_info}")

        # NOTE(review): the upload result is expected to expose `commit_url`
        # (not `url`); fall back to its string form otherwise. Confirm against
        # the installed huggingface_hub version.
        uploaded_url = getattr(upload_info, "commit_url", None) or str(upload_info)
        return f"Dataset saved successfully to Hugging Face Hub: {uploaded_url}"

    except Exception as e:
        # UI boundary: translate the failure into a user-facing message.
        log_message(f"HF Save Error: {e}")
        reason = str(e)
        if "Repository not found" in reason:
            return f"Error: Repository '{hf_repo_id}' not found. Please check the repository ID. Original error: {e}"
        if "Authentication required" in reason or "Invalid token" in reason:
            return f"Error: Authentication failed. Please check your Hugging Face API Token or ensure the repository is public. Original error: {e}"
        return f"Error saving to Hugging Face Hub: {e}"
|
| 100 |
+
|
| 101 |
+
# Function to handle loading dataset from a file
|
| 102 |
+
def load_dataset_from_file(file_obj, local_file_path):
    """Load dataset entries from an uploaded file or a local JSONL path.

    The upload takes precedence over ``local_file_path``. The upload may be a
    file-like object (anything with ``read``) or a path string. Lines that
    are blank or fail to parse as JSON are logged and skipped.

    Args:
        file_obj: Uploaded file object, a path string, or ``None``.
        local_file_path: Path to a local JSONL file, or empty/``None``.

    Returns:
        A tuple ``(entries, index, status_message, filename)`` where
        ``index`` is always ``0`` and ``filename`` is the base name of the
        loaded file (empty on failure).
    """
    log_message("Attempting to load dataset from uploaded file or local path.")
    log_message(f"Received file_obj type: {type(file_obj)}")
    log_message(f"Received local_file_path type: {type(local_file_path)}")
    log_message(f"Received local_file_path value: {local_file_path}")

    loaded_entries = []
    filename = ""

    try:
        if file_obj is not None and hasattr(file_obj, 'read'):
            # File-like upload. `name` is optional on generic file objects.
            source_name = getattr(file_obj, "name", "") or ""
            log_message(f"Loading from uploaded file object: {source_name}")
            raw = file_obj.read()
            # Binary handles return bytes, text handles return str.
            jsonl_data = raw.decode("utf-8") if isinstance(raw, (bytes, bytearray)) else raw
            filename = os.path.basename(source_name)
            log_message(f"Read {len(jsonl_data)} characters from uploaded file object: {filename}")
        else:
            # NOTE(review): newer Gradio File components may hand over the
            # upload as a path string - confirm against the component config.
            if isinstance(file_obj, str) and file_obj.strip():
                file_path = file_obj.strip()
            elif isinstance(local_file_path, str) and local_file_path.strip():
                file_path = local_file_path.strip()
            else:
                log_message("No file uploaded or local path provided.")
                return [], 0, "Please upload a JSONL file or provide a local path.", ""

            log_message(f"Loading from local file path: {file_path}")
            if not os.path.exists(file_path):
                log_message(f"Local file not found: {file_path}")
                return [], 0, f"Error loading file: Local file not found at {file_path}", ""

            with open(file_path, "r", encoding="utf-8") as f:
                jsonl_data = f.read()
            filename = os.path.basename(file_path)
            log_message(f"Read {len(jsonl_data)} characters from local file path: {filename}")

        # Split on "\n" only: str.splitlines() would also break on separators
        # that may legally appear unescaped inside JSON strings.
        for i, line in enumerate(jsonl_data.strip().split('\n')):
            if not line.strip():
                log_message(f"Skipping empty line {i+1} in uploaded file.")
                continue
            try:
                loaded_entries.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Keep processing the remaining lines even if one fails.
                log_message(f"Error decoding JSON on line {i+1}: {line.strip()} - {e}")

        log_message(f"Successfully loaded {len(loaded_entries)} entries from file: {filename}")
        return loaded_entries, 0, f"Successfully loaded {len(loaded_entries)} entries.", filename

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        log_message(f"Error loading file: {e}")
        return [], 0, f"Error loading file: {e}", ""
|
| 155 |
+
|
| 156 |
+
# Function to handle loading from Hugging Face Hub
|
| 157 |
+
def load_from_hf(hf_token, hf_repo_id, hf_file_path):
    """Load dataset entries from a JSONL file in a Hugging Face dataset repo.

    The file is fetched with ``snapshot_download`` restricted to
    ``hf_file_path``. Blank lines and lines that fail to parse as JSON are
    logged and skipped.

    Args:
        hf_token: Hugging Face API token; empty/``None`` means anonymous.
        hf_repo_id: Repository ID to download from.
        hf_file_path: Path of the JSONL file inside the repository.

    Returns:
        A tuple ``(entries, index, status_message, filename)`` where
        ``index`` is always ``0`` and ``filename`` is the base name of
        ``hf_file_path`` (empty on failure).
    """
    log_message(f"Attempting to load dataset from Hugging Face Hub: {hf_repo_id}/{hf_file_path}")
    if not hf_repo_id or not hf_file_path:
        log_message("Missing HF repo ID or file path for loading.")
        return [], 0, "Please provide Hugging Face Repository ID and file path.", ""

    loaded_entries = []
    try:
        log_message(f"Downloading file from HF Hub: repo_id={hf_repo_id}, path_in_repo={hf_file_path}")
        # repo_type must match the "dataset" type used by save_to_hf;
        # otherwise a file saved by this app cannot be loaded back.
        # Pass the token if provided, otherwise download anonymously.
        downloaded_folder = snapshot_download(
            repo_id=hf_repo_id,
            repo_type="dataset",
            allow_patterns=hf_file_path,
            token=hf_token if hf_token else None,
        )
        downloaded_file_path = os.path.join(downloaded_folder, hf_file_path)
        log_message(f"File downloaded to temporary path: {downloaded_file_path}")

        if not os.path.exists(downloaded_file_path):
            # The pattern matched nothing, so the file is not in the repo.
            log_message(f"Downloaded file not found at expected path: {downloaded_file_path}")
            return [], 0, f"Error: File '{hf_file_path}' not found in repository '{hf_repo_id}'. Please check the file path.", ""

        with open(downloaded_file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if not line.strip():
                    log_message(f"Skipping empty line {i+1} in HF file.")
                    continue
                try:
                    loaded_entries.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Keep processing the remaining lines even if one fails.
                    log_message(f"Error decoding JSON on line {i+1} in HF file: {line.strip()} - {e}")

        # Base name is returned so it can be reused when saving.
        filename_for_save = os.path.basename(hf_file_path)
        log_message(f"Successfully loaded {len(loaded_entries)} entries from Hugging Face Hub file: {filename_for_save}")
        return loaded_entries, 0, f"Successfully loaded {len(loaded_entries)} entri dari Hugging Face Hub.", filename_for_save

    except Exception as e:
        # UI boundary: translate the failure into a user-facing message.
        log_message(f"HF Load Error: {e}")
        reason = str(e)
        if "Repository not found" in reason:
            error_message = f"Error: Repository '{hf_repo_id}' not found. Please check the repository ID. Original error: {e}"
        elif "Authentication required" in reason or "Invalid token" in reason:
            error_message = f"Error: Authentication failed. Please check your Hugging Face API Token or ensure the repository is public. Original error: {e}"
        elif "allow_patterns" in reason:
            # Download errors related to the file pattern.
            error_message = f"Error: File path '{hf_file_path}' not found in repository '{hf_repo_id}' or pattern matching failed. Original error: {e}"
        else:
            error_message = f"Error loading from Hugging Face Hub: {e}"
        return [], 0, error_message, ""
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# Function to add a user/assistant turn
|
| 217 |
+
def add_turn(messages, user_input, assistant_response):
    """Append one user/assistant turn to the in-progress message list.

    Both texts are stripped; if either is empty (or ``None``) nothing is
    added and the inputs are returned unchanged with a prompt to fill both.

    Args:
        messages: Current list of message dicts (``None`` is treated as empty).
        user_input: Text of the user message.
        assistant_response: Text of the assistant message.

    Returns:
        A tuple ``(messages, user_input, assistant_response, status)``; on
        success the two text fields are returned as empty strings so the
        input boxes are cleared.
    """
    log_message("Attempting to add user/assistant turn.")
    # Treat None like an empty string so a missing value is rejected
    # with the normal validation message instead of raising.
    user_text = (user_input or "").strip()
    assistant_text = (assistant_response or "").strip()
    if not user_text or not assistant_text:
        log_message("User input or assistant response is empty, not adding turn.")
        return messages, user_input, assistant_response, "Please provide both User Input and Assistant Response."

    if messages is None:
        messages = []
    messages.append({"role": "user", "content": user_text})
    messages.append({"role": "assistant", "content": assistant_text})
    log_message("User/assistant turn added.")
    # Updated messages, cleared input fields, and a success status.
    return messages, "", "", "Turn added successfully."
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# Function to clear turns
|
| 233 |
+
def clear_turns():
    """Reset the in-progress turns.

    Returns:
        A tuple of a fresh empty message list and an empty status string.
    """
    log_message("Clearing current turns.")
    fresh_messages = []
    blank_status = ""
    return fresh_messages, blank_status
|
| 237 |
+
|
| 238 |
+
# Function to add an entry to the dataset
|
| 239 |
+
def add_entry_to_dataset(dataset_entries, system_message, messages):
    """Append the current system message and turns to the dataset as one entry.

    Returns:
        A tuple ``(dataset_entries, system_message, messages, status,
        size_label)``. On success the system message and messages are
        returned empty; if there is nothing to add they are returned
        unchanged together with an explanatory status.
    """
    log_message("Attempting to add entry to dataset.")

    entry_messages = []
    trimmed_system = system_message.strip()
    if trimmed_system:
        entry_messages.append({"role": "system", "content": trimmed_system})
        log_message("System message added to new entry.")
    entry_messages += messages
    log_message(f"New entry messages: {entry_messages}")

    # Guard clause: refuse to store an entry that has no messages at all.
    if not entry_messages:
        log_message("No messages to add as an entry.")
        size_label = f"Number of entries: {len(dataset_entries)}"
        return (
            dataset_entries,
            system_message,
            messages,
            "Cannot add empty entry. Add system message or user/assistant turns.",
            size_label,
        )

    dataset_entries.append({"messages": entry_messages})
    log_message(f"Entry added to dataset. New dataset size: {len(dataset_entries)}")
    # The size label reflects the dataset after the append.
    size_label = f"Number of entries: {len(dataset_entries)}"
    return dataset_entries, "", [], "Entry added to dataset!", size_label
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
# Function to display current entry
|
| 262 |
+
def display_entry(dataset_entries, current_index):
    """Build the view and the editable fields for one dataset entry.

    A leading ``system`` message is shown in its own field; the remaining
    messages are rendered as text and loaded into up to 10 textboxes, one
    per message in order. Malformed entries yield an error view instead of
    raising.

    Args:
        dataset_entries: List of dataset entries (dicts with a ``messages`` list).
        current_index: Zero-based index of the entry to show; out-of-range
            values are clamped to the valid range.

    Returns:
        A 15-item tuple: display text, system-message value/update, 10
        textbox updates, 2 button visibility updates, and an empty status.
    """
    log_message(f"Attempting to display entry at index: {current_index}")
    log_message(f"Current dataset_entries size in display_entry: {len(dataset_entries) if dataset_entries is not None else 0}")

    max_textboxes = 10
    # Defaults used whenever there is nothing valid to show.
    hidden_textboxes = [gr.update(value="", visible=False) for _ in range(max_textboxes)]
    hide_buttons = gr.update(visible=False)

    if not dataset_entries:
        log_message("dataset_entries is empty, cannot display.")
        return "No entries to display yet.", "", *hidden_textboxes, hide_buttons, hide_buttons, ""

    total_entries = len(dataset_entries)
    # The index can be stale after operations like deletion; the dataset is
    # known to be non-empty here, so clamping always yields a valid index.
    if not (0 <= current_index < total_entries):
        log_message(f"Current index {current_index} out of bounds for dataset size {total_entries}. Adjusting.")
        current_index = max(0, min(current_index, total_entries - 1))
        log_message(f"Adjusted index: {current_index}")

    entry = dataset_entries[current_index]
    log_message(f"Displaying entry {current_index + 1} of {total_entries}. Entry content sample: {str(entry)[:100]}...")

    all_messages = entry.get('messages') if isinstance(entry, dict) else None
    if not isinstance(all_messages, list):
        # Invalid entry format or missing messages key.
        log_message(f"Warning: Invalid entry format or missing messages key at index {current_index}: {entry}")
        return f"Error displaying entry {current_index + 1}: Invalid format.", "", *hidden_textboxes, hide_buttons, hide_buttons, ""

    # Separate a leading system message from the other messages.
    system_message_content = ""
    messages_content = all_messages
    if not all_messages:
        log_message("Entry has empty messages list.")
    elif isinstance(all_messages[0], dict) and all_messages[0].get('role') == 'system':
        system_message_content = all_messages[0].get('content', '')
        messages_content = all_messages[1:]
        log_message("Found system message and user/assistant messages.")
    else:
        log_message("No system message found, displaying all as user/assistant.")

    # Normalize every message to a (role, content) pair. Keeping one pair per
    # message - even for malformed ones - keeps textbox i aligned with
    # message i and guarantees the padding below never comes up short.
    normalized = []
    for msg in messages_content:
        if isinstance(msg, dict):
            normalized.append((str(msg.get('role', 'unknown')), msg.get('content', '')))
        else:
            normalized.append(('unknown', str(msg)))

    display_text = f"Viewing Entry {current_index + 1} of {total_entries}\n\n"
    display_text += "".join(f"**{role.capitalize()}:** {content}\n\n" for role, content in normalized)

    # One visible textbox per message (at most max_textboxes); the rest hidden.
    textbox_updates = []
    for i in range(max_textboxes):
        if i < len(normalized):
            textbox_updates.append(gr.update(value=normalized[i][1], visible=True))
        else:
            textbox_updates.append(gr.update(value="", visible=False))

    log_message("Successfully prepared display text and textbox updates.")
    # Show the two buttons and clear the status.
    return display_text, gr.update(value=system_message_content, visible=True), *textbox_updates, gr.update(visible=True), gr.update(visible=True), ""
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
# Function to navigate to the previous entry
|
| 348 |
+
def prev_entry(current_index, dataset_entries):
    """Return the index one step before ``current_index``, never below 0."""
    log_message(f"Navigating to previous entry from index {current_index}")
    if not current_index > 0:
        # Nothing before the first entry.
        log_message("Already at the beginning (index 0). Staying at 0.")
        return 0
    target_index = current_index - 1
    log_message(f"New index: {target_index}")
    return target_index
|
| 357 |
+
|
| 358 |
+
# Function to navigate to the next entry
|
| 359 |
+
def next_entry(current_index, dataset_entries):
    """Return the index one step after ``current_index``, capped at the last entry.

    An empty dataset always yields index 0.
    """
    log_message(f"Navigating to next entry from index {current_index}")
    entry_count = len(dataset_entries)
    if entry_count == 0:
        log_message("Dataset is empty. Staying at index 0.")
        return 0

    last_index = entry_count - 1
    if current_index < last_index:
        target_index = current_index + 1
        log_message(f"New index: {target_index}")
        return target_index

    # At (or past) the final entry: pin to the last valid index.
    log_message("Already at the end. Staying at last index.")
    return last_index
|
| 371 |
+
|
| 372 |
+
# Function to go to a specific entry number
|
| 373 |
+
def go_to_entry(entry_number, dataset_entries):
    """Translate a 1-based entry number into a 0-based dataset index.

    Args:
        entry_number: Entry number supplied by the user (anything ``int()``
            accepts).
        dataset_entries: List of dataset entries.

    Returns:
        A tuple ``(index, status)``. On success ``status`` is empty; for
        invalid or out-of-range input the index falls back to ``0`` and
        ``status`` carries an error message.
    """
    log_message(f"Attempting to go to entry number: {entry_number}")
    total_entries = len(dataset_entries)
    default_index = 0  # Fallback for invalid input, whether or not the dataset is empty.

    try:
        index = int(entry_number) - 1
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinite floats, which int() cannot convert.
        log_message(f"Invalid input for entry number: {entry_number}")
        return default_index, f"Error: Invalid input '{entry_number}'. Please enter a valid integer number."

    if 0 <= index < total_entries:
        log_message(f"Valid index calculated: {index}")
        return index, ""

    log_message(f"Calculated index {index} is out of bounds (0 to {total_entries-1 if total_entries > 0 else 0}).")
    return default_index, f"Error: Entry number {entry_number} is out of bounds. Please enter a number between 1 and {total_entries if total_entries > 0 else 1}."
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
# Function to update messages in the current entry
|
| 398 |
+
def update_entry_messages(dataset_entries, current_index, edited_system_message, *edited_contents):
    """Replace the messages of the current entry with the edited textbox values.

    Textbox ``i`` corresponds to the ``i``-th message after a leading system
    message, which is the same order ``display_entry`` presents. A non-empty
    textbox keeps the role of the message it edits; an emptied textbox
    deletes that message; text in a textbox beyond the original messages
    becomes a new message with an alternating role. Original messages that
    have no textbox (more messages than textboxes) are kept unchanged.

    Args:
        dataset_entries: List of dataset entries; the entry is updated in place.
        current_index: Zero-based index of the entry being edited.
        edited_system_message: New system message text (empty removes it).
        *edited_contents: Edited message texts, one per textbox.

    Returns:
        A tuple ``(dataset_entries, status_message)``.
    """
    log_message(f"Attempting to update entry at index: {current_index}")
    if not dataset_entries or not (0 <= current_index < len(dataset_entries)):
        log_message("Cannot update entry: dataset_entries empty or index out of bounds.")
        return dataset_entries, "Error: Cannot update entry. Dataset is empty or index is out of bounds."

    entry = dataset_entries[current_index]
    if not isinstance(entry, dict):
        log_message(f"Cannot update entry: entry at index {current_index} is not a dictionary.")
        return dataset_entries, "Error: Cannot update entry. Invalid entry format."

    updated_messages = []

    # Handle the system message (None is treated as empty).
    system_text = (edited_system_message or "").strip()
    if system_text:
        updated_messages.append({"role": "system", "content": system_text})
        log_message("Updated system message added.")

    original_messages = entry.get('messages') or []
    has_leading_system = (
        bool(original_messages)
        and isinstance(original_messages[0], dict)
        and original_messages[0].get('role') == 'system'
    )
    # Positional split (not a role filter) so textbox i always maps to the
    # message that was displayed in it, whatever its role.
    editable_originals = original_messages[1:] if has_leading_system else original_messages
    original_count = len(editable_originals)

    for i, raw_content in enumerate(edited_contents):
        edited_content = (raw_content or "").strip()
        if edited_content:
            if i < original_count:
                # Editing an existing message: keep its original role.
                original = editable_originals[i]
                role = original.get('role', 'user') if isinstance(original, dict) else 'user'
                log_message(f"Updated original message {i+1} with role {role}.")
            else:
                # New message: alternate user/assistant. Only a preceding
                # 'user' message makes the next one 'assistant', so the first
                # message after a system message (or in an empty entry) is 'user'.
                last_role = updated_messages[-1]['role'] if updated_messages else None
                role = 'assistant' if last_role == 'user' else 'user'
                log_message(f"Added new message {i+1} with inferred role {role}.")
            updated_messages.append({"role": role, "content": edited_content})
        elif i < original_count:
            # An emptied textbox deletes the message it was showing.
            log_message(f"Original message {i+1} was cleared, effectively deleting it.")

    # Messages that had no textbox could not be edited; keep them instead of
    # silently dropping them on save.
    untouched = editable_originals[len(edited_contents):]
    if untouched:
        updated_messages.extend(untouched)
        log_message(f"Preserved {len(untouched)} message(s) beyond the editable textboxes.")

    if not updated_messages:
        # An empty result is only allowed when the entry originally held
        # nothing but a system message and that message was cleared.
        only_system_originally = (
            len(original_messages) == 1
            and isinstance(original_messages[0], dict)
            and original_messages[0].get('role') == 'system'
        )
        if not only_system_originally:
            log_message("Attempted to save an empty entry. Preventing save.")
            return dataset_entries, "Error: Cannot save an empty entry. Add system message or user/assistant turns."

    entry['messages'] = updated_messages
    log_message(f"Entry {current_index + 1} updated successfully. New message count: {len(updated_messages)}")
    return dataset_entries, f"Changes saved for Entry {current_index + 1}."
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
# Function to delete the current entry
|
| 468 |
+
def delete_entry(dataset_entries, current_index):
    """Remove the entry at ``current_index`` from the dataset.

    The list is modified in place and the same list object is returned.

    Args:
        dataset_entries: List of dataset entries to delete from.
        current_index: Zero-based position of the entry to remove.

    Returns:
        A ``(dataset_entries, index, status_message)`` tuple. When nothing
        can be deleted (empty dataset or index out of range) the inputs are
        returned unchanged together with an error message. Otherwise
        ``index`` is the position to show next: the same position if an
        entry still exists there, the last position if the tail entry was
        removed, or 0 if the dataset is now empty.
    """
    log_message(f"Attempting to delete entry at index: {current_index}")

    index_is_valid = bool(dataset_entries) and 0 <= current_index < len(dataset_entries)
    if not index_is_valid:
        # Nothing was removed, so the caller keeps its current index.
        log_message("Cannot delete entry: dataset_entries empty or index out of bounds.")
        return (
            dataset_entries,
            current_index,
            "Error: Cannot delete entry. Dataset is empty or index is out of bounds.",
        )

    # Remember the removed position for logging and the status message.
    removed_position = current_index
    log_message(f"Deleting entry at index {removed_position}.")
    del dataset_entries[current_index]

    # Pick the position to display now that the list is one element shorter.
    remaining = len(dataset_entries)
    if remaining == 0:
        new_index = 0
        log_message("Dataset is empty after deletion. Resetting index to 0.")
    elif removed_position >= remaining:
        # The last entry was removed; fall back to the new last entry.
        new_index = remaining - 1
        log_message(f"Adjusting index after deletion to last entry: {new_index}")
    else:
        # The following entry slid into the removed slot.
        new_index = removed_position
        log_message(f"Index remains {new_index} after deletion.")

    return dataset_entries, new_index, f"Entry {removed_position + 1} deleted."
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
# Define the Gradio Interface
|
| 498 |
+
with gr.Blocks() as demo:
    # NOTE(review): the nesting of layout containers below was reconstructed
    # from blank-line grouping in a whitespace-stripped source -- confirm the
    # Row/Column/Accordion/TabItem membership against the running app.

    # ---- State holders ----------------------------------------------------
    dataset_entries = gr.State([])           # all saved dataset entries
    current_messages = gr.State([])          # turns of the entry being created
    current_entry_index = gr.State(0)        # index shown in the view/edit tab
    current_loaded_filename = gr.State("")   # name of the last loaded file

    gr.Markdown("## LLM Dataset Creator")

    with gr.Tabs() as tabs:
        # ==================================================================
        # Tab 0: build a new entry turn by turn
        # ==================================================================
        with gr.TabItem("Create Entry", id=0):
            gr.Markdown("### Create a new entry")
            system_message_input = gr.Textbox(
                label="System Message",
                lines=5,
                placeholder="Instruksi peran yang sangat kuat (misalnya: Kamu adalah Yui Airi, teman yang santai...)",
            )

            gr.Markdown("### User and Assistant Messages")
            user_input = gr.Textbox(label="User Input", lines=3)
            assistant_response = gr.Textbox(label="Assistant Response", lines=3)

            with gr.Row():
                add_turn_btn = gr.Button("Add User/Assistant Turn")
                clear_turns_btn = gr.Button("Clear Turns")

            current_turns_output = gr.Markdown("Current Turns:")
            # Status box dedicated to this tab.
            create_status_output = gr.Textbox(label="Status", interactive=False)

            add_entry_btn = gr.Button("Add Entry to Dataset")

            gr.Markdown("### Dataset Entries")
            dataset_size_output = gr.Markdown("Number of entries: 0")

            # Add a turn, then re-render the list of pending turns.
            add_turn_btn.click(
                add_turn,
                inputs=[current_messages, user_input, assistant_response],
                outputs=[
                    current_messages,
                    user_input,
                    assistant_response,
                    create_status_output,
                ],
            ).then(
                lambda messages: (
                    "Current Turns:\n"
                    + "\n".join(
                        f"**{turn['role'].capitalize()}:** {turn['content']}"
                        for turn in messages
                    )
                ),
                inputs=[current_messages],
                outputs=[current_turns_output],
            )

            # Drop the pending turns, then reset their rendered view.
            clear_turns_btn.click(
                clear_turns,
                inputs=[],
                outputs=[current_messages, create_status_output],
            ).then(
                lambda: "Current Turns:",
                inputs=[],
                outputs=[current_turns_output],
            )

            # Commit the pending entry, then reset the rendered turns.
            add_entry_btn.click(
                add_entry_to_dataset,
                inputs=[dataset_entries, system_message_input, current_messages],
                outputs=[
                    dataset_entries,
                    system_message_input,
                    current_messages,
                    create_status_output,
                    dataset_size_output,
                ],
            ).then(
                lambda: "Current Turns:",
                inputs=[],
                outputs=[current_turns_output],
            )

        # ==================================================================
        # Tab 1: browse, edit and delete existing entries
        # ==================================================================
        with gr.TabItem("View/Edit Entries", id=1):
            gr.Markdown("### View Dataset Entries")
            entry_display = gr.Markdown("No entries to display yet.")

            # Navigation controls.
            with gr.Row():
                prev_btn = gr.Button("Previous")
                next_btn = gr.Button("Next")
                go_to_input = gr.Number(label="Go to Entry #", value=1, precision=0)

            # Editor for the system message (hidden on creation).
            edited_system_message_input = gr.Textbox(
                label="System Message", lines=5, visible=False
            )

            # Fixed pool of ten editors for user/assistant messages.
            edited_message_inputs = [
                gr.Textbox(label=f"Message {i+1}", lines=3, visible=False)
                for i in range(10)
            ]

            save_changes_btn = gr.Button("Save Changes", visible=False)
            delete_entry_btn = gr.Button("Delete Entry", visible=False)

            edit_status_output = gr.Textbox(label="Edit Status", interactive=False)

            # Every component that display_entry writes to: the rendered
            # entry, the system editor, the ten message editors, both action
            # buttons and the status box.
            entry_view_outputs = [
                entry_display,
                edited_system_message_input,
                *edited_message_inputs,
                save_changes_btn,
                delete_entry_btn,
                edit_status_output,
            ]

            def _then_display_entry(event):
                """Chain a display_entry refresh of the current entry onto ``event``."""
                return event.then(
                    display_entry,
                    inputs=[dataset_entries, current_entry_index],
                    outputs=entry_view_outputs,
                )

            # Previous / next navigation.
            _then_display_entry(
                prev_btn.click(
                    prev_entry,
                    inputs=[current_entry_index, dataset_entries],
                    outputs=[current_entry_index],
                )
            )

            _then_display_entry(
                next_btn.click(
                    next_entry,
                    inputs=[current_entry_index, dataset_entries],
                    outputs=[current_entry_index],
                )
            )

            # Jump to an entry when the number box is submitted ...
            _then_display_entry(
                go_to_input.submit(
                    go_to_entry,
                    inputs=[go_to_input, dataset_entries],
                    outputs=[current_entry_index, edit_status_output],
                )
            )

            # ... and also whenever its value changes.
            _then_display_entry(
                go_to_input.change(
                    go_to_entry,
                    inputs=[go_to_input, dataset_entries],
                    outputs=[current_entry_index, edit_status_output],
                )
            )

            # Save the edited texts, then re-display the entry.
            _then_display_entry(
                save_changes_btn.click(
                    update_entry_messages,
                    inputs=[
                        dataset_entries,
                        current_entry_index,
                        edited_system_message_input,
                        *edited_message_inputs,
                    ],
                    outputs=[dataset_entries, edit_status_output],
                )
            )

            # Delete the entry, show whichever entry is now current, then
            # refresh the entry counter.
            _then_display_entry(
                delete_entry_btn.click(
                    delete_entry,
                    inputs=[dataset_entries, current_entry_index],
                    outputs=[dataset_entries, current_entry_index, edit_status_output],
                )
            ).then(
                lambda entries: f"Number of entries: {len(entries)}",
                inputs=[dataset_entries],
                outputs=[dataset_size_output],
            )

        # ==================================================================
        # Tab 2: persist the dataset or load one
        # ==================================================================
        with gr.TabItem("Save/Load Dataset", id=2):
            gr.Markdown("### Save Dataset")
            filename_to_save = gr.Textbox(
                label="Enter filename to save",
                value="dataset.jsonl",
                key="filename_to_save",
            )
            with gr.Row():
                save_local_btn = gr.Button("Save to File")
                hf_save_btn = gr.Button("Save to Hugging Face Hub")

            save_output = gr.Textbox(label="Save Status", interactive=False)

            with gr.Accordion("Hugging Face Hub (Save)", open=False):
                hf_token_save = gr.Textbox(label="HF API Token", type="password")
                hf_repo_id_save = gr.Textbox(label="HF Repo Name", placeholder="user/repo")
                hf_file_path_save = gr.Textbox(label="File Path in Repo", value="dataset.jsonl")

            # Both save paths report into the same status box.
            save_local_btn.click(
                save_dataset,
                inputs=[dataset_entries, filename_to_save],
                outputs=[save_output],
            )

            hf_save_btn.click(
                save_to_hf,
                inputs=[
                    dataset_entries,
                    hf_token_save,
                    hf_repo_id_save,
                    hf_file_path_save,
                ],
                outputs=[save_output],
            )

            gr.Markdown("---")
            gr.Markdown("### Load Dataset")

            # Local source: either an uploaded file or a typed path.
            gr.Markdown("#### Load from Local File")
            uploaded_file = gr.File(label="Upload a JSONL file", file_types=[".jsonl"])
            local_file_path_input = gr.Textbox(
                label="Or load from local path",
                placeholder="/path/to/your/dataset.jsonl",
            )
            load_local_btn = gr.Button("Load Local File")

            # Hugging Face Hub source.
            gr.Markdown("#### Load from Hugging Face Hub")
            with gr.Column():
                hf_token_load = gr.Textbox(
                    label="HF API Token (optional for public repos)", type="password"
                )
                hf_repo_id_load = gr.Textbox(
                    label="HF Repository ID (e.g., your_username/your_repo)"
                )
                hf_file_path_load = gr.Textbox(
                    label="Path file JSONL in repository (e.g., dataset.jsonl)"
                )
                load_hf_btn = gr.Button("Muat dari Hugging Face Hub")

            load_output = gr.Textbox(label="Load Status", interactive=False)

            # Both loaders write the same four targets, then re-display the
            # current entry, then refresh the entry counter and copy the
            # loaded filename into the save-filename box.
            _then_display_entry(
                load_local_btn.click(
                    load_dataset_from_file,
                    inputs=[uploaded_file, local_file_path_input],
                    outputs=[
                        dataset_entries,
                        current_entry_index,
                        load_output,
                        current_loaded_filename,
                    ],
                )
            ).then(
                lambda entries, loaded_filename: (
                    f"Number of entries: {len(entries)}",
                    loaded_filename,
                ),
                inputs=[dataset_entries, current_loaded_filename],
                outputs=[dataset_size_output, filename_to_save],
            )

            _then_display_entry(
                load_hf_btn.click(
                    load_from_hf,
                    inputs=[hf_token_load, hf_repo_id_load, hf_file_path_load],
                    outputs=[
                        dataset_entries,
                        current_entry_index,
                        load_output,
                        current_loaded_filename,
                    ],
                )
            ).then(
                lambda entries, loaded_filename: (
                    f"Number of entries: {len(entries)}",
                    loaded_filename,
                ),
                inputs=[dataset_entries, current_loaded_filename],
                outputs=[dataset_size_output, filename_to_save],
            )

    # On page load, show the entry count and render entry 0.
    # The tuple concatenation requires display_entry to return a tuple.
    demo.load(
        fn=lambda entries: (
            (f"Number of entries: {len(entries)}",) + display_entry(entries, 0)
        ),
        inputs=[dataset_entries],
        outputs=[dataset_size_output, *entry_view_outputs],
    )

# share=True requests a public link, which is needed when running in Colab.
demo.launch(share=True)
|