Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -185,12 +185,13 @@ def push_file_to_huggingface_dataset(file_path, path_in_repo, commit_message_pre
|
|
| 185 |
print(log_message)
|
| 186 |
return log_message
|
| 187 |
|
| 188 |
-
# --- Main Generation and Push Function ---
|
| 189 |
def generate_and_display_conversations(num_conversations_input, custom_prompts_input, custom_system_prompt_input,
|
| 190 |
-
commit_subject, commit_body, selected_model_name_input):
|
| 191 |
num_conversations = int(num_conversations_input)
|
| 192 |
if num_conversations <= 0:
|
| 193 |
-
|
|
|
|
| 194 |
|
| 195 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 196 |
|
|
@@ -202,49 +203,44 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
|
|
| 202 |
try:
|
| 203 |
existing_conversations.append(json.loads(line.strip()))
|
| 204 |
except json.JSONDecodeError as e:
|
| 205 |
-
|
| 206 |
-
|
| 207 |
# Deduplicate existing conversations
|
| 208 |
seen_conversations = set()
|
| 209 |
cleaned_existing_conversations = []
|
| 210 |
for conv_entry in existing_conversations:
|
| 211 |
-
# Use a string representation of the whole entry for deduplication
|
| 212 |
conv_str = json.dumps(conv_entry, sort_keys=True)
|
| 213 |
if conv_str not in seen_conversations:
|
| 214 |
cleaned_existing_conversations.append(conv_entry)
|
| 215 |
seen_conversations.add(conv_str)
|
| 216 |
|
| 217 |
# Validate and filter existing conversations for completeness (expected length)
|
| 218 |
-
expected_msg_len = lambda n_exchanges: 1 + 1 + n_exchanges
|
| 219 |
|
| 220 |
validated_existing_conversations = []
|
| 221 |
initial_cleaned_count = len(cleaned_existing_conversations)
|
| 222 |
for conv_entry in cleaned_existing_conversations:
|
| 223 |
conv_list = conv_entry.get("conversations", [])
|
| 224 |
-
|
| 225 |
-
# Or more robustly, infer from length.
|
| 226 |
-
# Given the fixed num_exchanges=5 for generation, we can check for this.
|
| 227 |
-
if len(conv_list) == expected_msg_len(5):
|
| 228 |
validated_existing_conversations.append(conv_entry)
|
| 229 |
else:
|
| 230 |
-
|
| 231 |
|
| 232 |
all_conversations = list(validated_existing_conversations) # Start with clean existing ones
|
| 233 |
|
| 234 |
-
generation_log = []
|
| 235 |
current_time_loc = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " (An Nhơn, Binh Dinh, Vietnam)"
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
|
| 240 |
model_names_to_use = list(AVAILABLE_MODELS.keys())
|
| 241 |
if selected_model_name_input and selected_model_name_input in model_names_to_use:
|
| 242 |
-
|
| 243 |
-
|
| 244 |
else:
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
|
| 249 |
|
| 250 |
current_prompts = DEFAULT_INITIAL_PROMPTS
|
|
@@ -256,7 +252,7 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
|
|
| 256 |
new_conversations_generated = []
|
| 257 |
expected_conversation_length = expected_msg_len(5) # Always 5 exchanges for new generations
|
| 258 |
|
| 259 |
-
for i in
|
| 260 |
seed = random.randint(0, 1000000)
|
| 261 |
|
| 262 |
if custom_system_prompt_input:
|
|
@@ -268,49 +264,44 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
|
|
| 268 |
prompt_template = random.choice(current_prompts)
|
| 269 |
prompt = prompt_template.replace("[NAME]", random_name)
|
| 270 |
|
| 271 |
-
|
| 272 |
-
if selected_model_name_input and selected_model_name_input in model_names_to_use:
|
| 273 |
-
selected_model_for_this_conv = selected_model_name_input
|
| 274 |
-
else:
|
| 275 |
-
selected_model_for_this_conv = random.choice(model_names_to_use)
|
| 276 |
|
| 277 |
-
|
| 278 |
|
| 279 |
conversation = chat(system, prompt, selected_model_for_this_conv, seed=seed, num_exchanges=5)
|
| 280 |
|
| 281 |
if len(conversation) == expected_conversation_length and not any(d.get("from") == "error" for d in conversation):
|
| 282 |
new_conv_entry = {"model_used": selected_model_for_this_conv, "conversations": conversation}
|
| 283 |
-
# Add to all_conversations and new_conversations_generated only if not a duplicate of what's already *in memory*
|
| 284 |
-
# This handles duplicates from current batch or newly generated identical to existing
|
| 285 |
new_conv_str = json.dumps(new_conv_entry, sort_keys=True)
|
| 286 |
if new_conv_str not in seen_conversations:
|
| 287 |
all_conversations.append(new_conv_entry)
|
| 288 |
new_conversations_generated.append(new_conv_entry)
|
| 289 |
-
seen_conversations.add(new_conv_str)
|
| 290 |
-
|
| 291 |
else:
|
| 292 |
-
|
| 293 |
else:
|
| 294 |
-
|
| 295 |
if conversation and conversation[-1].get("from") == "error":
|
| 296 |
-
|
| 297 |
|
| 298 |
# Save all (cleaned existing + newly generated unique) conversations to JSONL
|
| 299 |
with open(DATA_FILE, "w") as f:
|
| 300 |
for conv in all_conversations:
|
| 301 |
f.write(json.dumps(conv) + "\n")
|
| 302 |
|
| 303 |
-
|
| 304 |
-
|
| 305 |
|
| 306 |
# --- Auto-push main conversations to Hugging Face Dataset ---
|
| 307 |
# Use the custom commit message
|
| 308 |
commit_message = f"{commit_subject.strip()}\n\n{commit_body.strip()}" if commit_body.strip() else commit_subject.strip()
|
| 309 |
push_status = push_file_to_huggingface_dataset(DATA_FILE, "conversations.jsonl", commit_message)
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
return
|
|
|
|
| 314 |
|
| 315 |
# --- Community Prompts Functions ---
|
| 316 |
def load_community_prompts():
|
|
@@ -453,16 +444,12 @@ with gr.Blocks() as demo:
|
|
| 453 |
num_conversations_input = gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Number of Conversations to Generate", info="More conversations take longer and might hit API limits.")
|
| 454 |
|
| 455 |
gr.Markdown("### Model Selection")
|
| 456 |
-
model_choices_with_descriptions = [
|
| 457 |
-
f"{name} ({info['description']}, Speed: {info['speed']})"
|
| 458 |
-
for name, info in AVAILABLE_MODELS.items()
|
| 459 |
-
]
|
| 460 |
model_selector_dropdown = gr.Dropdown(
|
| 461 |
label="Select Model (or leave empty for random)",
|
| 462 |
choices=list(AVAILABLE_MODELS.keys()), # The actual values passed will be model names
|
| 463 |
value=None, # Default to no selection, implying random
|
| 464 |
interactive=True,
|
| 465 |
-
info="Choose a specific model or let the app pick one randomly for each conversation."
|
| 466 |
)
|
| 467 |
# Add a Textbox for model description based on selection
|
| 468 |
model_description_output = gr.Textbox(
|
|
@@ -474,7 +461,7 @@ with gr.Blocks() as demo:
|
|
| 474 |
if model_name and model_name in AVAILABLE_MODELS:
|
| 475 |
info = AVAILABLE_MODELS[model_name]
|
| 476 |
return f"Description: {info['description']}\nSpeed: {info['speed']}"
|
| 477 |
-
return "No model selected
|
| 478 |
|
| 479 |
model_selector_dropdown.change(
|
| 480 |
fn=get_model_info,
|
|
@@ -522,7 +509,8 @@ with gr.Blocks() as demo:
|
|
| 522 |
generate_button = gr.Button("Generate & Push Conversations")
|
| 523 |
|
| 524 |
output_conversations = gr.JSON(label="Generated Conversations (Content of conversations.jsonl)")
|
| 525 |
-
|
|
|
|
| 526 |
|
| 527 |
# Link commit template dropdown to update fields
|
| 528 |
commit_template_dropdown.change(
|
|
@@ -548,7 +536,7 @@ with gr.Blocks() as demo:
|
|
| 548 |
model_selector_dropdown # Pass selected model name
|
| 549 |
],
|
| 550 |
outputs=[output_conversations, output_log],
|
| 551 |
-
show_progress=True
|
| 552 |
)
|
| 553 |
|
| 554 |
with gr.Tab("Community Prompts"):
|
|
|
|
| 185 |
print(log_message)
|
| 186 |
return log_message
|
| 187 |
|
| 188 |
+
# --- Main Generation and Push Function (Now a generator for streaming) ---
|
| 189 |
def generate_and_display_conversations(num_conversations_input, custom_prompts_input, custom_system_prompt_input,
|
| 190 |
+
commit_subject, commit_body, selected_model_name_input):
|
| 191 |
num_conversations = int(num_conversations_input)
|
| 192 |
if num_conversations <= 0:
|
| 193 |
+
yield gr.JSON.update(value={}), "Please enter a number of conversations greater than zero.\n"
|
| 194 |
+
return
|
| 195 |
|
| 196 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 197 |
|
|
|
|
| 203 |
try:
|
| 204 |
existing_conversations.append(json.loads(line.strip()))
|
| 205 |
except json.JSONDecodeError as e:
|
| 206 |
+
yield gr.JSON.update(value={}), f"Skipping malformed JSON line in {DATA_FILE}: {line.strip()} - {e}\n"
|
| 207 |
+
|
| 208 |
# Deduplicate existing conversations
|
| 209 |
seen_conversations = set()
|
| 210 |
cleaned_existing_conversations = []
|
| 211 |
for conv_entry in existing_conversations:
|
|
|
|
| 212 |
conv_str = json.dumps(conv_entry, sort_keys=True)
|
| 213 |
if conv_str not in seen_conversations:
|
| 214 |
cleaned_existing_conversations.append(conv_entry)
|
| 215 |
seen_conversations.add(conv_str)
|
| 216 |
|
| 217 |
# Validate and filter existing conversations for completeness (expected length)
|
| 218 |
+
expected_msg_len = lambda n_exchanges: 1 + 1 + n_exchanges * 2 # System + initial human + (AI_turn + human_followup) * n_exchanges
|
| 219 |
|
| 220 |
validated_existing_conversations = []
|
| 221 |
initial_cleaned_count = len(cleaned_existing_conversations)
|
| 222 |
for conv_entry in cleaned_existing_conversations:
|
| 223 |
conv_list = conv_entry.get("conversations", [])
|
| 224 |
+
if len(conv_list) == expected_msg_len(5): # Fixed to 5 exchanges for generation
|
|
|
|
|
|
|
|
|
|
| 225 |
validated_existing_conversations.append(conv_entry)
|
| 226 |
else:
|
| 227 |
+
yield gr.JSON.update(value={}), f"Skipping incomplete/malformed existing conversation (length {len(conv_list)} != {expected_msg_len(5)}): {conv_entry}\n"
|
| 228 |
|
| 229 |
all_conversations = list(validated_existing_conversations) # Start with clean existing ones
|
| 230 |
|
|
|
|
| 231 |
current_time_loc = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " (An Nhơn, Binh Dinh, Vietnam)"
|
| 232 |
+
yield gr.JSON.update(value={}), f"Starting conversation generation at {current_time_loc}\n"
|
| 233 |
+
yield gr.JSON.update(value={}), f"Loaded and cleaned {len(validated_existing_conversations)} existing conversations (initially {initial_cleaned_count} before validation).\n"
|
| 234 |
+
yield gr.JSON.update(value={}), f"Generating {num_conversations} *new* conversations.\n"
|
| 235 |
|
| 236 |
model_names_to_use = list(AVAILABLE_MODELS.keys())
|
| 237 |
if selected_model_name_input and selected_model_name_input in model_names_to_use:
|
| 238 |
+
model_selection_info = f"Specific model selected for all new conversations: '{selected_model_name_input}'\n"
|
| 239 |
+
selected_model_for_this_conv_batch = selected_model_name_input
|
| 240 |
else:
|
| 241 |
+
model_selection_info = f"No specific model selected or invalid. Models will be chosen randomly per conversation from: {', '.join(model_names_to_use)}\n"
|
| 242 |
+
selected_model_for_this_conv_batch = None # Indicate random selection per loop
|
| 243 |
+
yield gr.JSON.update(value={}), model_selection_info
|
| 244 |
|
| 245 |
|
| 246 |
current_prompts = DEFAULT_INITIAL_PROMPTS
|
|
|
|
| 252 |
new_conversations_generated = []
|
| 253 |
expected_conversation_length = expected_msg_len(5) # Always 5 exchanges for new generations
|
| 254 |
|
| 255 |
+
for i in range(num_conversations): # tqdm not directly compatible with yielding, so manually handle progress
|
| 256 |
seed = random.randint(0, 1000000)
|
| 257 |
|
| 258 |
if custom_system_prompt_input:
|
|
|
|
| 264 |
prompt_template = random.choice(current_prompts)
|
| 265 |
prompt = prompt_template.replace("[NAME]", random_name)
|
| 266 |
|
| 267 |
+
selected_model_for_this_conv = selected_model_for_this_conv_batch if selected_model_for_this_conv_batch else random.choice(model_names_to_use)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
+
yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Generating conv {i+1}/{num_conversations} with '{selected_model_for_this_conv}' (System: '{system[:50]}...')\n"
|
| 270 |
|
| 271 |
conversation = chat(system, prompt, selected_model_for_this_conv, seed=seed, num_exchanges=5)
|
| 272 |
|
| 273 |
if len(conversation) == expected_conversation_length and not any(d.get("from") == "error" for d in conversation):
|
| 274 |
new_conv_entry = {"model_used": selected_model_for_this_conv, "conversations": conversation}
|
|
|
|
|
|
|
| 275 |
new_conv_str = json.dumps(new_conv_entry, sort_keys=True)
|
| 276 |
if new_conv_str not in seen_conversations:
|
| 277 |
all_conversations.append(new_conv_entry)
|
| 278 |
new_conversations_generated.append(new_conv_entry)
|
| 279 |
+
seen_conversations.add(new_conv_str)
|
| 280 |
+
yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Successfully generated and added conv {i+1}/{num_conversations}.\n"
|
| 281 |
else:
|
| 282 |
+
yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipped conv {i+1}/{num_conversations} as it's a duplicate.\n"
|
| 283 |
else:
|
| 284 |
+
yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipping conv {i+1}/{num_conversations} due to error or incorrect length ({len(conversation)} messages, expected {expected_conversation_length}).\n"
|
| 285 |
if conversation and conversation[-1].get("from") == "error":
|
| 286 |
+
yield gr.JSON.update(value={}), f" Error details: {conversation[-1]['value']}\n"
|
| 287 |
|
| 288 |
# Save all (cleaned existing + newly generated unique) conversations to JSONL
|
| 289 |
with open(DATA_FILE, "w") as f:
|
| 290 |
for conv in all_conversations:
|
| 291 |
f.write(json.dumps(conv) + "\n")
|
| 292 |
|
| 293 |
+
yield gr.JSON.update(value={}), f"Saved {len(new_conversations_generated)} *new unique* conversations to {DATA_FILE} (total unique and validated: {len(all_conversations)}).\n"
|
| 294 |
+
yield gr.JSON.update(value={}), "Attempting to push main conversations file to Hugging Face Dataset...\n"
|
| 295 |
|
| 296 |
# --- Auto-push main conversations to Hugging Face Dataset ---
|
| 297 |
# Use the custom commit message
|
| 298 |
commit_message = f"{commit_subject.strip()}\n\n{commit_body.strip()}" if commit_body.strip() else commit_subject.strip()
|
| 299 |
push_status = push_file_to_huggingface_dataset(DATA_FILE, "conversations.jsonl", commit_message)
|
| 300 |
+
yield gr.JSON.update(value={}), push_status + "\n"
|
| 301 |
+
final_log_message = f"Process complete at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (An Nhơn, Binh Dinh, Vietnam)\n"
|
| 302 |
+
|
| 303 |
+
# Final return for both outputs
|
| 304 |
+
yield json.dumps(all_conversations, indent=2), final_log_message
|
| 305 |
|
| 306 |
# --- Community Prompts Functions ---
|
| 307 |
def load_community_prompts():
|
|
|
|
| 444 |
num_conversations_input = gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Number of Conversations to Generate", info="More conversations take longer and might hit API limits.")
|
| 445 |
|
| 446 |
gr.Markdown("### Model Selection")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
model_selector_dropdown = gr.Dropdown(
|
| 448 |
label="Select Model (or leave empty for random)",
|
| 449 |
choices=list(AVAILABLE_MODELS.keys()), # The actual values passed will be model names
|
| 450 |
value=None, # Default to no selection, implying random
|
| 451 |
interactive=True,
|
| 452 |
+
info="Choose a specific model or let the app pick one randomly for each conversation in the batch."
|
| 453 |
)
|
| 454 |
# Add a Textbox for model description based on selection
|
| 455 |
model_description_output = gr.Textbox(
|
|
|
|
| 461 |
if model_name and model_name in AVAILABLE_MODELS:
|
| 462 |
info = AVAILABLE_MODELS[model_name]
|
| 463 |
return f"Description: {info['description']}\nSpeed: {info['speed']}"
|
| 464 |
+
return "No specific model selected. Conversations will use randomly chosen models from the available list."
|
| 465 |
|
| 466 |
model_selector_dropdown.change(
|
| 467 |
fn=get_model_info,
|
|
|
|
| 509 |
generate_button = gr.Button("Generate & Push Conversations")
|
| 510 |
|
| 511 |
output_conversations = gr.JSON(label="Generated Conversations (Content of conversations.jsonl)")
|
| 512 |
+
# Changed output_log to stream
|
| 513 |
+
output_log = gr.Textbox(label="Process Log", interactive=False, lines=10, max_lines=20, streaming=True)
|
| 514 |
|
| 515 |
# Link commit template dropdown to update fields
|
| 516 |
commit_template_dropdown.change(
|
|
|
|
| 536 |
model_selector_dropdown # Pass selected model name
|
| 537 |
],
|
| 538 |
outputs=[output_conversations, output_log],
|
| 539 |
+
# show_progress=True is handled internally by yielding
|
| 540 |
)
|
| 541 |
|
| 542 |
with gr.Tab("Community Prompts"):
|