kulia-moon committed on
Commit
3f3bd1f
·
verified ·
1 Parent(s): 2f2a5f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -49
app.py CHANGED
@@ -185,12 +185,13 @@ def push_file_to_huggingface_dataset(file_path, path_in_repo, commit_message_pre
185
  print(log_message)
186
  return log_message
187
 
188
- # --- Main Generation and Push Function ---
189
  def generate_and_display_conversations(num_conversations_input, custom_prompts_input, custom_system_prompt_input,
190
- commit_subject, commit_body, selected_model_name_input): # New: selected_model_name_input
191
  num_conversations = int(num_conversations_input)
192
  if num_conversations <= 0:
193
- return "Please enter a number of conversations greater than zero.", ""
 
194
 
195
  os.makedirs(OUTPUT_DIR, exist_ok=True)
196
 
@@ -202,49 +203,44 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
202
  try:
203
  existing_conversations.append(json.loads(line.strip()))
204
  except json.JSONDecodeError as e:
205
- print(f"Skipping malformed JSON line in {DATA_FILE}: {line.strip()} - {e}")
206
-
207
  # Deduplicate existing conversations
208
  seen_conversations = set()
209
  cleaned_existing_conversations = []
210
  for conv_entry in existing_conversations:
211
- # Use a string representation of the whole entry for deduplication
212
  conv_str = json.dumps(conv_entry, sort_keys=True)
213
  if conv_str not in seen_conversations:
214
  cleaned_existing_conversations.append(conv_entry)
215
  seen_conversations.add(conv_str)
216
 
217
  # Validate and filter existing conversations for completeness (expected length)
218
- expected_msg_len = lambda n_exchanges: 1 + 1 + n_exchanges + (n_exchanges - 1) # System + initial human + AI turns + human follow-ups
219
 
220
  validated_existing_conversations = []
221
  initial_cleaned_count = len(cleaned_existing_conversations)
222
  for conv_entry in cleaned_existing_conversations:
223
  conv_list = conv_entry.get("conversations", [])
224
- # Assume num_exchanges was 5 for old conversations if not stored
225
- # Or more robustly, infer from length.
226
- # Given the fixed num_exchanges=5 for generation, we can check for this.
227
- if len(conv_list) == expected_msg_len(5):
228
  validated_existing_conversations.append(conv_entry)
229
  else:
230
- print(f"Skipping incomplete/malformed existing conversation (length {len(conv_list)} != {expected_msg_len(5)}): {conv_entry}")
231
 
232
  all_conversations = list(validated_existing_conversations) # Start with clean existing ones
233
 
234
- generation_log = []
235
  current_time_loc = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " (An Nhơn, Binh Dinh, Vietnam)"
236
- generation_log.append(f"Starting conversation generation at {current_time_loc}")
237
- generation_log.append(f"Loaded and cleaned {len(validated_existing_conversations)} existing conversations (initially {initial_cleaned_count} before validation).")
238
- generation_log.append(f"Generating {num_conversations} *new* conversations.")
239
 
240
  model_names_to_use = list(AVAILABLE_MODELS.keys())
241
  if selected_model_name_input and selected_model_name_input in model_names_to_use:
242
- # If a specific model is selected, only use that one
243
- model_selection_info = f"Specific model selected: '{selected_model_name_input}'"
244
  else:
245
- # If no specific model or invalid model, pick a random one
246
- model_selection_info = f"No specific model selected or invalid, picking randomly from: {', '.join(model_names_to_use)}"
247
- generation_log.append(model_selection_info)
248
 
249
 
250
  current_prompts = DEFAULT_INITIAL_PROMPTS
@@ -256,7 +252,7 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
256
  new_conversations_generated = []
257
  expected_conversation_length = expected_msg_len(5) # Always 5 exchanges for new generations
258
 
259
- for i in tqdm(range(num_conversations), desc="Generating conversations"):
260
  seed = random.randint(0, 1000000)
261
 
262
  if custom_system_prompt_input:
@@ -268,49 +264,44 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
268
  prompt_template = random.choice(current_prompts)
269
  prompt = prompt_template.replace("[NAME]", random_name)
270
 
271
- # Determine the model to use for this specific conversation
272
- if selected_model_name_input and selected_model_name_input in model_names_to_use:
273
- selected_model_for_this_conv = selected_model_name_input
274
- else:
275
- selected_model_for_this_conv = random.choice(model_names_to_use)
276
 
277
- generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Generating conv {i+1}/{num_conversations} with '{selected_model_for_this_conv}' (System: '{system[:50]}...')")
278
 
279
  conversation = chat(system, prompt, selected_model_for_this_conv, seed=seed, num_exchanges=5)
280
 
281
  if len(conversation) == expected_conversation_length and not any(d.get("from") == "error" for d in conversation):
282
  new_conv_entry = {"model_used": selected_model_for_this_conv, "conversations": conversation}
283
- # Add to all_conversations and new_conversations_generated only if not a duplicate of what's already *in memory*
284
- # This handles duplicates from current batch or newly generated identical to existing
285
  new_conv_str = json.dumps(new_conv_entry, sort_keys=True)
286
  if new_conv_str not in seen_conversations:
287
  all_conversations.append(new_conv_entry)
288
  new_conversations_generated.append(new_conv_entry)
289
- seen_conversations.add(new_conv_str) # Mark as seen
290
- generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Successfully generated and added conv {i+1}/{num_conversations}.")
291
  else:
292
- generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipped conv {i+1}/{num_conversations} as it's a duplicate.")
293
  else:
294
- generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipping conv {i+1}/{num_conversations} due to error or incorrect length ({len(conversation)} messages, expected {expected_conversation_length}).")
295
  if conversation and conversation[-1].get("from") == "error":
296
- generation_log.append(f" Error details: {conversation[-1]['value']}")
297
 
298
  # Save all (cleaned existing + newly generated unique) conversations to JSONL
299
  with open(DATA_FILE, "w") as f:
300
  for conv in all_conversations:
301
  f.write(json.dumps(conv) + "\n")
302
 
303
- generation_log.append(f"Saved {len(new_conversations_generated)} *new unique* conversations to {DATA_FILE} (total unique and validated: {len(all_conversations)}).")
304
- generation_log.append("Attempting to push main conversations file to Hugging Face Dataset...")
305
 
306
  # --- Auto-push main conversations to Hugging Face Dataset ---
307
  # Use the custom commit message
308
  commit_message = f"{commit_subject.strip()}\n\n{commit_body.strip()}" if commit_body.strip() else commit_subject.strip()
309
  push_status = push_file_to_huggingface_dataset(DATA_FILE, "conversations.jsonl", commit_message)
310
- generation_log.append(push_status)
311
- generation_log.append(f"Process complete at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (An Nhơn, Binh Dinh, Vietnam)")
312
-
313
- return json.dumps(all_conversations, indent=2), "\n".join(generation_log)
 
314
 
315
  # --- Community Prompts Functions ---
316
  def load_community_prompts():
@@ -453,16 +444,12 @@ with gr.Blocks() as demo:
453
  num_conversations_input = gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Number of Conversations to Generate", info="More conversations take longer and might hit API limits.")
454
 
455
  gr.Markdown("### Model Selection")
456
- model_choices_with_descriptions = [
457
- f"{name} ({info['description']}, Speed: {info['speed']})"
458
- for name, info in AVAILABLE_MODELS.items()
459
- ]
460
  model_selector_dropdown = gr.Dropdown(
461
  label="Select Model (or leave empty for random)",
462
  choices=list(AVAILABLE_MODELS.keys()), # The actual values passed will be model names
463
  value=None, # Default to no selection, implying random
464
  interactive=True,
465
- info="Choose a specific model or let the app pick one randomly for each conversation."
466
  )
467
  # Add a Textbox for model description based on selection
468
  model_description_output = gr.Textbox(
@@ -474,7 +461,7 @@ with gr.Blocks() as demo:
474
  if model_name and model_name in AVAILABLE_MODELS:
475
  info = AVAILABLE_MODELS[model_name]
476
  return f"Description: {info['description']}\nSpeed: {info['speed']}"
477
- return "No model selected, or model not found. A random model will be chosen per conversation."
478
 
479
  model_selector_dropdown.change(
480
  fn=get_model_info,
@@ -522,7 +509,8 @@ with gr.Blocks() as demo:
522
  generate_button = gr.Button("Generate & Push Conversations")
523
 
524
  output_conversations = gr.JSON(label="Generated Conversations (Content of conversations.jsonl)")
525
- output_log = gr.Textbox(label="Process Log", interactive=False, lines=10, max_lines=20)
 
526
 
527
  # Link commit template dropdown to update fields
528
  commit_template_dropdown.change(
@@ -548,7 +536,7 @@ with gr.Blocks() as demo:
548
  model_selector_dropdown # Pass selected model name
549
  ],
550
  outputs=[output_conversations, output_log],
551
- show_progress=True
552
  )
553
 
554
  with gr.Tab("Community Prompts"):
 
185
  print(log_message)
186
  return log_message
187
 
188
+ # --- Main Generation and Push Function (Now a generator for streaming) ---
189
  def generate_and_display_conversations(num_conversations_input, custom_prompts_input, custom_system_prompt_input,
190
+ commit_subject, commit_body, selected_model_name_input):
191
  num_conversations = int(num_conversations_input)
192
  if num_conversations <= 0:
193
+ yield gr.JSON.update(value={}), "Please enter a number of conversations greater than zero.\n"
194
+ return
195
 
196
  os.makedirs(OUTPUT_DIR, exist_ok=True)
197
 
 
203
  try:
204
  existing_conversations.append(json.loads(line.strip()))
205
  except json.JSONDecodeError as e:
206
+ yield gr.JSON.update(value={}), f"Skipping malformed JSON line in {DATA_FILE}: {line.strip()} - {e}\n"
207
+
208
  # Deduplicate existing conversations
209
  seen_conversations = set()
210
  cleaned_existing_conversations = []
211
  for conv_entry in existing_conversations:
 
212
  conv_str = json.dumps(conv_entry, sort_keys=True)
213
  if conv_str not in seen_conversations:
214
  cleaned_existing_conversations.append(conv_entry)
215
  seen_conversations.add(conv_str)
216
 
217
  # Validate and filter existing conversations for completeness (expected length)
218
+ expected_msg_len = lambda n_exchanges: 1 + 1 + n_exchanges * 2 # System + initial human + (AI_turn + human_followup) * n_exchanges
219
 
220
  validated_existing_conversations = []
221
  initial_cleaned_count = len(cleaned_existing_conversations)
222
  for conv_entry in cleaned_existing_conversations:
223
  conv_list = conv_entry.get("conversations", [])
224
+ if len(conv_list) == expected_msg_len(5): # Fixed to 5 exchanges for generation
 
 
 
225
  validated_existing_conversations.append(conv_entry)
226
  else:
227
+ yield gr.JSON.update(value={}), f"Skipping incomplete/malformed existing conversation (length {len(conv_list)} != {expected_msg_len(5)}): {conv_entry}\n"
228
 
229
  all_conversations = list(validated_existing_conversations) # Start with clean existing ones
230
 
 
231
  current_time_loc = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " (An Nhơn, Binh Dinh, Vietnam)"
232
+ yield gr.JSON.update(value={}), f"Starting conversation generation at {current_time_loc}\n"
233
+ yield gr.JSON.update(value={}), f"Loaded and cleaned {len(validated_existing_conversations)} existing conversations (initially {initial_cleaned_count} before validation).\n"
234
+ yield gr.JSON.update(value={}), f"Generating {num_conversations} *new* conversations.\n"
235
 
236
  model_names_to_use = list(AVAILABLE_MODELS.keys())
237
  if selected_model_name_input and selected_model_name_input in model_names_to_use:
238
+ model_selection_info = f"Specific model selected for all new conversations: '{selected_model_name_input}'\n"
239
+ selected_model_for_this_conv_batch = selected_model_name_input
240
  else:
241
+ model_selection_info = f"No specific model selected or invalid. Models will be chosen randomly per conversation from: {', '.join(model_names_to_use)}\n"
242
+ selected_model_for_this_conv_batch = None # Indicate random selection per loop
243
+ yield gr.JSON.update(value={}), model_selection_info
244
 
245
 
246
  current_prompts = DEFAULT_INITIAL_PROMPTS
 
252
  new_conversations_generated = []
253
  expected_conversation_length = expected_msg_len(5) # Always 5 exchanges for new generations
254
 
255
+ for i in range(num_conversations): # tqdm not directly compatible with yielding, so manually handle progress
256
  seed = random.randint(0, 1000000)
257
 
258
  if custom_system_prompt_input:
 
264
  prompt_template = random.choice(current_prompts)
265
  prompt = prompt_template.replace("[NAME]", random_name)
266
 
267
+ selected_model_for_this_conv = selected_model_for_this_conv_batch if selected_model_for_this_conv_batch else random.choice(model_names_to_use)
 
 
 
 
268
 
269
+ yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Generating conv {i+1}/{num_conversations} with '{selected_model_for_this_conv}' (System: '{system[:50]}...')\n"
270
 
271
  conversation = chat(system, prompt, selected_model_for_this_conv, seed=seed, num_exchanges=5)
272
 
273
  if len(conversation) == expected_conversation_length and not any(d.get("from") == "error" for d in conversation):
274
  new_conv_entry = {"model_used": selected_model_for_this_conv, "conversations": conversation}
 
 
275
  new_conv_str = json.dumps(new_conv_entry, sort_keys=True)
276
  if new_conv_str not in seen_conversations:
277
  all_conversations.append(new_conv_entry)
278
  new_conversations_generated.append(new_conv_entry)
279
+ seen_conversations.add(new_conv_str)
280
+ yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Successfully generated and added conv {i+1}/{num_conversations}.\n"
281
  else:
282
+ yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipped conv {i+1}/{num_conversations} as it's a duplicate.\n"
283
  else:
284
+ yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipping conv {i+1}/{num_conversations} due to error or incorrect length ({len(conversation)} messages, expected {expected_conversation_length}).\n"
285
  if conversation and conversation[-1].get("from") == "error":
286
+ yield gr.JSON.update(value={}), f" Error details: {conversation[-1]['value']}\n"
287
 
288
  # Save all (cleaned existing + newly generated unique) conversations to JSONL
289
  with open(DATA_FILE, "w") as f:
290
  for conv in all_conversations:
291
  f.write(json.dumps(conv) + "\n")
292
 
293
+ yield gr.JSON.update(value={}), f"Saved {len(new_conversations_generated)} *new unique* conversations to {DATA_FILE} (total unique and validated: {len(all_conversations)}).\n"
294
+ yield gr.JSON.update(value={}), "Attempting to push main conversations file to Hugging Face Dataset...\n"
295
 
296
  # --- Auto-push main conversations to Hugging Face Dataset ---
297
  # Use the custom commit message
298
  commit_message = f"{commit_subject.strip()}\n\n{commit_body.strip()}" if commit_body.strip() else commit_subject.strip()
299
  push_status = push_file_to_huggingface_dataset(DATA_FILE, "conversations.jsonl", commit_message)
300
+ yield gr.JSON.update(value={}), push_status + "\n"
301
+ final_log_message = f"Process complete at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (An Nhơn, Binh Dinh, Vietnam)\n"
302
+
303
+ # Final return for both outputs
304
+ yield json.dumps(all_conversations, indent=2), final_log_message
305
 
306
  # --- Community Prompts Functions ---
307
  def load_community_prompts():
 
444
  num_conversations_input = gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Number of Conversations to Generate", info="More conversations take longer and might hit API limits.")
445
 
446
  gr.Markdown("### Model Selection")
 
 
 
 
447
  model_selector_dropdown = gr.Dropdown(
448
  label="Select Model (or leave empty for random)",
449
  choices=list(AVAILABLE_MODELS.keys()), # The actual values passed will be model names
450
  value=None, # Default to no selection, implying random
451
  interactive=True,
452
+ info="Choose a specific model or let the app pick one randomly for each conversation in the batch."
453
  )
454
  # Add a Textbox for model description based on selection
455
  model_description_output = gr.Textbox(
 
461
  if model_name and model_name in AVAILABLE_MODELS:
462
  info = AVAILABLE_MODELS[model_name]
463
  return f"Description: {info['description']}\nSpeed: {info['speed']}"
464
+ return "No specific model selected. Conversations will use randomly chosen models from the available list."
465
 
466
  model_selector_dropdown.change(
467
  fn=get_model_info,
 
509
  generate_button = gr.Button("Generate & Push Conversations")
510
 
511
  output_conversations = gr.JSON(label="Generated Conversations (Content of conversations.jsonl)")
512
+ # Changed output_log to stream
513
+ output_log = gr.Textbox(label="Process Log", interactive=False, lines=10, max_lines=20, streaming=True)
514
 
515
  # Link commit template dropdown to update fields
516
  commit_template_dropdown.change(
 
536
  model_selector_dropdown # Pass selected model name
537
  ],
538
  outputs=[output_conversations, output_log],
539
+ # show_progress=True is handled internally by yielding
540
  )
541
 
542
  with gr.Tab("Community Prompts"):