DeepRethink

Sleeping

App Files Community

kulia-moon committed on Jun 22

Commit

3f3bd1f

verified ·

1 Parent(s): 2f2a5f4

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -49

app.py CHANGED Viewed

@@ -185,12 +185,13 @@ def push_file_to_huggingface_dataset(file_path, path_in_repo, commit_message_pre
         print(log_message)
         return log_message
-# --- Main Generation and Push Function ---
 def generate_and_display_conversations(num_conversations_input, custom_prompts_input, custom_system_prompt_input,
-                                     commit_subject, commit_body, selected_model_name_input): # New: selected_model_name_input
     num_conversations = int(num_conversations_input)
     if num_conversations <= 0:
-        return "Please enter a number of conversations greater than zero.", ""
     os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -202,49 +203,44 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
                 try:
                     existing_conversations.append(json.loads(line.strip()))
                 except json.JSONDecodeError as e:
-                    print(f"Skipping malformed JSON line in {DATA_FILE}: {line.strip()} - {e}")
     # Deduplicate existing conversations
     seen_conversations = set()
     cleaned_existing_conversations = []
     for conv_entry in existing_conversations:
-        # Use a string representation of the whole entry for deduplication
         conv_str = json.dumps(conv_entry, sort_keys=True)
         if conv_str not in seen_conversations:
             cleaned_existing_conversations.append(conv_entry)
             seen_conversations.add(conv_str)
     # Validate and filter existing conversations for completeness (expected length)
-    expected_msg_len = lambda n_exchanges: 1 + 1 + n_exchanges + (n_exchanges - 1) # System + initial human + AI turns + human follow-ups
     validated_existing_conversations = []
     initial_cleaned_count = len(cleaned_existing_conversations)
     for conv_entry in cleaned_existing_conversations:
         conv_list = conv_entry.get("conversations", [])
-        # Assume num_exchanges was 5 for old conversations if not stored
-        # Or more robustly, infer from length.
-        # Given the fixed num_exchanges=5 for generation, we can check for this.
-        if len(conv_list) == expected_msg_len(5):
             validated_existing_conversations.append(conv_entry)
         else:
-            print(f"Skipping incomplete/malformed existing conversation (length {len(conv_list)} != {expected_msg_len(5)}): {conv_entry}")
     all_conversations = list(validated_existing_conversations) # Start with clean existing ones
-    generation_log = []
     current_time_loc = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " (An Nhơn, Binh Dinh, Vietnam)"
-    generation_log.append(f"Starting conversation generation at {current_time_loc}")
-    generation_log.append(f"Loaded and cleaned {len(validated_existing_conversations)} existing conversations (initially {initial_cleaned_count} before validation).")
-    generation_log.append(f"Generating {num_conversations} *new* conversations.")
     model_names_to_use = list(AVAILABLE_MODELS.keys())
     if selected_model_name_input and selected_model_name_input in model_names_to_use:
-        # If a specific model is selected, only use that one
-        model_selection_info = f"Specific model selected: '{selected_model_name_input}'"
     else:
-        # If no specific model or invalid model, pick a random one
-        model_selection_info = f"No specific model selected or invalid, picking randomly from: {', '.join(model_names_to_use)}"
-    generation_log.append(model_selection_info)
     current_prompts = DEFAULT_INITIAL_PROMPTS
@@ -256,7 +252,7 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
     new_conversations_generated = []
     expected_conversation_length = expected_msg_len(5) # Always 5 exchanges for new generations
-    for i in tqdm(range(num_conversations), desc="Generating conversations"):
         seed = random.randint(0, 1000000)
         if custom_system_prompt_input:
@@ -268,49 +264,44 @@ def generate_and_display_conversations(num_conversations_input, custom_prompts_i
         prompt_template = random.choice(current_prompts)
         prompt = prompt_template.replace("[NAME]", random_name)
-        # Determine the model to use for this specific conversation
-        if selected_model_name_input and selected_model_name_input in model_names_to_use:
-            selected_model_for_this_conv = selected_model_name_input
-        else:
-            selected_model_for_this_conv = random.choice(model_names_to_use)
-        generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Generating conv {i+1}/{num_conversations} with '{selected_model_for_this_conv}' (System: '{system[:50]}...')")
         conversation = chat(system, prompt, selected_model_for_this_conv, seed=seed, num_exchanges=5)
         if len(conversation) == expected_conversation_length and not any(d.get("from") == "error" for d in conversation):
             new_conv_entry = {"model_used": selected_model_for_this_conv, "conversations": conversation}
-            # Add to all_conversations and new_conversations_generated only if not a duplicate of what's already *in memory*
-            # This handles duplicates from current batch or newly generated identical to existing
             new_conv_str = json.dumps(new_conv_entry, sort_keys=True)
             if new_conv_str not in seen_conversations:
                 all_conversations.append(new_conv_entry)
                 new_conversations_generated.append(new_conv_entry)
-                seen_conversations.add(new_conv_str) # Mark as seen
-                generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Successfully generated and added conv {i+1}/{num_conversations}.")
             else:
-                generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipped conv {i+1}/{num_conversations} as it's a duplicate.")
         else:
-            generation_log.append(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipping conv {i+1}/{num_conversations} due to error or incorrect length ({len(conversation)} messages, expected {expected_conversation_length}).")
             if conversation and conversation[-1].get("from") == "error":
-                generation_log.append(f"  Error details: {conversation[-1]['value']}")
     # Save all (cleaned existing + newly generated unique) conversations to JSONL
     with open(DATA_FILE, "w") as f:
         for conv in all_conversations:
             f.write(json.dumps(conv) + "\n")
-    generation_log.append(f"Saved {len(new_conversations_generated)} *new unique* conversations to {DATA_FILE} (total unique and validated: {len(all_conversations)}).")
-    generation_log.append("Attempting to push main conversations file to Hugging Face Dataset...")
     # --- Auto-push main conversations to Hugging Face Dataset ---
     # Use the custom commit message
     commit_message = f"{commit_subject.strip()}\n\n{commit_body.strip()}" if commit_body.strip() else commit_subject.strip()
     push_status = push_file_to_huggingface_dataset(DATA_FILE, "conversations.jsonl", commit_message)
-    generation_log.append(push_status)
-    generation_log.append(f"Process complete at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (An Nhơn, Binh Dinh, Vietnam)")
-    return json.dumps(all_conversations, indent=2), "\n".join(generation_log)
 # --- Community Prompts Functions ---
 def load_community_prompts():
@@ -453,16 +444,12 @@ with gr.Blocks() as demo:
                 num_conversations_input = gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Number of Conversations to Generate", info="More conversations take longer and might hit API limits.")
             gr.Markdown("### Model Selection")
-            model_choices_with_descriptions = [
-                f"{name} ({info['description']}, Speed: {info['speed']})"
-                for name, info in AVAILABLE_MODELS.items()
-            ]
             model_selector_dropdown = gr.Dropdown(
                 label="Select Model (or leave empty for random)",
                 choices=list(AVAILABLE_MODELS.keys()), # The actual values passed will be model names
                 value=None, # Default to no selection, implying random
                 interactive=True,
-                info="Choose a specific model or let the app pick one randomly for each conversation."
             )
             # Add a Textbox for model description based on selection
             model_description_output = gr.Textbox(
@@ -474,7 +461,7 @@ with gr.Blocks() as demo:
                 if model_name and model_name in AVAILABLE_MODELS:
                     info = AVAILABLE_MODELS[model_name]
                     return f"Description: {info['description']}\nSpeed: {info['speed']}"
-                return "No model selected, or model not found. A random model will be chosen per conversation."
             model_selector_dropdown.change(
                 fn=get_model_info,
@@ -522,7 +509,8 @@ with gr.Blocks() as demo:
             generate_button = gr.Button("Generate & Push Conversations")
             output_conversations = gr.JSON(label="Generated Conversations (Content of conversations.jsonl)")
-            output_log = gr.Textbox(label="Process Log", interactive=False, lines=10, max_lines=20)
             # Link commit template dropdown to update fields
             commit_template_dropdown.change(
@@ -548,7 +536,7 @@ with gr.Blocks() as demo:
                     model_selector_dropdown # Pass selected model name
                 ],
                 outputs=[output_conversations, output_log],
-                show_progress=True
             )
         with gr.Tab("Community Prompts"):

         print(log_message)
         return log_message
+# --- Main Generation and Push Function (Now a generator for streaming) ---
 def generate_and_display_conversations(num_conversations_input, custom_prompts_input, custom_system_prompt_input,
+                                     commit_subject, commit_body, selected_model_name_input):
     num_conversations = int(num_conversations_input)
     if num_conversations <= 0:
+        yield gr.JSON.update(value={}), "Please enter a number of conversations greater than zero.\n"
+        return
     os.makedirs(OUTPUT_DIR, exist_ok=True)
                 try:
                     existing_conversations.append(json.loads(line.strip()))
                 except json.JSONDecodeError as e:
+                    yield gr.JSON.update(value={}), f"Skipping malformed JSON line in {DATA_FILE}: {line.strip()} - {e}\n"
     # Deduplicate existing conversations
     seen_conversations = set()
     cleaned_existing_conversations = []
     for conv_entry in existing_conversations:
         conv_str = json.dumps(conv_entry, sort_keys=True)
         if conv_str not in seen_conversations:
             cleaned_existing_conversations.append(conv_entry)
             seen_conversations.add(conv_str)
     # Validate and filter existing conversations for completeness (expected length)
+    expected_msg_len = lambda n_exchanges: 1 + 1 + n_exchanges * 2 # System + initial human + (AI_turn + human_followup) * n_exchanges
     validated_existing_conversations = []
     initial_cleaned_count = len(cleaned_existing_conversations)
     for conv_entry in cleaned_existing_conversations:
         conv_list = conv_entry.get("conversations", [])
+        if len(conv_list) == expected_msg_len(5): # Fixed to 5 exchanges for generation
             validated_existing_conversations.append(conv_entry)
         else:
+            yield gr.JSON.update(value={}), f"Skipping incomplete/malformed existing conversation (length {len(conv_list)} != {expected_msg_len(5)}): {conv_entry}\n"
     all_conversations = list(validated_existing_conversations) # Start with clean existing ones
     current_time_loc = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " (An Nhơn, Binh Dinh, Vietnam)"
+    yield gr.JSON.update(value={}), f"Starting conversation generation at {current_time_loc}\n"
+    yield gr.JSON.update(value={}), f"Loaded and cleaned {len(validated_existing_conversations)} existing conversations (initially {initial_cleaned_count} before validation).\n"
+    yield gr.JSON.update(value={}), f"Generating {num_conversations} *new* conversations.\n"
     model_names_to_use = list(AVAILABLE_MODELS.keys())
     if selected_model_name_input and selected_model_name_input in model_names_to_use:
+        model_selection_info = f"Specific model selected for all new conversations: '{selected_model_name_input}'\n"
+        selected_model_for_this_conv_batch = selected_model_name_input
     else:
+        model_selection_info = f"No specific model selected or invalid. Models will be chosen randomly per conversation from: {', '.join(model_names_to_use)}\n"
+        selected_model_for_this_conv_batch = None # Indicate random selection per loop
+    yield gr.JSON.update(value={}), model_selection_info
     current_prompts = DEFAULT_INITIAL_PROMPTS
     new_conversations_generated = []
     expected_conversation_length = expected_msg_len(5) # Always 5 exchanges for new generations
+    for i in range(num_conversations): # tqdm not directly compatible with yielding, so manually handle progress
         seed = random.randint(0, 1000000)
         if custom_system_prompt_input:
         prompt_template = random.choice(current_prompts)
         prompt = prompt_template.replace("[NAME]", random_name)
+        selected_model_for_this_conv = selected_model_for_this_conv_batch if selected_model_for_this_conv_batch else random.choice(model_names_to_use)
+        yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Generating conv {i+1}/{num_conversations} with '{selected_model_for_this_conv}' (System: '{system[:50]}...')\n"
         conversation = chat(system, prompt, selected_model_for_this_conv, seed=seed, num_exchanges=5)
         if len(conversation) == expected_conversation_length and not any(d.get("from") == "error" for d in conversation):
             new_conv_entry = {"model_used": selected_model_for_this_conv, "conversations": conversation}
             new_conv_str = json.dumps(new_conv_entry, sort_keys=True)
             if new_conv_str not in seen_conversations:
                 all_conversations.append(new_conv_entry)
                 new_conversations_generated.append(new_conv_entry)
+                seen_conversations.add(new_conv_str)
+                yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Successfully generated and added conv {i+1}/{num_conversations}.\n"
             else:
+                yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipped conv {i+1}/{num_conversations} as it's a duplicate.\n"
         else:
+            yield gr.JSON.update(value={}), f"[{datetime.datetime.now().strftime('%H:%M:%S')}] Skipping conv {i+1}/{num_conversations} due to error or incorrect length ({len(conversation)} messages, expected {expected_conversation_length}).\n"
             if conversation and conversation[-1].get("from") == "error":
+                yield gr.JSON.update(value={}), f"  Error details: {conversation[-1]['value']}\n"
     # Save all (cleaned existing + newly generated unique) conversations to JSONL
     with open(DATA_FILE, "w") as f:
         for conv in all_conversations:
             f.write(json.dumps(conv) + "\n")
+    yield gr.JSON.update(value={}), f"Saved {len(new_conversations_generated)} *new unique* conversations to {DATA_FILE} (total unique and validated: {len(all_conversations)}).\n"
+    yield gr.JSON.update(value={}), "Attempting to push main conversations file to Hugging Face Dataset...\n"
     # --- Auto-push main conversations to Hugging Face Dataset ---
     # Use the custom commit message
     commit_message = f"{commit_subject.strip()}\n\n{commit_body.strip()}" if commit_body.strip() else commit_subject.strip()
     push_status = push_file_to_huggingface_dataset(DATA_FILE, "conversations.jsonl", commit_message)
+    yield gr.JSON.update(value={}), push_status + "\n"
+    final_log_message = f"Process complete at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (An Nhơn, Binh Dinh, Vietnam)\n"
+    # Final return for both outputs
+    yield json.dumps(all_conversations, indent=2), final_log_message
 # --- Community Prompts Functions ---
 def load_community_prompts():
                 num_conversations_input = gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Number of Conversations to Generate", info="More conversations take longer and might hit API limits.")
             gr.Markdown("### Model Selection")
             model_selector_dropdown = gr.Dropdown(
                 label="Select Model (or leave empty for random)",
                 choices=list(AVAILABLE_MODELS.keys()), # The actual values passed will be model names
                 value=None, # Default to no selection, implying random
                 interactive=True,
+                info="Choose a specific model or let the app pick one randomly for each conversation in the batch."
             )
             # Add a Textbox for model description based on selection
             model_description_output = gr.Textbox(
                 if model_name and model_name in AVAILABLE_MODELS:
                     info = AVAILABLE_MODELS[model_name]
                     return f"Description: {info['description']}\nSpeed: {info['speed']}"
+                return "No specific model selected. Conversations will use randomly chosen models from the available list."
             model_selector_dropdown.change(
                 fn=get_model_info,
             generate_button = gr.Button("Generate & Push Conversations")
             output_conversations = gr.JSON(label="Generated Conversations (Content of conversations.jsonl)")
+            # Changed output_log to stream
+            output_log = gr.Textbox(label="Process Log", interactive=False, lines=10, max_lines=20, streaming=True)
             # Link commit template dropdown to update fields
             commit_template_dropdown.change(
                     model_selector_dropdown # Pass selected model name
                 ],
                 outputs=[output_conversations, output_log],
+                # show_progress=True is handled internally by yielding
             )
         with gr.Tab("Community Prompts"):