Spaces:

ReallyFloppyPenguin
/

SynthGen

Sleeping

App Files Files Community

ReallyFloppyPenguin committed on Apr 5, 2025

Commit

bffeb3e

verified ·

1 Parent(s): 2c79988

Update synthgen.py

Browse files

Files changed (1) hide show

synthgen.py +229 -61

synthgen.py CHANGED Viewed

@@ -1,61 +1,229 @@
-import os
-from openai import OpenAI
-# Read the OpenRouter API key from the OPENROUTER_API_KEY environment variable.
-# Secrets must never be hard-coded in source files.
-api_key = os.environ.get("OPENROUTER_API_KEY")
-if not api_key:
-    raise ValueError("OPENROUTER_API_KEY environment variable not set.")
-# Point the OpenAI client to the OpenRouter API
-client = OpenAI(
-    base_url="https://openrouter.ai/api/v1",
-    api_key=api_key,
-)
-def generate_synthetic_text(prompt: str, model: str = "deepseek/deepseek-chat-v3-0324:free") -> str:
-    """
-    Produce one piece of synthetic text from an OpenRouter chat model.
-    Args:
-        prompt: Content of the user message that steers the generation.
-        model: The OpenRouter model ID (default: deepseek/deepseek-chat-v3-0324:free).
-    Returns:
-        The stripped completion text. If the response carries no content the
-        string "Error: No content generated." is returned; if the request
-        raises, "Error during API call: <exception>" is returned instead.
-    """
-    chat_messages = [
-        {"role": "system", "content": "You are a helpful assistant generating synthetic data."},
-        {"role": "user", "content": prompt},
-    ]
-    request_headers = {
-        # "HTTP-Referer": "https://www.google.com", # Optional. Site URL for rankings on openrouter.ai.
-        "X-Title": "SynthGen", # Optional. Site title for rankings on openrouter.ai.
-    }
-    try:
-        completion = client.chat.completions.create(
-            extra_headers=request_headers,
-            model=model,
-            messages=chat_messages,
-        )
-        # An empty choices list and an empty/None content are both treated as "no content".
-        content = completion.choices[0].message.content if completion.choices else None
-        return content.strip() if content else "Error: No content generated."
-    except Exception as exc:
-        return f"Error during API call: {exc}"
-# --- Main Execution ---
-if __name__ == "__main__":
-    # TODO: Define the kind of text and number of samples needed
-    num_samples = 5 # Example: generate 5 samples
-    prompt_template = "Generate a short, positive product review for a fictional gadget." # Example prompt
-    print(f"Generating {num_samples} synthetic text samples...")
-    # Every sample reuses the same template; vary the prompt per sample if more diversity is wanted.
-    for sample_number in range(1, num_samples + 1):
-        generated_text = generate_synthetic_text(prompt_template)
-        print(f"\n--- Sample {sample_number} ---")
-        print(generated_text)
-    print("\nGeneration complete.")

+import os
+from openai import OpenAI
+import re # Import regex for parsing conversation turns
+from typing import Optional, Union # Need Optional for settings
+# Read the OpenRouter API key from the OPENROUTER_API_KEY environment variable.
+# Secrets must never be hard-coded in source files.
+api_key = os.environ.get("OPENROUTER_API_KEY")
+if not api_key:
+    raise ValueError("OPENROUTER_API_KEY environment variable not set.")
+# Point the OpenAI client to the OpenRouter API
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=api_key,
+)
+# --- Core Generation Functions ---
+def generate_synthetic_text(
+    prompt: str,
+    model: str = "deepseek/deepseek-chat-v3-0324:free",
+    system_message: str = "You are a helpful assistant generating synthetic data.",
+    temperature: Optional[float] = 0.7,
+    top_p: Optional[float] = None,
+    max_tokens: Optional[int] = None
+) -> str:
+    """
+    Request a single chat completion through the module-level client and return its text.
+    Exceptions raised by the request are caught and reported as a returned string.
+    Args:
+        prompt: Content of the user message.
+        model: The model ID to request.
+        system_message: Content of the system message.
+        temperature: Controls randomness (0.0 to 2.0). Left out of the request when None.
+        top_p: Nucleus sampling probability. Left out of the request when None.
+        max_tokens: Maximum number of tokens to generate. Left out of the request when None.
+    Returns:
+        The stripped completion text on success. Otherwise one of these strings:
+        "Error: OPENROUTER_API_KEY not configured properly. ..." when the key is
+        missing or still the placeholder, "Error: No content generated by the model."
+        when the response has no content, or "Error during API call: <exception>"
+        when the request raised. Note that the last form does not start with "Error:".
+    """
+    key_is_unusable = not api_key or api_key == "YOUR_API_KEY_HERE_OR_SET_ENV_VAR"
+    if key_is_unusable:
+        return "Error: OPENROUTER_API_KEY not configured properly. Please set the environment variable."
+    request_kwargs = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": prompt},
+        ],
+        "extra_headers": {
+            # "HTTP-Referer": "YOUR_SITE_URL",
+            "X-Title": "SynthGen",
+        },
+    }
+    # Sampling settings left as None are not sent, so the API applies its own defaults.
+    sampling_settings = {"temperature": temperature, "top_p": top_p, "max_tokens": max_tokens}
+    for setting_name, setting_value in sampling_settings.items():
+        if setting_value is not None:
+            request_kwargs[setting_name] = setting_value
+    try:
+        completion = client.chat.completions.create(**request_kwargs)
+        first_message = completion.choices[0].message if completion.choices else None
+        if first_message and first_message.content:
+            return first_message.content.strip()
+        print(f"Warning: No content in response for model {model}. Response: {completion}")
+        return "Error: No content generated by the model."
+    except Exception as exc:
+        print(f"Error during API call to model {model}: {exc}")
+        return f"Error during API call: {exc}"
+def generate_prompts(
+    num_prompts: int,
+    model: str,
+    topic_hint: str = "diverse and interesting",
+    temperature: Optional[float] = 0.7, # Pass settings through
+    top_p: Optional[float] = None,
+    max_tokens: Optional[int] = 200 # Set a reasonable default max for prompts
+) -> list[str]:
+    """
+    Generates a list of conversation prompts using an AI model.
+    Args:
+        num_prompts: The number of prompts to generate.
+        model: The model ID to use for generation.
+        topic_hint: Optional hint for the kind of topics (e.g., "related to technology").
+        temperature: Controls randomness (0.0 to 2.0). None means API default.
+        top_p: Nucleus sampling probability. None means API default.
+        max_tokens: Maximum number of tokens to generate. None means API default.
+    Returns:
+        A list of at most num_prompts prompts (fewer if the model produced fewer
+        lines). An empty list is returned when num_prompts is not positive.
+    Raises:
+        ValueError: If the generation call reported an error, or if no prompt
+            could be parsed from the generated text.
+    """
+    if num_prompts <= 0:
+        # Nothing requested: skip the API call (a negative slice would also drop the wrong items).
+        return []
+    instruction = (
+        f"Generate exactly {num_prompts} unique, {topic_hint} system prompts or starting topics suitable "
+        f"for generating synthetic conversations between a user and an AI assistant. "
+        f"Each prompt should be concise (ideally one sentence) and focus on a clear task or subject. "
+        f"Present each prompt on a new line, with no other introductory or concluding text."
+        f"\n\nExamples:\n"
+        f"- Act as a travel agent planning a trip to Japan.\n"
+        f"- Explain the concept of black holes to a 5-year-old.\n"
+        f"- Write a python function to reverse a string."
+    )
+    system_msg = "You are an expert prompt generator. Follow the user's instructions precisely."
+    # Pass the settings down to generate_synthetic_text
+    generated_text = generate_synthetic_text(
+        instruction,
+        model,
+        system_message=system_msg,
+        temperature=temperature,
+        top_p=top_p,
+        max_tokens=max_tokens
+    )
+    # generate_synthetic_text reports failures as strings. API-call failures start with
+    # "Error during API call:" rather than "Error:", so both prefixes must be checked.
+    if generated_text.startswith(("Error:", "Error during API call:")):
+        raise ValueError(generated_text)
+    # Remove only a leading list marker ("- ", "* ", "1. ", "2) ") from each line;
+    # dashes or numbers inside the prompt text itself are left untouched.
+    list_marker = r"^\s*(?:[-*\u2022]|\d+[.)])(?:\s+|$)"
+    prompts = []
+    for line in generated_text.splitlines():
+        cleaned = re.sub(list_marker, "", line).strip()
+        if cleaned:
+            prompts.append(cleaned)
+    if not prompts:
+        # Log the raw generated text if parsing failed
+        print(f"Warning: Failed to parse prompts from generated text. Raw text:\n{generated_text}")
+        raise ValueError("AI failed to generate prompts in the expected format.")
+    # The model may return more lines than requested; keep only the first num_prompts.
+    return prompts[:num_prompts]
+def generate_synthetic_conversation(
+    system_prompt: str,
+    model: str,
+    num_turns: int,
+    temperature: Optional[float] = 0.7, # Pass settings through
+    top_p: Optional[float] = None,
+    max_tokens: Optional[int] = 1000 # Set a reasonable default max for conversations
+) -> str:
+    """
+    Generates a synthetic conversation with a specified number of turns.
+    Args:
+        system_prompt: The initial system prompt defining the context or AI persona.
+        model: The model ID to use for generation.
+        num_turns: The desired number of conversational turns (1 turn = 1 User + 1 Assistant).
+        temperature: Controls randomness (0.0 to 2.0). None means API default.
+        top_p: Nucleus sampling probability. None means API default.
+        max_tokens: Maximum number of tokens to generate. None means API default.
+    Returns:
+        A string containing the formatted conversation, prefixed with a header naming
+        the prompt. If generation failed, a string starting with
+        "Error generating conversation for prompt" is returned instead.
+    """
+    # We'll ask the model to generate the whole conversation in one go for simplicity.
+    # More complex approaches could involve iterative calls.
+    instruction = (
+        f"Generate a realistic conversation between a 'User' and an 'Assistant'. "
+        f"The conversation should start based on the following system prompt/topic: '{system_prompt}'.\n"
+        f"The conversation should have approximately {num_turns} pairs of User/Assistant turns.\n"
+        f"Format the output clearly, starting each line with 'User:' or 'Assistant:'.\n\n"
+        f"Example Format:\n"
+        f"User: Hello!\n"
+        f"Assistant: Hi there! How can I help you today?\n"
+        f"User: Can you explain photosynthesis?\n"
+        f"Assistant: Certainly! Photosynthesis is the process..."
+    )
+    # Use the user-provided system prompt for the *conversation's* context,
+    # but a generic one for the generation *task* itself.
+    system_msg_for_generation = f"You are an AI assistant simulating a conversation. The context for the conversation you generate is: {system_prompt}"
+    # Pass the settings down to generate_synthetic_text
+    conversation_text = generate_synthetic_text(
+        prompt=instruction,
+        model=model,
+        system_message=system_msg_for_generation,
+        temperature=temperature,
+        top_p=top_p,
+        max_tokens=max_tokens
+    )
+    # generate_synthetic_text reports failures as strings. API-call failures start with
+    # "Error during API call:" rather than "Error:", so both prefixes must be checked;
+    # otherwise a failed request would be returned as if it were a conversation.
+    if conversation_text.startswith(("Error:", "Error during API call:")):
+        # Propagate the error message
+        return f"Error generating conversation for prompt '{system_prompt}':\n{conversation_text}"
+    # Basic validation: the requested format labels every line with a speaker.
+    if not re.search(r"User:|Assistant:", conversation_text, re.IGNORECASE):
+        print(f"Warning: Generated text for conversation '{system_prompt}' might not be in the expected format. Raw text:\n{conversation_text}")
+        # Return the raw text anyway, maybe the model format is slightly different
+        return f"Generated conversation for prompt '{system_prompt}':\n(Format might vary)\n\n{conversation_text}"
+    return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
+# --- Main Execution (Example Usage) ---
+# Smoke tests for the three generation helpers; each section performs real API calls.
+if __name__ == "__main__":
+    print("--- Testing Basic Text Generation ---")
+    test_prompt = "Describe the benefits of using synthetic data."
+    text_result = generate_synthetic_text(test_prompt, temperature=0.5, max_tokens=100) # Example with settings
+    print(f"Prompt: {test_prompt}\nResult:\n{text_result}\n")
+    print("\n--- Testing Prompt Generation ---")
+    try:
+        num_prompts_to_gen = 3
+        prompts_result = generate_prompts(num_prompts_to_gen, "deepseek/deepseek-chat-v3-0324:free")
+        print(f"Generated {len(prompts_result)} prompts:")
+        for i, p in enumerate(prompts_result):
+            print(f"{i+1}. {p}")
+    except ValueError as e:
+        print(f"Error generating prompts: {e}")
+    print("\n--- Testing Conversation Generation ---")
+    conv_prompt = "Act as a helpful expert explaining the difference between nuclear fission and fusion."
+    num_conv_turns = 3
+    conv_result = generate_synthetic_conversation(conv_prompt, "deepseek/deepseek-chat-v3-0324:free", num_conv_turns)
+    print(f"{conv_result}\n")
+    print("\n--- Testing with Invalid API Key (if applicable) ---")
+    # Temporarily use an invalid key for testing error handling
+    original_key = client.api_key
+    client.api_key = "invalid-key"
+    try:
+        error_text_result = generate_synthetic_text("Test prompt")
+        print(f"Result with invalid key: {error_text_result}")
+    finally:
+        # Restore the original key even if the call above is interrupted,
+        # so the shared client is never left holding the invalid key.
+        client.api_key = original_key
+    print("\nGeneration tests complete.")