DJHumanRPT committed on
Commit
1e4a486
·
verified ·
1 Parent(s): f0ee36f

Anna's changes to numeric logic

Browse files
Files changed (1) hide show
  1. app.py +216 -125
app.py CHANGED
@@ -992,144 +992,194 @@ def generate_non_categorical_values(non_cat_vars, existing_values, max_retries):
992
  """Generate values for non-categorical variables given existing categorical values."""
993
  if not non_cat_vars:
994
  return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
995
 
996
- # Format the variables for the prompt
997
- vars_text = "\n".join(
998
- [
999
- f"- {var['name']}: {var['description']} (Type: {var['type']})"
1000
- + (
1001
- f", Min: {var.get('min', 'N/A')}, Max: {var.get('max', 'N/A')}"
1002
- if var["type"] in ["string", "int", "float"]
1003
- else ""
1004
- )
1005
- for var in non_cat_vars
1006
- ]
1007
- )
1008
-
1009
- # Create prompt with existing categorical values as context
1010
- prompt = f"""
1011
- As a synthetic data generator, create values for these variables:
1012
-
1013
- {vars_text}
1014
-
1015
- These values should be coherent with the existing categorical values:
1016
- {json.dumps(existing_values, indent=2)}
1017
 
1018
- Return ONLY a JSON object with the new variable values:
1019
- {{
1020
- "variable_name_1": value1,
1021
- "variable_name_2": value2
1022
- }}
1023
- """
1024
 
1025
- for attempt in range(max_retries):
1026
- try:
1027
- response = call_model_api(
1028
- model=st.session_state.model,
1029
- prompt=prompt,
1030
- max_tokens=1000,
1031
- temperature=st.session_state.temperature,
1032
- )
1033
 
1034
- result = response.strip()
 
1035
 
1036
- # Extract JSON
1037
- json_pattern = r"```json\s*([\s\S]*?)\s*```|^\s*\{[\s\S]*\}\s*$"
1038
- json_match = re.search(json_pattern, result)
 
 
 
 
1039
 
1040
- if json_match:
1041
- json_str = json_match.group(1) if json_match.group(1) else result
1042
- json_str = re.sub(r"```.*|```", "", json_str).strip()
1043
- try:
1044
- values = json.loads(json_str, strict=False)
1045
- if isinstance(values, dict):
1046
- return values
1047
- except:
1048
- pass
1049
- else:
1050
- try:
1051
- values = json.loads(result, strict=False)
1052
- if isinstance(values, dict):
1053
- return values
1054
- except:
1055
- pass
1056
 
1057
- except Exception as e:
1058
- if attempt == max_retries - 1:
1059
- st.warning(f"Failed to generate non-categorical values: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1060
 
1061
- # Fallback: generate empty values for all non-categorical variables
1062
- return {var["name"]: get_default_value(var) for var in non_cat_vars}
 
 
1063
 
1064
 
1065
  def generate_single_row(all_vars, max_retries):
1066
- """Generate a complete row of data for all variables."""
1067
- # Format the variables for the prompt
1068
- vars_text = "\n".join(
1069
- [
1070
- f"- {var['name']}: {var['description']} (Type: {var['type']})"
1071
- + (
1072
- f", Min: {var.get('min', 'N/A')}, Max: {var.get('max', 'N/A')}"
1073
- if var["type"] in ["string", "int", "float", "categorical"]
1074
- else ""
1075
- )
1076
- + (f", Options: {var['options']}" if var.get("options") else "")
1077
- for var in all_vars
1078
- ]
1079
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1080
 
1081
- prompt = f"""
1082
- You are a synthetic data generator. Generate 1 realistic sample with values for:
1083
 
1084
- {vars_text}
1085
 
1086
- Return ONLY a JSON object with all variable values:
1087
- {{
1088
- "variable_name_1": value1,
1089
- "variable_name_2": value2
1090
- }}
1091
 
1092
- For categorical variables with multiple selections, return an array of values.
1093
- """
 
 
 
1094
 
1095
- for attempt in range(max_retries):
1096
- try:
1097
- response = call_model_api(
1098
- model=st.session_state.model,
1099
- messages=[{"role": "user", "content": prompt}],
1100
- max_tokens=1000,
1101
- temperature=st.session_state.temperature,
1102
- )
1103
 
1104
- result = response.strip()
 
 
 
 
 
 
 
1105
 
1106
- # Extract JSON
1107
- json_pattern = r"```json\s*([\s\S]*?)\s*```|^\s*\{[\s\S]*\}\s*$"
1108
- json_match = re.search(json_pattern, result)
1109
 
1110
- if json_match:
1111
- json_str = json_match.group(1) if json_match.group(1) else result
1112
- json_str = re.sub(r"```.*|```", "", json_str).strip()
1113
- try:
1114
  values = json.loads(json_str, strict=False)
1115
  if isinstance(values, dict):
1116
- return values
1117
- except:
1118
- pass
1119
- else:
1120
- try:
1121
  values = json.loads(result, strict=False)
1122
  if isinstance(values, dict):
1123
- return values
1124
- except:
1125
- pass
1126
 
1127
- except Exception as e:
1128
- if attempt == max_retries - 1:
1129
- st.warning(f"Failed to generate row: {str(e)}")
1130
 
1131
- # If all attempts fail, return None
1132
- return None
1133
 
1134
 
1135
  def get_default_value(var):
@@ -1182,7 +1232,7 @@ def generate_synthetic_outputs(
1182
  input_vars = template_spec["input"]
1183
  input_vars_text = "\n".join(
1184
  [
1185
- f"- {var['name']}: {var['description']} (Type: {var['type']}) {'Options: '+str(var['options']) if var.get('options') else ''}"
1186
  for var in input_vars
1187
  ]
1188
  )
@@ -1198,7 +1248,11 @@ def generate_synthetic_outputs(
1198
  progress_bar = st.progress(0)
1199
 
1200
  try:
 
 
1201
  for i, input_item in enumerate(input_data):
 
 
1202
  # Fill the prompt template with input values
1203
  filled_prompt = prompt_template
1204
  for var_name, var_value in input_item.items():
@@ -1230,7 +1284,9 @@ Generate realistic output data for these variables. Return ONLY a JSON object wi
1230
  Use appropriate data types for each variable. Return ONLY the JSON object with no additional text or explanation.
1231
  The response must be valid JSON that can be parsed directly.
1232
  """
1233
-
 
 
1234
  output_data = None
1235
  for attempt in range(max_retries):
1236
  try:
@@ -1527,8 +1583,14 @@ with st.sidebar:
1527
  if model_provider == "OpenAI":
1528
  st.session_state.model = st.selectbox(
1529
  "Select OpenAI Model",
1530
- options=["gpt-4o-mini", "gpt-3.5-turbo", "gpt-4", "gpt-4o", "gpt-4-turbo"],
1531
- index=0,
 
 
 
 
 
 
1532
  )
1533
  else: # Anthropic
1534
  st.session_state.model = st.selectbox(
@@ -2605,7 +2667,10 @@ with tab2:
2605
  placeholder="Enter custom lore or background information here...",
2606
  height=150,
2607
  )
2608
-
 
 
 
2609
  # Generate Output button
2610
  if st.button("Generate Output", key="generate_button"):
2611
  # Check if API key is provided
@@ -3186,6 +3251,15 @@ with tab3:
3186
  # Display combined data if available
3187
  if st.session_state.combined_data:
3188
  st.subheader("Complete Dataset (Inputs + Outputs)")
 
 
 
 
 
 
 
 
 
3189
 
3190
  # Add this function before the prepare_dataframe_with_json_columns function
3191
 
@@ -3227,12 +3301,23 @@ with tab3:
3227
 
3228
  # Create a function to prepare the dataframe with JSON columns
3229
  def prepare_dataframe_with_json_columns(
3230
- data, template_spec, show_json_columns=False
3231
  ):
3232
  df = pd.DataFrame(data)
 
 
 
 
 
 
 
3233
 
3234
  # Create input and output JSON columns
3235
- input_vars = [var["name"] for var in template_spec["input"]]
 
 
 
 
3236
  output_vars = [var["name"] for var in template_spec["output"]]
3237
 
3238
  # Create input JSON column
@@ -3254,9 +3339,11 @@ with tab3:
3254
  # If not showing JSON columns in UI, remove them for display only
3255
  if not show_json_columns:
3256
  display_df = df.drop(columns=["input", "output"])
3257
- return df, display_df
 
3258
 
3259
- return df, df
 
3260
 
3261
  # Toggle for showing JSON columns
3262
  st.session_state.show_json_columns = st.checkbox(
@@ -3269,6 +3356,7 @@ with tab3:
3269
  st.session_state.combined_data,
3270
  st.session_state.template_spec,
3271
  st.session_state.show_json_columns,
 
3272
  )
3273
 
3274
  # Show data in a table
@@ -3288,8 +3376,11 @@ with tab3:
3288
  )
3289
 
3290
  with col2:
3291
- # JSON download
3292
- combined_json = json.dumps(st.session_state.combined_data, indent=2)
 
 
 
3293
  st.download_button(
3294
  label="Download Dataset (JSON)",
3295
  data=combined_json,
@@ -3323,4 +3414,4 @@ with tab3:
3323
  else:
3324
  st.info(
3325
  "No template has been generated yet. Go to the 'Setup' tab to create one."
3326
- )
 
992
  """Generate values for non-categorical variables given existing categorical values."""
993
  if not non_cat_vars:
994
  return {}
995
+
996
+ # Separate string and numeric variables
997
+ llm_vars = [var for var in non_cat_vars if var["type"] == "string"]
998
+ numeric_vars = [var for var in non_cat_vars if var["type"] in ["int", "float"]]
999
+
1000
+ # Sample numeric values within the specified range
1001
+ result_values = {}
1002
+ # result_values_descr = {} # Uncomment to include the var description, i.e. units so the LLM understands the numerical values
1003
+ # Otherwise, good practice is to include units in numerical vars names (e.g. price_in_euros instead of price)
1004
+ for var in numeric_vars:
1005
+ name = var["name"]
1006
+ var_min = var.get("min")
1007
+ var_max = var.get("max")
1008
+ # description = var.get("description")
1009
+
1010
+ if var_min is None or var_max is None:
1011
+ result_values[name] = get_default_value(var)
1012
+ # result_values_descr[name] = get_default_value(var)
1013
+ else:
1014
+ try:
1015
+ if var["type"] == "int":
1016
+ result_values[name] = random.randint(int(var_min), int(var_max))
1017
+ # result_values_descr[name] = [result_values[name], description]
1018
+ elif var["type"] == "float":
1019
+ result_values[name] = round(random.uniform(float(var_min), float(var_max)), 2)
1020
+ # result_values_descr[name] = [result_values[name], description]
1021
+ except:
1022
+ result_values[name] = get_default_value(var)
1023
+ # result_values_descr[name] = get_default_value(var)
1024
 
1025
+ # Format the string variables for the prompt
1026
+ if llm_vars:
1027
+ vars_text = "\n".join(
1028
+ [f"- {var['name']}: {var['description']} (Type: string)" for var in llm_vars]
1029
+ )
1030
+ # Combine categorical and numeric values for LLM context
1031
+ # context_values = {**existing_values, **result_values_descr}
1032
+ context_values = {**existing_values, **result_values}
1033
+ print(context_values)
 
 
 
 
 
 
 
 
 
 
 
 
1034
 
1035
+ # Create prompt with existing categorical and numerical values as context
1036
+ prompt = f"""
1037
+ As a synthetic data generator, create values for these variables:
 
 
 
1038
 
1039
+ {vars_text}
 
 
 
 
 
 
 
1040
 
1041
+ These values should be coherent with the existing categorical and/or numerical values:
1042
+ {json.dumps(context_values, indent=2)}
1043
 
1044
+ Return ONLY a JSON object with the new variable values:
1045
+ {{
1046
+ "variable_name_1": value1,
1047
+ "variable_name_2": value2
1048
+ }}
1049
+ """
1050
+ # print("*************** PROMPT FOR STR VAR:", prompt)
1051
 
1052
+ for attempt in range(max_retries):
1053
+ try:
1054
+ response = call_model_api(
1055
+ model=st.session_state.model,
1056
+ prompt=prompt,
1057
+ max_tokens=1000,
1058
+ temperature=st.session_state.temperature,
1059
+ )
 
 
 
 
 
 
 
 
1060
 
1061
+ result = response.strip()
1062
+
1063
+ # Extract JSON
1064
+ json_pattern = r"```json\s*([\s\S]*?)\s*```|^\s*\{[\s\S]*\}\s*$"
1065
+ json_match = re.search(json_pattern, result)
1066
+
1067
+ if json_match:
1068
+ json_str = json_match.group(1) if json_match.group(1) else result
1069
+ json_str = re.sub(r"```.*|```", "", json_str).strip()
1070
+ try:
1071
+ values = json.loads(json_str, strict=False)
1072
+ if isinstance(values, dict):
1073
+ result_values.update(values)
1074
+ return result_values
1075
+ except:
1076
+ pass
1077
+ else:
1078
+ try:
1079
+ values = json.loads(result, strict=False)
1080
+ if isinstance(values, dict):
1081
+ result_values.update(values)
1082
+ return result_values
1083
+ except:
1084
+ pass
1085
+
1086
+ except Exception as e:
1087
+ if attempt == max_retries - 1:
1088
+ st.warning(f"Failed to generate string values: {str(e)}")
1089
 
1090
+ # Fallback: generate empty values for all string variables
1091
+ for var in llm_vars:
1092
+ result_values[var["name"]] = get_default_value(var)
1093
+ return result_values
1094
 
1095
 
1096
  def generate_single_row(all_vars, max_retries):
1097
+ """Generate a complete row of data using hybrid logic:
1098
+ - Use LLM for string/categorical vars
1099
+ - Sample int/float within range
1100
+ """
1101
+ numeric_vars = [var for var in all_vars if var["type"] in ["int", "float"]]
1102
+ llm_vars = [var for var in all_vars if var["type"] in ["string", "categorical"]]
1103
+
1104
+ row = {}
1105
+
1106
+ # Sample numeric vars
1107
+ for var in numeric_vars:
1108
+ name = var["name"]
1109
+ var_min = var.get("min")
1110
+ var_max = var.get("max")
1111
+ if var_min is None or var_max is None:
1112
+ row[name] = get_default_value(var)
1113
+ else:
1114
+ try:
1115
+ if var["type"] == "int":
1116
+ row[name] = random.randint(int(var_min), int(var_max))
1117
+ elif var["type"] == "float":
1118
+ row[name] = round(random.uniform(float(var_min), float(var_max)), 2)
1119
+ except:
1120
+ row[name] = get_default_value(var)
1121
+
1122
+ # Generate string and categorical via LLM
1123
+ if llm_vars:
1124
+ vars_text = "\n".join(
1125
+ [
1126
+ f"- {var['name']}: {var['description']} (Type: {var['type']})"
1127
+ + (
1128
+ f", Options: {var['options']}" if var["type"] == "categorical" and var.get("options") else ""
1129
+ )
1130
+ for var in llm_vars
1131
+ ]
1132
+ )
1133
 
1134
+ prompt = f"""
1135
+ You are a synthetic data generator. Generate values for the following variables:
1136
 
1137
+ {vars_text}
1138
 
1139
+ Based on this partial row:
1140
+ {json.dumps(row, indent=2)}
 
 
 
1141
 
1142
+ Return ONLY a JSON object with the new values:
1143
+ {{
1144
+ "var_name_1": value1,
1145
+ "var_name_2": value2
1146
+ }}
1147
 
1148
+ For categorical variables that allow multiple selections, return a list of values.
1149
+ """
1150
+ # print("*************** PROMPT FOR STR,CAT VAR:", prompt)
 
 
 
 
 
1151
 
1152
+ for attempt in range(max_retries):
1153
+ try:
1154
+ response = call_model_api(
1155
+ model=st.session_state.model,
1156
+ prompt=prompt,
1157
+ max_tokens=1000,
1158
+ temperature=st.session_state.temperature,
1159
+ )
1160
 
1161
+ result = response.strip()
1162
+ json_pattern = r"```json\s*([\s\S]*?)\s*```|^\s*\{[\s\S]*\}\s*$"
1163
+ json_match = re.search(json_pattern, result)
1164
 
1165
+ if json_match:
1166
+ json_str = json_match.group(1) if json_match.group(1) else result
1167
+ json_str = re.sub(r"```.*|```", "", json_str).strip()
 
1168
  values = json.loads(json_str, strict=False)
1169
  if isinstance(values, dict):
1170
+ row.update(values)
1171
+ break
1172
+ else:
 
 
1173
  values = json.loads(result, strict=False)
1174
  if isinstance(values, dict):
1175
+ row.update(values)
1176
+ break
 
1177
 
1178
+ except Exception as e:
1179
+ if attempt == max_retries - 1:
1180
+ st.warning(f"Failed to generate string/categorical values: {str(e)}")
1181
 
1182
+ return row if row else None
 
1183
 
1184
 
1185
  def get_default_value(var):
 
1232
  input_vars = template_spec["input"]
1233
  input_vars_text = "\n".join(
1234
  [
1235
+ f"- {var['name']}: {var['description']} (Type: {var['type']})"
1236
  for var in input_vars
1237
  ]
1238
  )
 
1248
  progress_bar = st.progress(0)
1249
 
1250
  try:
1251
+ input_var_names = [var["name"] for var in template_spec["input"]]
1252
+
1253
  for i, input_item in enumerate(input_data):
1254
+ # Filter out variables not defined in the template spec
1255
+ input_item = {k: v for k, v in input_item.items() if k in input_var_names}
1256
  # Fill the prompt template with input values
1257
  filled_prompt = prompt_template
1258
  for var_name, var_value in input_item.items():
 
1284
  Use appropriate data types for each variable. Return ONLY the JSON object with no additional text or explanation.
1285
  The response must be valid JSON that can be parsed directly.
1286
  """
1287
+ # debug logs:
1288
+ # print("*************Filtered Input:", input_item)
1289
+ # print("*************Generated Prompt:", generation_prompt)
1290
  output_data = None
1291
  for attempt in range(max_retries):
1292
  try:
 
1583
  if model_provider == "OpenAI":
1584
  st.session_state.model = st.selectbox(
1585
  "Select OpenAI Model",
1586
+ options=[
1587
+ "gpt-4o-mini",
1588
+ "gpt-4.1-mini",
1589
+ "gpt-4.1",
1590
+ "gpt-4o",
1591
+ "gpt-4.1-nano",
1592
+ ],
1593
+ index=1,
1594
  )
1595
  else: # Anthropic
1596
  st.session_state.model = st.selectbox(
 
2667
  placeholder="Enter custom lore or background information here...",
2668
  height=150,
2669
  )
2670
+ # Temperature control slider
2671
+ st.session_state.temperature = st.slider(
2672
+ "Temperature (creativity level)", min_value=0.0, max_value=1.0, value=0.7, step=0.05
2673
+ )
2674
  # Generate Output button
2675
  if st.button("Generate Output", key="generate_button"):
2676
  # Check if API key is provided
 
3251
  # Display combined data if available
3252
  if st.session_state.combined_data:
3253
  st.subheader("Complete Dataset (Inputs + Outputs)")
3254
+ # Get all available column names from the data
3255
+ all_columns = pd.DataFrame(st.session_state.combined_data).columns.tolist()
3256
+
3257
+ # Let the user select columns to exclude from input JSON
3258
+ st.session_state.columns_to_drop = st.multiselect(
3259
+ "Select input variables to exclude:",
3260
+ options=all_columns,
3261
+ default=st.session_state.get("columns_to_drop", []),
3262
+ )
3263
 
3264
  # Add this function before the prepare_dataframe_with_json_columns function
3265
 
 
3301
 
3302
  # Create a function to prepare the dataframe with JSON columns
3303
  def prepare_dataframe_with_json_columns(
3304
+ data, template_spec, show_json_columns=False, columns_to_drop=None
3305
  ):
3306
  df = pd.DataFrame(data)
3307
+ # Drop specified columns from the dataframe
3308
+ if columns_to_drop:
3309
+ df = df.drop(
3310
+ columns=[col for col in columns_to_drop if col in df.columns]
3311
+ )
3312
+ else:
3313
+ columns_to_drop = []
3314
 
3315
  # Create input and output JSON columns
3316
+ input_vars = [
3317
+ var["name"]
3318
+ for var in template_spec["input"]
3319
+ if var["name"] not in columns_to_drop
3320
+ ]
3321
  output_vars = [var["name"] for var in template_spec["output"]]
3322
 
3323
  # Create input JSON column
 
3339
  # If not showing JSON columns in UI, remove them for display only
3340
  if not show_json_columns:
3341
  display_df = df.drop(columns=["input", "output"])
3342
+ else:
3343
+ display_df = df
3344
 
3345
+ # Return the same filtered df for export (full_df)
3346
+ return df, display_df
3347
 
3348
  # Toggle for showing JSON columns
3349
  st.session_state.show_json_columns = st.checkbox(
 
3356
  st.session_state.combined_data,
3357
  st.session_state.template_spec,
3358
  st.session_state.show_json_columns,
3359
+ columns_to_drop=st.session_state.columns_to_drop,
3360
  )
3361
 
3362
  # Show data in a table
 
3376
  )
3377
 
3378
  with col2:
3379
+ # JSON download using cleaned dataframe
3380
+ json_ready_df = full_df.drop(columns=["input", "output"])
3381
+ combined_json = json.dumps(
3382
+ json_ready_df.to_dict(orient="records"), indent=2
3383
+ )
3384
  st.download_button(
3385
  label="Download Dataset (JSON)",
3386
  data=combined_json,
 
3414
  else:
3415
  st.info(
3416
  "No template has been generated yet. Go to the 'Setup' tab to create one."
3417
+ )