Spaces:

our-sci
/

data-translation-experiments

Sleeping

App Files Files Community

rosemariafontana committed on Dec 18, 2024

Commit

080b93d

verified ·

1 Parent(s): f10f4aa

Update script_for_automation.py

Browse files

Files changed (1) hide show

script_for_automation.py +101 -14

script_for_automation.py CHANGED Viewed

@@ -1,5 +1,8 @@
 import requests
 from jsondiff import diff
 # The purpose of this script is to automate running a bunch of tests
 # This script will take an input folder
@@ -255,12 +258,57 @@ def fill_out_survey(recipe_dict, input_data):
     except requests.exceptions.RequestException as e:
         print(f"An error occurred while submitting the data: {e}")
-def get_data_ready(recipe_dict, input_data):
 def drive_process():
     # this is to drive the processing process
     gold_standards, input_data = get_baserow_data()
     my_recipes = get_recipes()
     # Input chunk structure
@@ -269,31 +317,70 @@ def drive_process():
     #            "otter_summary": liz_carrot_otter_summary_preprocessing,
     #            "greg_summary": liz_carrot_greg_summary_preprocessing
     #        },
     for recipe_dict in my_recipes:
         for key, input_chunks in input_data.items():
             if recipe_dict["pre_processing_strategy"] == "Otter.ai Summary":
-                input_data = input_chunks["otter_summary"]
             elif recipe_dict["pre_processing_strategy"] == "Greg Summary":
-                input_data = input_chunks["greg_summary"]
             else:
-                input_data = input_chunks["raw_interview"]
             fill_out_survey(recipe_dict, input_data)
-            proc_spec = get_data_ready(recipe_dict, input_data)
             completed_json = process_specifications(proc_spec)
-            # This is for after doing the stuff with chatgpt actually
-            gold_standard_dict = gold_standard[key]
-            difference = diff(gold_standard_dict, completed_json)
             # Convert to yaml
-            # Convert BOTH to yaml
-            # Side by side
-            # build file
-    return "banana"

 import requests
 from jsondiff import diff
+import yaml
+import pandas as pd
+import os
 # The purpose of this script is to automate running a bunch of tests
 # This script will take an input folder
     except requests.exceptions.RequestException as e:
         print(f"An error occurred while submitting the data: {e}")
def get_data_ready(recipe_dict, input_data_piece):
    """Build the processing specification for one recipe and one input text.

    Args:
        recipe_dict: Recipe settings. The keys read here are
            ``plantings_and_fields_prompt``, ``interactions_prompt``,
            ``treatments_prompt``, ``schema_processing_model``,
            ``prompting_strategy``, ``pre_processing_text``,
            ``pre_processing_strategy`` and ``pre_processing_model``.
        input_data_piece: The input text to process; stored unchanged under
            ``"input_text"``.

    Returns:
        A dict with the keys ``input_style``, ``input_text``, ``prompts`` and
        ``parameters``.

    Raises:
        KeyError: If ``recipe_dict`` lacks one of the keys listed above.
    """
    ## Input chunk structure
    #     "raw_interview": liz_carrot_input_data_raw_interview,
    #
    #
    # recipe_dict = {
    #            "recipe_id": recipe_id,
    #            "testing_strategy_text": testing_strategy_text,
    #            "schema_processing_model": schema_processing_model,
    #            "pre_processing_strategy": pre_processing_strategy,
    #            "pre_processing_text": pre_processing_text,
    #            "pre_processing_model": pre_processing_model,
    #            "prompting_strategy": prompting_strategy,
    #            "plantings_and_fields_prompt": plantings_and_fields_prompt,
    #            "interactions_prompt": interactions_prompt,
    #            "treatments_prompt": treatments_prompt
    #        }
    #
    # The "prompts" sub-dict must exist before its keys are assigned;
    # building it as a literal avoids a KeyError on the first prompt.
    prompts = {
        "firstschemaprompt": recipe_dict["plantings_and_fields_prompt"],
        "secondschemaprompt": recipe_dict["interactions_prompt"],
        "thirdschemaprompt": recipe_dict["treatments_prompt"],
    }

    parameters = {
        "modelversion": recipe_dict["schema_processing_model"],
        "promptstyle": recipe_dict["prompting_strategy"],
    }

    # Pre-processing is switched off only when BOTH the pre-processing text
    # and the pre-processing strategy are absent.
    no_pre_processing = (
        recipe_dict["pre_processing_text"] is None
        and recipe_dict["pre_processing_strategy"] is None
    )
    if no_pre_processing:
        parameters["preprocessdata"] = "no"
    else:
        parameters["preprocessdata"] = "yes"
        parameters["preprocessmodelversion"] = recipe_dict["pre_processing_model"]
        parameters["multiplepreprompts"] = "no"
        parameters["prepromptstyle"] = recipe_dict["pre_processing_strategy"]
        parameters["preprocessingprompt1"] = recipe_dict["pre_processing_text"]
        # Only one pre-processing prompt is used; the other slots stay empty.
        parameters["preprocessingprompt2"] = ""
        parameters["preprocessingprompt3"] = ""

    return {
        "input_style": "big-block-input-text",
        "input_text": input_data_piece,
        "prompts": prompts,
        "parameters": parameters,
    }
 def drive_process():
     # this is to drive the processing process
+    # Get the data from baserow (gold standards JSON and Input data)
     gold_standards, input_data = get_baserow_data()
+    # Get the recipes from baserow too
     my_recipes = get_recipes()
     # Input chunk structure
     #            "otter_summary": liz_carrot_otter_summary_preprocessing,
     #            "greg_summary": liz_carrot_greg_summary_preprocessing
     #        },
+    output_rows = []
+    output_folder = "output_files"
+    if not os.path.exists(output_folder)
+        os.makedirs(output_folder)
     for recipe_dict in my_recipes:
         for key, input_chunks in input_data.items():
+            # Get the input data based on the recipe
             if recipe_dict["pre_processing_strategy"] == "Otter.ai Summary":
+                input_data_piece = input_chunks["otter_summary"]
             elif recipe_dict["pre_processing_strategy"] == "Greg Summary":
+                input_data_piece = input_chunks["greg_summary"]
             else:
+                input_data_piece = input_chunks["raw_interview"]
+            # Fill out a Surveystack submission
             fill_out_survey(recipe_dict, input_data)
+            # Prepare the data for the structured output setup
+            proc_spec = get_data_ready(recipe_dict, input_data_piece)
             completed_json = process_specifications(proc_spec)
+            # Get the gold standard for this input_chunk (liz_carrot, ben_soybean, wally_squash)
+            # Compare the generated JSON to the gold standard
+            gold_standard_json = gold_standard[key]
+            differences = list(diff(gold_standard_json, completed_json))
             # Convert to yaml
+            gold_standard_yaml = yaml.dump(gold_standard_json, default_flow_style=False)
+            comparison_yaml = yaml.dump(completed_json, default_flow_style=False)
+            recipe_id = recipe_dict.get("recipe_id", "N/A")
+            output_rows.append({
+                "Recipe_ID": recipe_id,
+                "Testing_Strategy_Text": recipe_dict.get("testing_strategy_text", "N/A"),
+                "Schema_Processing_Model": recipe_dict.get("schema_processing_model", "N/A"),
+                "Pre_Processing_Strategy": recipe_dict.get("pre_processing_strategy", "N/A"),
+                "Pre_Processing_Text": recipe_dict.get("pre_processing_text", "N/A"),
+                "Pre_Processing_Model": recipe_dict.get("pre_processing_model", "N/A"),
+                "Prompting_Strategy": recipe_dict.get("prompting_strategy", "N/A"),
+                "Plantings_and_Fields_Prompt": recipe_dict.get("plantings_and_fields_prompt", "N/A"),
+                "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
+                "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
+                "Input_Transcript": input_data,
+                "Gold_Standard_Key_Values": json.dumps(gold_standard_json, indent=2),
+                "Machine_Generated_Key_Values": json.dumps(completed_json, indent=2),
+                "Differences": json.dumps(differences, indent=2),
+                "Gold_Standard_YAML": gold_standard_yaml,
+                "Machine_Generated_YAML": comparison_yaml
+            })
+            df = pd.DataFrame(output_rows)
+            markdown_output = generate_markdown_output(df)
+            output_file = f"/output/recipe_run_{recipe_id}_{key}.md"
+            output_file_path = os.path.join(output_folder, output_file_name)
+            with open(output_file_path, "w") as file:
+                file.write(markdown_output)
+            print(f"Markdown file saved at {output_file_path}")
+    return output_folder