data-translation-experiments / script_for_automation.py
rosemariafontana's picture
Update script_for_automation.py
bb2ea04 verified
raw
history blame
12.4 kB
import json
import os

import requests
from jsondiff import diff
# The purpose of this script is to automate running a bunch of tests
# This script will take an input folder
# The input folder should contain:
# 1. A file containing a list of the recipe parameters
# 2. A file containing the input data for each of the schemas
# 3. ....
# Steps to do this that we will outline then perform
# First, get the gold standard JSONs from baserow
# Next, get the recipe parameter list from the input folder
# Iterate through the recipe parameter list one at a time
# In the iteration, first fill out a surveystack submission - is this possible with the current surveystack API?
# Next, save the surveystack submission ID (?)
# Use the iteration parameters to then get the three JSONs back from chatgpt
# Compare the JSONs to the gold standard JSONs
# Print out the differences in a .csv
# Print out a side by side of the yaml
# store all these together
# continue through iterations
# create downloadables of the results
# Baserow API token, read from the environment at import time.
# NOTE(review): may be None if the env var is unset — requests will then be
# sent with "Token None" and fail authentication; confirm deployment sets it.
BASEROW_API_KEY = os.getenv("BASEROW_API_KEY")
# Project-local helper that runs the ChatGPT schema-extraction step.
from process_data import process_specifications
def get_baserow_url(table_id):
    """Build the Baserow "list rows" endpoint URL for the given table.

    Args:
        table_id: Baserow table identifier (string or int).

    Returns:
        Full URL with ``user_field_names=true`` so rows come back keyed by
        their human-readable field names.
    """
    api_root = "https://baserow.f11804a1.federatedcomputer.net/api"
    row_path = f"database/rows/table/{table_id}/"
    return f"{api_root}/{row_path}?user_field_names=true"
def get_baserow_data():
    """Fetch the gold-standard JSONs and raw input data from Baserow table 560.

    Each known row id maps to one test case (farmer/crop). For every row
    found, two parallel dicts are populated, keyed by the case name:

    - gold_standards[name]: {"planting", "interactions", "trials"}
    - input_data[name]: {"raw_interview", "otter_summary", "greg_summary"}

    Returns:
        (gold_standards, input_data) — both empty dicts if the request fails
        or no known rows are present (the original raised NameError / returned
        None in those cases, which broke the caller's tuple unpack).
    """
    TABLE_ID = "560"
    BASEROW_URL = get_baserow_url(TABLE_ID)
    headers = {
        # Use the module-level constant; the original re-read os.environ with
        # nested double quotes inside the f-string (SyntaxError before 3.12).
        "Authorization": f"Token {BASEROW_API_KEY}",
        "Content-Type": "application/json",
    }
    # Maps Baserow row id -> short case name used as the dict key.
    row_names = {2: "liz_carrot", 3: "ben_soybean", 5: "wally_squash"}
    gold_standards = {}
    input_data = {}
    try:
        response = requests.get(BASEROW_URL, headers=headers)
        response.raise_for_status()
        results = response.json().get("results", [])
        for row in results:
            print(f"Row ID: {row.get('id')}, Data: {row}")
            name = row_names.get(row.get("id"))
            if name is None:
                continue  # not one of the three tracked test cases
            gold_standards[name] = {
                "planting": row.get("Plantings and Fields - Gold Standard"),
                "interactions": row.get("Interactions - Gold Standard"),
                "trials": row.get("Trials - Gold Standard"),
            }
            input_data[name] = {
                "raw_interview": row.get("Raw Interview"),
                "otter_summary": row.get("Otter Summary"),
                "greg_summary": row.get("Post-Interview Summary"),
            }
        # Retrieval example:
        #   liz_carrot_planting = gold_standards["liz_carrot"]["planting"]
        return gold_standards, input_data
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch rows: {e}")
        return gold_standards, input_data
def get_recipes():
    """Fetch the recipe-parameter rows from Baserow table 578.

    Returns:
        A list of dicts, one per recipe row, with normalized snake_case keys.
        Returns an empty list if the request fails (the original implicitly
        returned None, which crashed the caller's for-loop).
    """
    TABLE_ID = "578"
    BASEROW_URL = get_baserow_url(TABLE_ID)
    headers = {
        # Use the module-level constant; the original re-read os.environ with
        # nested double quotes inside the f-string (SyntaxError before 3.12).
        "Authorization": f"Token {BASEROW_API_KEY}",
        "Content-Type": "application/json",
    }
    my_recipes = []
    try:
        response = requests.get(BASEROW_URL, headers=headers)
        response.raise_for_status()
        results = response.json().get("results", [])
        for row in results:
            print(f"Row ID: {row.get('id')}, Data: {row}")
            # The original dict literal used commas instead of colons for
            # most entries, which is a SyntaxError — fixed here.
            recipe_dict = {
                "recipe_id": row.get("Recipe ID"),
                "testing_strategy_text": row.get("Testing Strategy for Set"),
                "schema_processing_model": row.get("Schema Processing Model"),
                "pre_processing_strategy": row.get("Pre-Processing Strategy"),
                "pre_processing_text": row.get("Pre-Prompt Text"),
                "pre_processing_model": row.get("Preprocessing Model"),
                "prompting_strategy": row.get("Prompting Strategy"),
                "plantings_and_fields_prompt": row.get("Plantings and Fields Prompting Text"),
                "interactions_prompt": row.get("Interactions Prompting Text"),
                "treatments_prompt": row.get("Treatments Prompting Text"),
            }
            my_recipes.append(recipe_dict)
        return my_recipes
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch rows: {e}")
        return my_recipes
def fill_out_survey(recipe_dict, input_data):
    """Build and POST one SurveyStack submission for a recipe/input pair.

    Args:
        recipe_dict: one recipe dict as produced by get_recipes().
        input_data: the interview text (raw or pre-summarized) to submit.

    Returns:
        The submission_data dict on success, or None if the POST failed.
    """
    survey_id = "673b4994aef86f0533b3546c"
    base_url = "https://app.surveystack.io/api/submissions"
    # Pre-processing is enabled whenever the recipe carries pre-prompt text.
    # (The original read an unassigned `pre_processing` variable in the
    # "text present" branch, raising NameError.)
    pre_processing = recipe_dict.get("pre_processing_text") is not None
    # The survey "data" payload is identical in both modes; only the
    # "parameters" payload differs, so build them separately. The original
    # also dropped the comma between "data" and "parameters" in the
    # non-pre-processing literal (SyntaxError).
    data_payload = {
        "inputstyle": "big-block-input-text",
        "onelonginputtext": input_data,
        "schema_prompt": {
            "firstschemaprompt": recipe_dict["plantings_and_fields_prompt"],
            "secondschemaprompt": recipe_dict["interactions_prompt"],
            "thirdschemaprompt": recipe_dict["treatments_prompt"],
        },
    }
    if pre_processing:
        parameters = {
            "modelversion": recipe_dict["schema_processing_model"],
            "preprocessdata": ["yes"],
            "promptstyle": recipe_dict["prompting_strategy"],
            # Fixed apparent copy-paste bug: the original sent
            # recipe_dict["prompting_strategy"] as the pre-process model.
            "preprocessmodelversion": recipe_dict["pre_processing_model"],
            "multiplepreprompts": "no",
            "prepromptstyle": recipe_dict["pre_processing_strategy"],
            "preprocessingprompt1": recipe_dict["pre_processing_text"],
            "preprocessingprompt2": "",
            "preprocessingprompt3": "",
        }
    else:
        parameters = {
            "modelversion": recipe_dict["schema_processing_model"],
            "preprocessdata": ["no"],
            "promptstyle": recipe_dict["prompting_strategy"],
            "preprocessmodelversion": None,
            "multiplepreprompts": "no",
            "prepromptstyle": None,
            "preprocessingprompt1": None,
            "preprocessingprompt2": None,
            "preprocessingprompt3": None,
        }
    submission_data = {
        "survey": survey_id,
        "data": data_payload,
        "parameters": parameters,
    }
    headers = {"Content-Type": "application/json"}
    try:
        response = requests.post(base_url, headers=headers, data=json.dumps(submission_data))
        # raise_for_status() already raises on 4xx/5xx, so the original
        # `status_code == 200` else-branch was effectively unreachable.
        response.raise_for_status()
        print("Submission successful to SurveyStack!")
        print(response.json())
        return submission_data
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while submitting the data: {e}")
        return None
def drive_process():
    """Drive the full test loop: submit every recipe against every test case,
    run the ChatGPT extraction, and diff the result against the gold standard.

    Returns:
        "banana" (placeholder sentinel, kept from the original).
    """
    gold_standards, input_data = get_baserow_data()
    my_recipes = get_recipes()
    # input_data[key] chunks look like:
    #   {"raw_interview": ..., "otter_summary": ..., "greg_summary": ...}
    for recipe_dict in my_recipes:
        for key, input_chunks in input_data.items():
            # Select the input variant for this recipe's pre-processing
            # strategy. Use a fresh local name: the original rebound
            # `input_data` here, clobbering the dict being iterated and
            # breaking every subsequent recipe iteration.
            strategy = recipe_dict["pre_processing_strategy"]
            if strategy == "Otter.ai Summary":
                selected_input = input_chunks["otter_summary"]
            elif strategy == "Greg Summary":
                selected_input = input_chunks["greg_summary"]
            else:
                selected_input = input_chunks["raw_interview"]
            fill_out_survey(recipe_dict, selected_input)
            # NOTE(review): get_data_ready is not defined anywhere in this
            # file — confirm it is meant to be imported from process_data.
            proc_spec = get_data_ready(recipe_dict, selected_input)
            completed_json = process_specifications(proc_spec)
            # Fixed undefined name: the original read `gold_standard` (no "s").
            gold_standard_dict = gold_standards[key]
            difference = diff(gold_standard_dict, completed_json)
            # TODO: convert both JSONs to YAML, render side by side, and
            # write the per-iteration results/diff .csv files.
    return "banana"