# data-translation-experiments / script_for_automation.py
# (Hugging Face blob-page header preserved as a comment so the file parses:
#  uploaded by rosemariafontana, "Update script_for_automation.py",
#  commit 73600ae verified, 23.1 kB)
import requests
from jsondiff import diff
import yaml
import pandas as pd
import os
import shutil
import json
from datetime import datetime
# The purpose of this script is to automate running a bunch of tests
# This script will take an input folder
# The input folder should contain:
# 1. A file containing a list of the recipe parameters
# 2. A file containing the input data for each of the schemas
# 3. ....
# Steps to do this that we will outline then perform
# First, get the gold standard JSONs from baserow
# Next, get the recipe parameter list from the input folder
# Iterate through the recipe parameter list one at a time
# In the iteration, first fill out a surveystack submission - is this possible with the current surveystack API?
# Next, save the surveystack submission ID (?)
# Use the iteration parameters to then get the three JSONs back from chatgpt
# Compare the JSONs to the gold standard JSONs
# Print out the differences in a .csv
# Print out a side by side of the yaml
# store all these together
# continue through iterations
# create downloadables of the results
BASEROW_API_KEY = os.getenv("BASEROW_API_KEY")
from process_data import process_specifications
def get_baserow_url(table_id):
    """Return the Baserow list-rows endpoint URL for the given table id."""
    print("GETTING BASEROW URL")
    api_base = "https://baserow.f11804a1.federatedcomputer.net/api"
    return "{base}/database/rows/table/{table}/?user_field_names=true".format(
        base=api_base, table=table_id
    )
def get_baserow_data():
    """Fetch the gold-standard JSONs and the interview input data from Baserow.

    Reads table 560 and collects, per dataset (liz_carrot, ben_soybean,
    wally_squash), the three gold-standard columns and the three input-text
    columns.

    Returns:
        tuple[dict, dict]: (gold_standards, input_data), each keyed by dataset
        name. On a request failure, returns two empty dicts instead of the
        original implicit None so that callers unpacking the result don't
        crash with a TypeError.
    """
    print("GETTING BASEROW DATA")
    TABLE_ID = "560"
    BASEROW_URL = get_baserow_url(TABLE_ID)
    headers = {
        "Authorization": f"Token {os.environ['BASEROW_API_KEY']}",
        "Content-Type": "application/json"
    }
    # Baserow row id -> dataset key used throughout the pipeline.
    # BUG FIX: the original assigned 18 separate locals inside if/elif arms and
    # then read them unconditionally, raising NameError if any of rows 2/3/5
    # was missing from the table. Building the dicts per-row avoids that.
    row_id_to_dataset = {2: "liz_carrot", 3: "ben_soybean", 5: "wally_squash"}
    gold_standards = {}
    input_data = {}
    print("STARTING TO TRY RESPONSE REQUEST")
    try:
        response = requests.get(BASEROW_URL, headers=headers)
        print("GOT")
        response.raise_for_status()
        rows = response.json()
        results = rows.get("results", [])
        print("PARSING ROWS NOW")
        for row in results:
            print(f"Row ID: {row.get('id')}, Data: {row}")
            dataset = row_id_to_dataset.get(row.get("id"))
            if dataset is None:
                continue  # rows other than 2/3/5 are not part of the test set
            gold_standards[dataset] = {
                "planting": row.get("Plantings and Fields - Gold Standard"),
                "interactions": row.get("Interactions - Gold Standard"),
                "trials": row.get("Trials - Gold Standard"),
            }
            input_data[dataset] = {
                "raw_interview": row.get("Raw Interview"),
                "otter_summary": row.get("Otter Summary"),
                "greg_summary": row.get("Post-Interview Summary"),
            }
        print("BASEROW DATA DONE GOT")
        print("GOLD STANDARDS HERE")
        print(gold_standards)
        print("INPUT DATA HERE")
        print(input_data)
        return gold_standards, input_data
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch rows: {e}")
        # BUG FIX: explicit empty containers instead of an implicit None.
        return {}, {}
def get_recipes():
    """Fetch the list of recipe parameter sets from Baserow table 578.

    Returns:
        list[dict]: one dict per recipe row with normalized snake_case keys.
        On a request failure, returns an empty list instead of the original
        implicit None so that callers can iterate the result safely.
    """
    print("GETTING RECIPES FROM BASEROW NOW")
    TABLE_ID = "578"
    BASEROW_URL = get_baserow_url(TABLE_ID)
    headers = {
        "Authorization": f"Token {os.environ['BASEROW_API_KEY']}",
        "Content-Type": "application/json"
    }
    # Baserow column name -> local recipe-dict key.
    field_map = {
        "Recipe ID": "recipe_id",
        "Testing Strategy for Set": "testing_strategy_text",
        "Schema Processing Model": "schema_processing_model",
        "Pre-Processing Strategy": "pre_processing_strategy",
        "Pre-Prompt Text": "pre_processing_text",
        "Preprocessing Model": "pre_processing_model",
        "Prompting Strategy": "prompting_strategy",
        "Plantings and Fields Prompting Text": "plantings_and_fields_prompt",
        "Interactions Prompting Text": "interactions_prompt",
        "Treatments Prompting Text": "treatments_prompt",
    }
    print("TRYING TO GET A RESPONSE")
    try:
        response = requests.get(BASEROW_URL, headers=headers)
        response.raise_for_status()
        rows = response.json()
        results = rows.get("results", [])
        my_recipes = []
        print("PARSING ROWS")
        for row in results:
            print(f"Row ID: {row.get('id')}, Data: {row}")
            my_recipes.append({key: row.get(column) for column, key in field_map.items()})
        print("FINISHED GETTING THE RECIPE DATA")
        print("RECIPES HERE")
        print(my_recipes)
        return my_recipes
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch rows: {e}")
        # BUG FIX: explicit empty list instead of an implicit None.
        return []
def fill_out_survey(recipe_dict, input_data):
    """Submit one recipe/input combination to SurveyStack.

    Args:
        recipe_dict: recipe parameters as produced by get_recipes().
        input_data: the text block to submit as the survey's long input.

    Returns:
        The submission payload dict on success, otherwise None.
    """
    print("filling out survey")
    survey_id = "673b4994aef86f0533b3546c"
    base_url = "https://app.surveystack.io/api/submissions"
    # A recipe without pre-prompt text runs without a pre-processing step.
    # BUG FIX: the original only assigned `pre_processing` in the no-text
    # branch (the other branch did `pre_process = recipe_dict`), so any recipe
    # WITH pre-processing raised NameError at `if pre_processing:`.
    pre_processing = recipe_dict.get("pre_processing_text") is not None
    if pre_processing:
        parameters = {
            "modelversion": recipe_dict["schema_processing_model"],
            "preprocessdata": ["yes"],
            "promptstyle": recipe_dict["prompting_strategy"],
            # BUG FIX: was recipe_dict["prompting_strategy"], which put the
            # prompting strategy where the pre-processing model belongs.
            "preprocessmodelversion": recipe_dict["pre_processing_model"],
            "multiplepreprompts": "no",
            "prepromptstyle": recipe_dict["pre_processing_strategy"],
            "preprocessingprompt1": recipe_dict["pre_processing_text"],
            "preprocessingprompt2": "",
            "preprocessingprompt3": ""
        }
    else:
        parameters = {
            "modelversion": recipe_dict["schema_processing_model"],
            "preprocessdata": ["no"],
            "promptstyle": recipe_dict["prompting_strategy"],
            "preprocessmodelversion": None,
            "multiplepreprompts": "no",
            "prepromptstyle": None,
            "preprocessingprompt1": None,
            "preprocessingprompt2": None,
            "preprocessingprompt3": None
        }
    # The "data" portion is identical in both branches, so build it once.
    submission_data = {
        "survey": survey_id,
        "data": {
            "inputstyle": "big-block-input-text",
            "onelonginputtext": input_data,
            "schema_prompt": {
                "firstschemaprompt": recipe_dict["plantings_and_fields_prompt"],
                "secondschemaprompt": recipe_dict["interactions_prompt"],
                "thirdschemaprompt": recipe_dict["treatments_prompt"],
            },
        },
        "parameters": parameters
    }
    headers = {
        "Content-Type": "application/json",
    }
    print("GETTING SURVEY RESPONSE")
    try:
        response = requests.post(base_url, headers=headers, data=json.dumps(submission_data))
        response.raise_for_status()
        # raise_for_status() already rejected 4xx/5xx; accept any 2xx here
        # (the original's `== 200` check silently treated 201 as a failure).
        if response.ok:
            print("Submission successful to SurveyStack!")
            print(response.json())
            return submission_data
        print(f"Failed to submit: {response.status_code} - {response.text}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while submitting the data: {e}")
def get_data_ready(recipe_dict, input_data_piece):
    """Assemble the specification dict consumed by process_specifications().

    Args:
        recipe_dict: recipe parameters as produced by get_recipes()
            (keys: recipe_id, testing_strategy_text, schema_processing_model,
            pre_processing_strategy, pre_processing_text, pre_processing_model,
            prompting_strategy, plantings_and_fields_prompt,
            interactions_prompt, treatments_prompt).
        input_data_piece: the single input text chosen for this run
            (raw interview, Otter summary, or post-interview summary).

    Returns:
        dict: with keys input_style, input_text, prompts, parameters.
    """
    print("GETTING DATA READY")
    processed_data = {}
    processed_data["input_style"] = 'big-block-input-text'
    processed_data["input_text"] = input_data_piece
    # BUG FIX: the original indexed processed_data["prompts"][...] without
    # ever creating the nested dict, which raised KeyError on the first call.
    processed_data["prompts"] = {}
    processed_data["prompts"]["firstschemaprompt"] = recipe_dict["plantings_and_fields_prompt"]
    processed_data["prompts"]["secondschemaprompt"] = recipe_dict["interactions_prompt"]
    processed_data["prompts"]["thirdschemaprompt"] = recipe_dict["treatments_prompt"]
    processed_data["parameters"] = {}
    processed_data["parameters"]["modelversion"] = recipe_dict["schema_processing_model"]
    processed_data["parameters"]["promptstyle"] = recipe_dict["prompting_strategy"]
    # Pre-processing is on unless BOTH the text and the strategy are absent.
    if recipe_dict["pre_processing_text"] is None and recipe_dict["pre_processing_strategy"] is None:
        processed_data["parameters"]["preprocessdata"] = "no"
    else:
        processed_data["parameters"]["preprocessdata"] = "yes"
    processed_data["parameters"]["preprocessmodelversion"] = recipe_dict["pre_processing_model"]
    processed_data["parameters"]["multiplepreprompts"] = "no"
    processed_data["parameters"]["prepromptstyle"] = recipe_dict["pre_processing_strategy"]
    processed_data["parameters"]["preprocessingprompt1"] = recipe_dict["pre_processing_text"]
    processed_data["parameters"]["preprocessingprompt2"] = ""
    processed_data["parameters"]["preprocessingprompt3"] = ""
    print("DID THAT NOW")
    return processed_data
def generate_markdown_output(df):
    """Render the results DataFrame as a human-readable markdown report.

    Args:
        df: pandas DataFrame with one row per (recipe, input) run. Expected
            columns: Recipe_ID, Testing_Strategy_Text, Schema_Processing_Model,
            Pre_Processing_Strategy, Pre_Processing_Text, Pre_Processing_Model,
            Prompting_Strategy, Plantings_and_Fields_Prompt,
            Interactions_Prompt, Treatments_Prompt, Input_Transcript (str),
            Gold_Standard_Key_Values, Machine_Generated_Key_Values,
            Differences (JSON string), Gold_Standard_YAML,
            Machine_Generated_YAML.

    Returns:
        str: the assembled markdown document.
    """
    # Accumulate fragments and join once, instead of repeated string +=.
    parts = []
    # --- Recipe basics -----------------------------------------------------
    parts.append("### Recipe Fields (Basic Information)\n")
    parts.append("| Recipe ID | Testing Strategy | Schema Processing Model | Pre-Processing Strategy | Pre-Processing Text | Pre-Processing Model | Prompting Strategy |\n")
    parts.append("|-----------|------------------|-------------------------|--------------------------|---------------------|----------------------|-------------------|\n")
    for _, row in df.iterrows():
        parts.append(f"| {row['Recipe_ID']} | {row['Testing_Strategy_Text']} | {row['Schema_Processing_Model']} | {row['Pre_Processing_Strategy']} | {row['Pre_Processing_Text']} | {row['Pre_Processing_Model']} | {row['Prompting_Strategy']} |\n")
    # --- Prompts -----------------------------------------------------------
    parts.append("\n### Prompts\n")
    parts.append("| Plantings and Fields Prompt | Interactions Prompt | Treatments Prompt |\n")
    parts.append("|-----------------------------|---------------------|-------------------|\n")
    for _, row in df.iterrows():
        parts.append(f"| {row['Plantings_and_Fields_Prompt']} | {row['Interactions_Prompt']} | {row['Treatments_Prompt']} |\n")
    # --- Input transcript (truncated for readability) ----------------------
    parts.append("\n### Input Transcript\n")
    parts.append("Since the input transcript might be very long, it is truncated here for readability:\n")
    for _, row in df.iterrows():
        transcript = row['Input_Transcript']
        truncated_input = (transcript[:500] + '...') if len(transcript) > 500 else transcript
        parts.append(f"**Recipe ID {row['Recipe_ID']}**: {truncated_input}\n\n")
    # --- Side-by-side key/value comparison ---------------------------------
    parts.append("\n### Gold Standard vs Machine Generated Key-Values\n")
    parts.append("| Key | Gold Standard | Machine Generated |\n")
    parts.append("|-----|---------------|-------------------|\n")
    for _, row in df.iterrows():
        parts.append(f"| {row['Recipe_ID']} | {row['Gold_Standard_Key_Values']} | {row['Machine_Generated_Key_Values']} |\n")
    # --- Differences -------------------------------------------------------
    parts.append("\n### Differences\n")
    parts.append("The following differences were found between the gold standard and the machine-generated output:\n")
    parts.append("| Key | Difference |\n")
    parts.append("|-----|------------|\n")
    for _, row in df.iterrows():
        # 'Differences' is a JSON string assumed to hold a list of dicts that
        # may carry a "values_changed" list of {path, old_value, new_value}
        # items — TODO confirm this matches what jsondiff actually emits.
        # BUG FIX: the loop variable was named `diff`, shadowing the imported
        # jsondiff.diff within this function; renamed to `diff_entry`.
        differences = json.loads(row['Differences'])
        for diff_entry in differences:
            if diff_entry.get("values_changed"):
                for change in diff_entry["values_changed"]:
                    parts.append(f"| {change['path']} | {change['old_value']} -> {change['new_value']} |\n")
    # --- Side-by-side YAML for human inspection ----------------------------
    parts.append("\n### Gold Standard vs Machine Generated YAML\n")
    parts.append("| Gold Standard YAML | Machine Generated YAML |\n")
    parts.append("|--------------------|------------------------|\n")
    for _, row in df.iterrows():
        parts.append(f"| ```yaml\n{row['Gold_Standard_YAML']}\n``` | ```yaml\n{row['Machine_Generated_YAML']}\n``` |\n")
    return "".join(parts)
def drive_process():
    """Drive the whole experiment pipeline.

    Fetches gold standards, input data, and recipes from Baserow, runs every
    recipe against every input set, diffs the generated JSON against the gold
    standard, writes per-run markdown/JSON artifacts, and zips everything up.

    Returns:
        str: path to the zip archive containing all per-recipe outputs.
    """
    print("We are starting to DRIVE PROCESS")
    # Get the data from baserow (gold standards JSON and Input data)
    gold_standards, input_data = get_baserow_data()
    # Get the recipes from baserow too
    my_recipes = get_recipes()
    print("Making the OUTPUT STUFF")
    output_rows = []
    # Timestamped folder so repeated runs never collide.
    output_folder = "output_results_" + datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs(output_folder, exist_ok=True)
    print("GOING THROUGH RECIPES NOW")
    for recipe_dict in my_recipes:
        for key, input_chunks in input_data.items():
            print("RECIPE INFO")
            print(key)
            print(recipe_dict["recipe_id"])
            # Pick the input variant matching the recipe's pre-processing strategy.
            if recipe_dict["pre_processing_strategy"] == "Otter.ai Summary":
                input_data_piece = input_chunks["otter_summary"]
            elif recipe_dict["pre_processing_strategy"] == "Greg Summary":
                input_data_piece = input_chunks["greg_summary"]
            else:
                input_data_piece = input_chunks["raw_interview"]
            print("DECIDED INPUT DATA")
            print(input_data_piece)
            # Fill out a Surveystack submission.
            # BUG FIX: was passed the whole input_data dict; the survey needs
            # the text block selected for this run.
            fill_out_survey(recipe_dict, input_data_piece)
            # Prepare the data for the structured output setup
            proc_spec = get_data_ready(recipe_dict, input_data_piece)
            print("PROCESSING SPECIFICATIONS!!!!!!!!!!!!!!!")
            completed_json = process_specifications(proc_spec)
            print("Gold Standard diff and stuff")
            # Compare the generated JSON to this dataset's gold standard.
            # BUG FIX: was `gold_standard[key]` — an undefined name.
            gold_standard_json = gold_standards[key]
            differences = list(diff(gold_standard_json, completed_json))
            print("yaml world")
            # Convert both sides to YAML for the side-by-side report.
            gold_standard_yaml = yaml.dump(gold_standard_json, default_flow_style=False)
            comparison_yaml = yaml.dump(completed_json, default_flow_style=False)
            recipe_id = recipe_dict.get("recipe_id", "N/A")
            output_rows.append({
                "Recipe_ID": recipe_id,
                "Testing_Strategy_Text": recipe_dict.get("testing_strategy_text", "N/A"),
                "Schema_Processing_Model": recipe_dict.get("schema_processing_model", "N/A"),
                "Pre_Processing_Strategy": recipe_dict.get("pre_processing_strategy", "N/A"),
                "Pre_Processing_Text": recipe_dict.get("pre_processing_text", "N/A"),
                "Pre_Processing_Model": recipe_dict.get("pre_processing_model", "N/A"),
                "Prompting_Strategy": recipe_dict.get("prompting_strategy", "N/A"),
                "Plantings_and_Fields_Prompt": recipe_dict.get("plantings_and_fields_prompt", "N/A"),
                "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
                "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
                # BUG FIX: was the whole input_data dict; the markdown
                # generator slices and len()s this field, so it must be the
                # string actually used for this run.
                "Input_Transcript": input_data_piece,
                "Gold_Standard_Key_Values": json.dumps(gold_standard_json, indent=2),
                "Machine_Generated_Key_Values": json.dumps(completed_json, indent=2),
                "Differences": json.dumps(differences, indent=2),
                "Gold_Standard_YAML": gold_standard_yaml,
                "Machine_Generated_YAML": comparison_yaml
            })
            # NOTE(review): output_rows accumulates across the loop, so each
            # per-run markdown report includes every run so far — preserved
            # as-is; confirm whether per-run reports were intended instead.
            df = pd.DataFrame(output_rows)
            print("dataframe done now onto markdown")
            markdown_output = generate_markdown_output(df)
            recipe_folder = os.path.join(output_folder, f"recipe_{recipe_dict['recipe_id']}")
            os.makedirs(recipe_folder, exist_ok=True)
            # One common stem for all artifacts of this (recipe, input) run.
            file_stem = f"recipe_{recipe_dict['recipe_id']}_data_{key}"
            with open(os.path.join(recipe_folder, f"{file_stem}_output.md"), 'w') as f:
                f.write(markdown_output)
            with open(os.path.join(recipe_folder, f"{file_stem}_gold_standard.json"), 'w') as f:
                json.dump(gold_standard_json, f, indent=2)
            with open(os.path.join(recipe_folder, f"{file_stem}_generated.json"), 'w') as f:
                json.dump(completed_json, f, indent=2)
            with open(os.path.join(recipe_folder, f"{file_stem}_differences.json"), 'w') as f:
                json.dump(differences, f, indent=2)
    print("ZIPPING UP WHOLE THING")
    # Zip the entire output folder; make_archive returns the archive's path.
    zip_filename = shutil.make_archive(output_folder, 'zip', output_folder)
    # Cleanup by removing the unzipped folder after zipping it
    shutil.rmtree(output_folder)
    # BUG FIX: removed the unreachable `return output_folder` that followed
    # this return in the original.
    return zip_filename