| import requests |
| from jsondiff import diff |
| import yaml |
| import pandas as pd |
| import os |
| import shutil |
| import json |
| from datetime import datetime |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Baserow API token from the environment (None when unset).
# NOTE(review): this module-level constant appears unused — the request
# helpers below read os.environ['BASEROW_API_KEY'] directly; confirm before removing.
BASEROW_API_KEY = os.getenv("BASEROW_API_KEY")
from process_data import process_specifications
|
|
def get_baserow_url(table_id):
    """Return the Baserow list-rows endpoint URL for *table_id* (field names enabled)."""
    print("GETTING BASEROW URL")
    api_base = "https://baserow.f11804a1.federatedcomputer.net/api"
    return "{0}/database/rows/table/{1}/?user_field_names=true".format(api_base, table_id)
|
|
def get_baserow_data():
    """Fetch gold-standard outputs and interview inputs from Baserow table 560.

    Returns:
        tuple[dict, dict] | None: ``(gold_standards, input_data)``, both keyed
        by dataset name ("liz_carrot", "ben_soybean", "wally_squash"), with
        the same inner keys the rest of the pipeline expects. Returns ``None``
        (implicitly) when the HTTP request fails, matching the original
        behavior of printing the error and falling through.
    """
    print("GETTING BASEROW DATA")

    TABLE_ID = "560"
    BASEROW_URL = get_baserow_url(TABLE_ID)

    headers = {
        "Authorization": f"Token {os.environ['BASEROW_API_KEY']}",
        "Content-Type": "application/json"
    }

    # Map Baserow row ids to dataset names. The previous version assigned a
    # separate set of local variables per row id, which raised NameError when
    # any of rows 2/3/5 was missing from the table; building the dicts
    # incrementally avoids that and removes the triplicated field lists.
    row_id_to_name = {2: "liz_carrot", 3: "ben_soybean", 5: "wally_squash"}

    print("STARTING TO TRY RESPONSE REQUEST")
    try:
        response = requests.get(BASEROW_URL, headers=headers)
        print("GOT")
        response.raise_for_status()
        rows = response.json()
        results = rows.get("results", [])

        print("PARSING ROWS NOW")
        gold_standards = {}
        input_data = {}
        for row in results:
            print(f"Row ID: {row.get('id')}, Data: {row}")
            name = row_id_to_name.get(row.get("id"))
            if name is None:
                continue  # only rows 2, 3 and 5 carry evaluation datasets

            gold_standards[name] = {
                "planting": row.get("Plantings and Fields - Gold Standard"),
                "interactions": row.get("Interactions - Gold Standard"),
                "trials": row.get("Trials - Gold Standard"),
            }
            input_data[name] = {
                "raw_interview": row.get("Raw Interview"),
                "otter_summary": row.get("Otter Summary"),
                "greg_summary": row.get("Post-Interview Summary"),
            }

        print("BASEROW DATA DONE GOT")
        print("GOLD STANDARDS HERE")
        print(gold_standards)
        print("INPUT DATA HERE")
        print(input_data)
        return gold_standards, input_data

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch rows: {e}")
| |
def get_recipes():
    """Fetch recipe configurations from Baserow table 589.

    Returns:
        list[dict] | None: one dict per row with the keys the pipeline uses
        ("recipe_id", prompts, strategies, models). Returns ``None``
        (implicitly) when the HTTP request fails, matching the original
        behavior of printing the error and falling through.
    """
    print("GETTING RECIPES FROM BASEROW NOW")

    TABLE_ID = "589"
    BASEROW_URL = get_baserow_url(TABLE_ID)

    headers = {
        "Authorization": f"Token {os.environ['BASEROW_API_KEY']}",
        "Content-Type": "application/json"
    }

    def _single_select(row, field):
        # Single-select cells arrive as {"value": ...}. The previous
        # `row.get(field, {}).get("value")` crashed with AttributeError when
        # the field was present but null.
        cell = row.get(field)
        return cell.get("value") if isinstance(cell, dict) else None

    def _multi_select_first(row, field):
        # Multi-select cells arrive as a list of {"value": ...}. The previous
        # `row.get(field, [{}])[0]` raised IndexError on an empty list.
        cell = row.get(field)
        if isinstance(cell, list) and cell and isinstance(cell[0], dict):
            return cell[0].get("value")
        return None

    print("TRYING TO GET A RESPONSE")
    try:
        response = requests.get(BASEROW_URL, headers=headers)
        response.raise_for_status()
        rows = response.json()
        results = rows.get("results", [])

        my_recipes = []
        print("PARSING ROWS")
        for row in results:
            print(f"Row ID: {row.get('id')}, Data: {row}")
            my_recipes.append({
                "recipe_id": row.get("Recipe ID"),
                "testing_strategy_text": row.get("Testing Strategy for Set"),
                "schema_processing_model": _single_select(row, "Schema Processing Model"),
                "pre_processing_strategy": _multi_select_first(row, "Pre-Processing Strategy"),
                "pre_processing_text": row.get("Pre-Prompt Text"),
                "pre_processing_model": _single_select(row, "Preprocessing Model"),
                "prompting_strategy": _multi_select_first(row, "Prompting Strategy"),
                "plantings_and_fields_prompt": row.get("Plantings and Fields Prompting Text"),
                "interactions_prompt": row.get("Interactions Prompting Text"),
                "treatments_prompt": row.get("Treatments Prompting Text"),
            })

        print("FINISHED GETTING THE RECIPE DATA")
        print("RECIPES HERE")
        print(my_recipes)
        return my_recipes

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch rows: {e}")
|
|
def fill_out_survey(recipe_dict, input_data):
    """Submit one recipe configuration plus its input text to SurveyStack.

    Args:
        recipe_dict: recipe as produced by get_recipes().
        input_data: the (pre-selected) transcript/summary text to process.

    Returns:
        dict | None: the submitted payload on HTTP 200; None on any failure
        (the error is printed, matching prior behavior).
    """
    print("filling out survey")
    survey_id = "673b4994aef86f0533b3546c"
    base_url = "https://app.surveystack.io/api/submissions"

    # A recipe counts as "pre-processing" when it carries pre-prompt text.
    # (The old code also created unused locals `pre_process` — which was
    # mistakenly assigned the whole recipe_dict — and
    # `pre_process_model_version`; both removed.)
    pre_processing = recipe_dict.get("pre_processing_text") is not None

    # Payload shared by both branches; only the pre-processing parameters differ.
    submission_data = {
        "survey": survey_id,
        "data": {
            "inputstyle": "big-block-input-text",
            "onelonginputtext": input_data,
            "schema_prompt": {
                "firstschemaprompt": recipe_dict["plantings_and_fields_prompt"],
                "secondschemaprompt": recipe_dict["interactions_prompt"],
                "thirdschemaprompt": recipe_dict["treatments_prompt"],
            },
        },
        "parameters": {
            "modelversion": recipe_dict["schema_processing_model"],
            "promptstyle": recipe_dict["prompting_strategy"],
            "multiplepreprompts": "no",
        },
    }

    if pre_processing:
        submission_data["parameters"].update({
            "preprocessdata": ["yes"],
            # BUG FIX: this used to send recipe_dict["prompting_strategy"];
            # the pre-processing *model* belongs in this parameter.
            "preprocessmodelversion": recipe_dict["pre_processing_model"],
            "prepromptstyle": recipe_dict["pre_processing_strategy"],
            "preprocessingprompt1": recipe_dict["pre_processing_text"],
            "preprocessingprompt2": "",
            "preprocessingprompt3": "",
        })
    else:
        submission_data["parameters"].update({
            "preprocessdata": ["no"],
            "preprocessmodelversion": None,
            "prepromptstyle": None,
            "preprocessingprompt1": None,
            "preprocessingprompt2": None,
            "preprocessingprompt3": None,
        })

    headers = {
        "Content-Type": "application/json",
    }

    print("GETTING SURVEY RESPONSE")
    try:
        response = requests.post(base_url, headers=headers, data=json.dumps(submission_data))
        response.raise_for_status()

        if response.status_code == 200:
            print("Submission successful to SurveyStack!")
            print(response.json())
            return submission_data
        else:
            # NOTE(review): 2xx statuses other than 200 (e.g. 201 Created)
            # fall through to this failure branch — original behavior kept;
            # confirm SurveyStack actually answers 200.
            print(f"Failed to submit: {response.status_code} - {response.text}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while submitting the data: {e}")
|
|
def get_data_ready(recipe_dict, input_data_piece):
    """Assemble the processing specification consumed by process_specifications().

    Args:
        recipe_dict: recipe as produced by get_recipes().
        input_data_piece: the transcript/summary text chosen for this recipe.

    Returns:
        dict with "inputstyle", "input_text", "prompts" (three schema prompts)
        and "parameters" (model/prompt/pre-processing settings).
    """
    print("GETTING DATA READY")
    processed_data = {}
    processed_data["prompts"] = {}

    processed_data["inputstyle"] = 'big-block-input-text'
    processed_data["input_text"] = input_data_piece
    processed_data["prompts"]["firstschemaprompt"] = recipe_dict["plantings_and_fields_prompt"]
    processed_data["prompts"]["secondschemaprompt"] = recipe_dict["interactions_prompt"]
    processed_data["prompts"]["thirdschemaprompt"] = recipe_dict["treatments_prompt"]

    processed_data["parameters"] = {}
    processed_data["parameters"]["modelversion"] = recipe_dict["schema_processing_model"]
    processed_data["parameters"]["promptstyle"] = recipe_dict["prompting_strategy"]

    # Compares against the literal strings "None" / "No preprocessing" as
    # stored in the Baserow select fields (not Python None).
    if (recipe_dict["pre_processing_strategy"] == "None") and (recipe_dict["pre_processing_model"] == "No preprocessing"):
        processed_data["parameters"]["preprocessdata"] = "no"
    else:
        processed_data["parameters"]["preprocessdata"] = "yes"
    # NOTE(review): the pre-processing fields below are populated even when
    # preprocessdata == "no" — presumably process_specifications() ignores
    # them in that case; confirm this is intended.
    processed_data["parameters"]["preprocessmodelversion"] = recipe_dict["pre_processing_model"]
    processed_data["parameters"]["multiplepreprompts"] = "no"
    processed_data["parameters"]["prepromptstyle"] = recipe_dict["pre_processing_strategy"]
    processed_data["parameters"]["preprocessingprompt1"] = recipe_dict["pre_processing_text"]
    processed_data["parameters"]["preprocessingprompt2"] = ""
    processed_data["parameters"]["preprocessingprompt3"] = ""

    print("DID THAT NOW")
    return processed_data
|
|
def format_json(json_data, truncate_length=500):
    """Pretty-print a JSON string with 2-space indentation, truncated to
    *truncate_length* characters (with a "..." suffix when cut). Input that
    fails to parse as JSON is returned raw, truncated the same way."""
    def _clip(text):
        # Append "..." only when something was actually cut off.
        return text[:truncate_length] + "..." if len(text) > truncate_length else text

    try:
        pretty = json.dumps(json.loads(json_data), indent=2)
    except json.JSONDecodeError:
        return _clip(json_data)
    return _clip(pretty)
|
|
| |
def custom_serializer(obj):
    """``default=`` hook for json.dumps: Enum members become their name,
    objects exposing ``to_dict()`` (e.g. the project's Soil/Yield types)
    serialize via that method, and anything else falls back to ``__dict__``.

    BUG FIX: the original referenced ``Enum``, ``Soil`` and ``Yield`` without
    any import in this file, so every call raised NameError. ``Enum`` is now
    imported locally, and the Soil/Yield isinstance checks are replaced by a
    duck-typed ``to_dict`` check that covers both (and any similar type).
    """
    from enum import Enum  # local import keeps this fix self-contained

    if isinstance(obj, Enum):
        return obj.name
    if hasattr(obj, "to_dict"):
        return obj.to_dict()
    return obj.__dict__
|
|
def sanitize_json_for_yaml(data):
    """Recursively normalize a JSON-like structure for yaml.dump: tuples are
    converted to lists (so YAML emits plain sequences instead of python/tuple
    tags), dicts and lists are walked, scalars pass through unchanged.

    BUG FIX: tuples were previously converted shallowly with ``list(data)``,
    so tuples (or dicts) nested *inside* a tuple escaped sanitization; they
    are now recursed into, consistent with the list branch.
    """
    if isinstance(data, dict):
        return {key: sanitize_json_for_yaml(value) for key, value in data.items()}
    if isinstance(data, (list, tuple)):
        return [sanitize_json_for_yaml(item) for item in data]
    return data
|
|
def generate_markdown_output(df):
    """Render one evaluation run (a DataFrame of output rows built in
    drive_process) as a markdown report: input transcript, recipe fields,
    diffs, prompts, then gold-vs-machine JSON and YAML dumps.

    Args:
        df: pandas DataFrame whose columns match the dict appended in
            drive_process (Recipe_ID, Input_Transcript, Differences,
            Gold_Standard_JSON, Machine_Generated_JSON, prompt columns, ...).

    Returns:
        str: the full markdown document.
    """
    markdown = ""

    # --- Input transcripts, truncated to 500 chars per recipe row ---
    # NOTE(review): drive_process stores a dict in Input_Transcript, but this
    # slices it like a string ([:500]) — confirm the intended column content.
    markdown += "\n## Input Transcript\n"
    for _, row in df.iterrows():
        truncated_input = row['Input_Transcript'][:500] + "..." if len(row['Input_Transcript']) > 500 else row['Input_Transcript']
        markdown += f"**Recipe ID {row['Recipe_ID']}**:\n```\n{truncated_input}\n```\n\n"

    # --- Recipe configuration table ---
    markdown += "\n## Recipe Fields\n"
    recipe_columns = [
        "Recipe ID", "Testing Strategy", "Schema Processing Model", "Pre-Processing Strategy",
        "Pre-Processing Text", "Pre-Processing Model", "Prompting Strategy"
    ]
    recipe_table = "| " + " | ".join(recipe_columns) + " |\n"
    recipe_table += "| " + " | ".join(["-" * len(col) for col in recipe_columns]) + " |\n"
    for _, row in df.iterrows():
        recipe_table += f"| {row['Recipe_ID']} | {row['Testing_Strategy_Text']} | {row['Schema_Processing_Model']} | {row['Pre_Processing_Strategy']} | {row['Pre_Processing_Text']} | {row['Pre_Processing_Model']} | {row['Prompting_Strategy']} |\n"
    markdown += recipe_table + "\n"

    # --- Gold-vs-machine differences, one section per recipe ---
    markdown += "\n## Differences\n"
    for _, row in df.iterrows():
        markdown += f"\n### Recipe ID: {row['Recipe_ID']}\n"
        differences = row['Differences']

        # Differences is a dict of {section: list-of-diff-entries}.
        for key, value in differences.items():
            markdown += f"#### {key.capitalize()}\n"
            for item in value:
                markdown += f" - {item}\n"

    # --- Prompt texts table ---
    markdown += "\n## Prompts\n"
    prompt_columns = ["Plantings and Fields Prompt", "Interactions Prompt", "Treatments Prompt"]
    prompt_table = "| " + " | ".join(prompt_columns) + " |\n"
    prompt_table += "| " + " | ".join(["-" * len(col) for col in prompt_columns]) + " |\n"
    for _, row in df.iterrows():
        prompt_table += f"| {row['Plantings_and_Fields_Prompt']} | {row['Interactions_Prompt']} | {row['Treatments_Prompt']} |\n"
    markdown += prompt_table + "\n"

    # --- Side-by-side JSON dumps (machine side may contain project objects,
    # hence the custom_serializer default) ---
    markdown += "\n## Gold Standard vs Machine Generated JSON\n"
    for _, row in df.iterrows():
        markdown += f"\n### Recipe ID: {row['Recipe_ID']}\n"
        for key in ["planting", "interactions", "trials"]:
            gold = json.dumps(row['Gold_Standard_JSON'].get(key, {}), indent=2)
            machine = json.dumps(row['Machine_Generated_JSON'].get(key, {}), default=custom_serializer, indent=2)
            markdown += f"#### {key.capitalize()}\n"
            markdown += f"**Gold Standard JSON**:\n```json\n{gold}\n```\n"
            markdown += f"**Machine Generated JSON**:\n```json\n{machine}\n```\n"

    # --- Side-by-side YAML dumps (keys sorted for stable comparison) ---
    markdown += "\n## Gold Standard vs Machine Generated YAML\n"
    for _, row in df.iterrows():
        markdown += f"\n### Recipe ID: {row['Recipe_ID']}\n"
        for key in ["planting", "interactions", "trials"]:
            gold = yaml.dump(row['Gold_Standard_JSON'].get(key, {}), default_flow_style=False, sort_keys=True)
            machine = yaml.dump(row['Machine_Generated_JSON'].get(key, {}), default_flow_style=False, sort_keys=True)
            markdown += f"#### {key.capitalize()}\n"
            markdown += f"**Gold Standard YAML**:\n```yaml\n{gold}\n```\n"
            markdown += f"**Machine Generated YAML**:\n```yaml\n{machine}\n```\n"

    return markdown
|
|
| |
def drive_process():
    """Run the full evaluation pipeline.

    Pulls gold standards + interview inputs and recipe configurations from
    Baserow, runs every recipe over every dataset via process_specifications,
    diffs the machine output against the gold standard, writes markdown/JSON
    reports into a timestamped folder, zips it, and removes the folder.

    Returns:
        str: path of the resulting .zip archive.
    """
    print("We are starting to DRIVE PROCESS")

    gold_standards, input_data = get_baserow_data()
    my_recipes = get_recipes()

    print("Making the OUTPUT STUFF")
    output_folder = "output_results_" + datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs(output_folder, exist_ok=True)

    print("GOING THROUGH RECIPES NOW")
    for recipe_dict in my_recipes:
        for key, input_chunks in input_data.items():
            output_rows = []
            print("RECIPE INFO")
            print(key)
            print(recipe_dict["recipe_id"])

            # Pick which flavor of the interview text this recipe consumes.
            if recipe_dict["pre_processing_strategy"] == "Otter.ai Summary":
                input_data_piece = input_chunks["otter_summary"]
            elif recipe_dict["pre_processing_strategy"] == "Greg Summary":
                input_data_piece = input_chunks["greg_summary"]
            else:
                input_data_piece = input_chunks["raw_interview"]

            print("DECIDED INPUT DATA")
            print(input_data_piece)

            proc_spec = get_data_ready(recipe_dict, input_data_piece)

            print("Gold Standard")
            gold_standard_json = gold_standards[key]

            # Gold standards are stored in Baserow as JSON strings.
            gold_standard_planting_json = json.loads(gold_standard_json["planting"])
            gold_standard_interactions_json = json.loads(gold_standard_json["interactions"])
            gold_standard_trials_json = json.loads(gold_standard_json["trials"])

            print("Gold standard json after loading")
            print(gold_standard_planting_json)

            print("PROCESSING SPECIFICATIONS!!!!!!!!!!!!!!!")
            processed_farm_activity_json, processed_interactions_json, processed_trials_json = process_specifications(proc_spec)

            # Machine output also arrives as JSON strings.
            processed_farm_activity_json = json.loads(processed_farm_activity_json)
            processed_interactions_json = json.loads(processed_interactions_json)
            processed_trials_json = json.loads(processed_trials_json)

            print("Processed and loaded 1st json from machine gen")
            print(processed_farm_activity_json)

            differences_planting = list(diff(gold_standard_planting_json, processed_farm_activity_json))
            differences_interactions = list(diff(gold_standard_interactions_json, processed_interactions_json))
            differences_trials = list(diff(gold_standard_trials_json, processed_trials_json))

            print("Diff planting")
            print(differences_planting)

            # Tuples -> lists so yaml.dump emits plain sequences.
            completed_gold_standard_planting_json = sanitize_json_for_yaml(gold_standard_planting_json)
            completed_gold_standard_interactions_json = sanitize_json_for_yaml(gold_standard_interactions_json)
            completed_gold_standard_trials_json = sanitize_json_for_yaml(gold_standard_trials_json)

            completed_processed_farm_activity_json = sanitize_json_for_yaml(processed_farm_activity_json)
            completed_processed_interactions_json = sanitize_json_for_yaml(processed_interactions_json)
            completed_processed_trials_json = sanitize_json_for_yaml(processed_trials_json)

            json_diff = {
                "planting": differences_planting,
                "interactions": differences_interactions,
                "trials": differences_trials
            }

            gold_standard_json = {
                "planting": completed_gold_standard_planting_json,
                "interactions": completed_gold_standard_interactions_json,
                "trials": completed_gold_standard_trials_json
            }

            comparison_json = {
                "planting": completed_processed_farm_activity_json,
                "interactions": completed_processed_interactions_json,
                "trials": completed_processed_trials_json
            }

            recipe_id = recipe_dict.get("recipe_id", "N/A")
            output_rows.append({
                "Recipe_ID": recipe_id,
                "Testing_Strategy_Text": recipe_dict.get("testing_strategy_text", "N/A"),
                "Schema_Processing_Model": recipe_dict.get("schema_processing_model", "N/A"),
                "Pre_Processing_Strategy": recipe_dict.get("pre_processing_strategy", "N/A"),
                "Pre_Processing_Text": recipe_dict.get("pre_processing_text", "N/A"),
                "Pre_Processing_Model": recipe_dict.get("pre_processing_model", "N/A"),
                "Prompting_Strategy": recipe_dict.get("prompting_strategy", "N/A"),
                "Plantings_and_Fields_Prompt": recipe_dict.get("plantings_and_fields_prompt", "N/A"),
                "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
                "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
                "Input_Transcript": input_chunks,
                "Gold_Standard_JSON": gold_standard_json,
                "Machine_Generated_JSON": comparison_json,
                "Differences": json_diff
            })

            df = pd.DataFrame(output_rows)

            print("dataframe done now onto markdown")

            markdown_output = generate_markdown_output(df)
            recipe_folder = os.path.join(output_folder, f"recipe_{recipe_dict['recipe_id']}")
            os.makedirs(recipe_folder, exist_ok=True)

            markdown_file = os.path.join(recipe_folder, f"recipe_{recipe_dict['recipe_id']}_data_{key}_output.md")
            with open(markdown_file, 'w') as f:
                f.write(markdown_output)

            json_file_gold = os.path.join(recipe_folder, f"recipe_{recipe_dict['recipe_id']}_data_{key}_gold_standard.json")
            json_file_generated = os.path.join(recipe_folder, f"recipe_{recipe_dict['recipe_id']}_data_{key}_generated.json")
            with open(json_file_gold, 'w') as f:
                json.dump(gold_standard_json, f, indent=2)
            with open(json_file_generated, 'w') as f:
                json.dump(comparison_json, f, indent=2)

            differences_file = os.path.join(recipe_folder, f"recipe_{recipe_dict['recipe_id']}_data_{key}_differences.json")
            with open(differences_file, 'w') as f:
                # BUG FIX: previously wrote str(differences_file) — the output
                # file *path* — instead of the diff content itself.
                f.write(str(json_diff))

    print("ZIPPING UP WHOLE THING")
    zip_filename = f"{output_folder}.zip"
    shutil.make_archive(output_folder, 'zip', output_folder)

    # The working folder is archived; remove the loose copy.
    shutil.rmtree(output_folder)

    # (The original had an unreachable `return output_folder` after this
    # return; it has been removed.)
    return zip_filename
| |
|
|