rosemariafontana committed on
Commit
04df73d
·
verified ·
1 Parent(s): 1f5854f

Update script_for_automation.py

Browse files
Files changed (1) hide show
  1. script_for_automation.py +111 -65
script_for_automation.py CHANGED
@@ -108,9 +108,7 @@ def get_baserow_data():
108
  }
109
 
110
  # How to retrieve this data
111
- # liz_carrot_planting = gold_standards["planting_gold_standards"]["liz_carrot"]
112
- # ben_soybean_interactions = gold_standards["interactions_gold_standards"]["ben_soybean"]
113
- # wally_squash_trial = gold_standards["trial_gold_standards"]["wally_squash"]
114
 
115
  input_data = {
116
  "liz_carrot": {
@@ -330,16 +328,6 @@ def get_data_ready(recipe_dict, input_data_piece):
330
  print("DID THAT NOW")
331
  return processed_data
332
 
333
- def sanitize_json_for_yaml(data):
334
- if isinstance(data, dict):
335
- return {key: sanitize_json_for_yaml(value) for key, value in data.items()}
336
- elif isinstance(data, list):
337
- return [sanitize_json_for_yaml(item) for item in data]
338
- elif isinstance(data, tuple): # Convert tuples to lists
339
- return list(data)
340
- else:
341
- return data # Keep other types as-is
342
-
343
  def format_json(json_data, truncate_length=500):
344
  try:
345
  # Try to load the JSON data
@@ -352,7 +340,6 @@ def format_json(json_data, truncate_length=500):
352
  # If it's not valid JSON, return the string as it is
353
  return json_data[:truncate_length] + "..." if len(json_data) > truncate_length else json_data
354
 
355
- import yaml
356
 
357
  def sanitize_json_for_yaml(data):
358
  if isinstance(data, dict):
@@ -369,13 +356,13 @@ def generate_markdown_output(df):
369
  markdown = ""
370
 
371
  # 1. Input Transcript
372
- markdown += "\n## Input Transcript\n" # Add space before header for consistency
373
  for _, row in df.iterrows():
374
  truncated_input = row['Input_Transcript'][:500] + "..." if len(row['Input_Transcript']) > 500 else row['Input_Transcript']
375
  markdown += f"**Recipe ID {row['Recipe_ID']}**:\n```\n{truncated_input}\n```\n\n"
376
 
377
  # 2. Recipe Fields
378
- markdown += "\n## Recipe Fields\n" # Add space before header for consistency
379
  recipe_columns = [
380
  "Recipe ID", "Testing Strategy", "Schema Processing Model", "Pre-Processing Strategy",
381
  "Pre-Processing Text", "Pre-Processing Model", "Prompting Strategy"
@@ -386,8 +373,16 @@ def generate_markdown_output(df):
386
  recipe_table += f"| {row['Recipe_ID']} | {row['Testing_Strategy_Text']} | {row['Schema_Processing_Model']} | {row['Pre_Processing_Strategy']} | {row['Pre_Processing_Text']} | {row['Pre_Processing_Model']} | {row['Prompting_Strategy']} |\n"
387
  markdown += recipe_table + "\n"
388
 
389
- # 3. Prompts
390
- markdown += "\n## Prompts\n" # Add space before header for consistency
 
 
 
 
 
 
 
 
391
  prompt_columns = ["Plantings and Fields Prompt", "Interactions Prompt", "Treatments Prompt"]
392
  prompt_table = "| " + " | ".join(prompt_columns) + " |\n"
393
  prompt_table += "| " + " | ".join(["-" * len(col) for col in prompt_columns]) + " |\n"
@@ -395,44 +390,34 @@ def generate_markdown_output(df):
395
  prompt_table += f"| {row['Plantings_and_Fields_Prompt']} | {row['Interactions_Prompt']} | {row['Treatments_Prompt']} |\n"
396
  markdown += prompt_table + "\n"
397
 
398
- # 4. Side-by-Side Comparisons
399
- markdown += "\n## Gold Standard vs Machine Generated Key-Values\n" # Add space before header for consistency
400
- markdown += "| Key | Gold Standard | Machine Generated |\n"
401
- markdown += "|-----|---------------|-------------------|\n"
402
- for _, row in df.iterrows():
403
- markdown += f"| {row['Recipe_ID']} | {row['Gold_Standard_Key_Values']} | {row['Machine_Generated_Key_Values']} |\n"
404
- markdown += "\n"
405
-
406
- # 5. Differences
407
- markdown += "\n## Differences\n" # Add space before header for consistency
408
- markdown += "| Key | Difference |\n"
409
- markdown += "|-----|------------|\n"
410
  for _, row in df.iterrows():
411
- differences = row['Differences']
412
- if isinstance(differences, list):
413
- for diff in differences:
414
- if isinstance(diff, dict) and 'values_changed' in diff:
415
- for path, change in diff['values_changed'].items():
416
- if 'old_value' in change and 'new_value' in change:
417
- markdown += f"| {path} | {change['old_value']} → {change['new_value']} |\n"
418
- else:
419
- markdown += "| No differences found | |\n"
420
-
421
- # 6. YAML Comparisons
422
- markdown += "\n## Gold Standard vs Machine Generated YAML\n" # Add space before header for consistency
423
  for _, row in df.iterrows():
424
- gold_yaml = yaml.safe_dump(yaml.safe_load(row['Gold_Standard_YAML']), default_flow_style=False)
425
- machine_yaml = yaml.safe_dump(yaml.safe_load(row['Machine_Generated_YAML']), default_flow_style=False)
426
-
427
- markdown += f"**Recipe ID {row['Recipe_ID']}**:\n\n"
428
- markdown += "**Gold Standard YAML:**\n"
429
- markdown += f"```yaml\n{gold_yaml}\n```\n"
430
- markdown += "**Machine Generated YAML:**\n"
431
- markdown += f"```yaml\n{machine_yaml}\n```\n\n"
 
432
 
433
- # Ensure clean separation
434
- markdown += "---\n\n"
435
  return markdown
 
436
 
437
  def drive_process():
438
  # this is to drive the processing process
@@ -475,36 +460,97 @@ def drive_process():
475
  print(input_data_piece)
476
 
477
  # Fill out a Surveystack submission
 
478
  #fill_out_survey(recipe_dict, input_data)
479
 
480
  # Prepare the data for the structured output setup
481
  proc_spec = get_data_ready(recipe_dict, input_data_piece)
482
 
483
  print("PROCESSING SPECIFICATIONS!!!!!!!!!!!!!!!")
484
- completed_json = process_specifications(proc_spec)
485
 
486
 
487
  print("Gold Standard diff and stuff")
488
- # Get the gold standard for this input_chunk (liz_carrot, ben_soybean, wally_squash)
489
- # Compare the generated JSON to the gold standard
490
  gold_standard_json = gold_standards[key]
491
- differences = list(diff(gold_standard_json, completed_json))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
 
493
  print("yaml world")
 
494
  # Convert to yaml
495
- gold_standard_json = sanitize_json_for_yaml(gold_standard_json)
496
- completed_json = sanitize_json_for_yaml(completed_json)
 
497
 
498
- gold_standard_yaml = yaml.dump(gold_standard_json, default_flow_style=False)
499
- comparison_yaml = yaml.dump(completed_json, default_flow_style=False)
 
500
 
 
 
 
 
 
 
 
 
501
  try:
502
- yaml.safe_load(gold_standard_yaml)
503
- yaml.safe_load(comparison_yaml)
 
 
 
 
 
504
  print("YAML output is valid!")
505
  except yaml.YAMLError as e:
506
  print("YAML output is invalid:", e)
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  recipe_id = recipe_dict.get("recipe_id", "N/A")
509
  output_rows.append({
510
  "Recipe_ID": recipe_id,
@@ -518,9 +564,9 @@ def drive_process():
518
  "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
519
  "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
520
  "Input_Transcript": input_chunks,
521
- "Gold_Standard_Key_Values": gold_standard_json,
522
- "Machine_Generated_Key_Values": completed_json,
523
- "Differences": differences,
524
  "Gold_Standard_YAML": gold_standard_yaml,
525
  "Machine_Generated_YAML": comparison_yaml
526
  })
 
108
  }
109
 
110
  # How to retrieve this data
111
+ # liz_carrot_planting = gold_standards["liz_carrot"]["planting"]
 
 
112
 
113
  input_data = {
114
  "liz_carrot": {
 
328
  print("DID THAT NOW")
329
  return processed_data
330
 
 
 
 
 
 
 
 
 
 
 
331
  def format_json(json_data, truncate_length=500):
332
  try:
333
  # Try to load the JSON data
 
340
  # If it's not valid JSON, return the string as it is
341
  return json_data[:truncate_length] + "..." if len(json_data) > truncate_length else json_data
342
 
 
343
 
344
  def sanitize_json_for_yaml(data):
345
  if isinstance(data, dict):
 
356
  markdown = ""
357
 
358
  # 1. Input Transcript
359
+ markdown += "\n## Input Transcript\n"
360
  for _, row in df.iterrows():
361
  truncated_input = row['Input_Transcript'][:500] + "..." if len(row['Input_Transcript']) > 500 else row['Input_Transcript']
362
  markdown += f"**Recipe ID {row['Recipe_ID']}**:\n```\n{truncated_input}\n```\n\n"
363
 
364
  # 2. Recipe Fields
365
+ markdown += "\n## Recipe Fields\n"
366
  recipe_columns = [
367
  "Recipe ID", "Testing Strategy", "Schema Processing Model", "Pre-Processing Strategy",
368
  "Pre-Processing Text", "Pre-Processing Model", "Prompting Strategy"
 
373
  recipe_table += f"| {row['Recipe_ID']} | {row['Testing_Strategy_Text']} | {row['Schema_Processing_Model']} | {row['Pre_Processing_Strategy']} | {row['Pre_Processing_Text']} | {row['Pre_Processing_Model']} | {row['Prompting_Strategy']} |\n"
374
  markdown += recipe_table + "\n"
375
 
376
+ # 3. Differences
377
+ markdown += "\n## Differences\n"
378
+ for _, row in df.iterrows():
379
+ markdown += f"\n### Recipe ID: {row['Recipe_ID']}\n"
380
+ differences = row['Differences']
381
+ for key, diff in differences.items():
382
+ markdown += f"#### {key.capitalize()}\n```\n{json.dumps(diff, indent=2)}\n```\n"
383
+
384
+ # 4. Prompts
385
+ markdown += "\n## Prompts\n"
386
  prompt_columns = ["Plantings and Fields Prompt", "Interactions Prompt", "Treatments Prompt"]
387
  prompt_table = "| " + " | ".join(prompt_columns) + " |\n"
388
  prompt_table += "| " + " | ".join(["-" * len(col) for col in prompt_columns]) + " |\n"
 
390
  prompt_table += f"| {row['Plantings_and_Fields_Prompt']} | {row['Interactions_Prompt']} | {row['Treatments_Prompt']} |\n"
391
  markdown += prompt_table + "\n"
392
 
393
+ # 5. Side-by-Side JSON Comparisons
394
+ markdown += "\n## Gold Standard vs Machine Generated JSON\n"
 
 
 
 
 
 
 
 
 
 
395
  for _, row in df.iterrows():
396
+ markdown += f"\n### Recipe ID: {row['Recipe_ID']}\n"
397
+ for key in ["planting", "interactions", "trials"]:
398
+ gold = json.dumps(row['Gold_Standard_JSON'].get(key, {}), indent=2)
399
+ machine = json.dumps(row['Machine_Generated_JSON'].get(key, {}), indent=2)
400
+ markdown += f"#### {key.capitalize()}\n"
401
+ markdown += "| Type | JSON Content |\n"
402
+ markdown += "|------|--------------|\n"
403
+ markdown += f"| Gold Standard | ```json\n{gold}\n``` |\n"
404
+ markdown += f"| Machine Generated | ```json\n{machine}\n``` |\n"
405
+
406
+ # 6. Side-by-Side YAML Comparisons
407
+ markdown += "\n## Gold Standard vs Machine Generated YAML\n"
408
  for _, row in df.iterrows():
409
+ markdown += f"\n### Recipe ID: {row['Recipe_ID']}\n"
410
+ for key in ["planting", "interactions", "trials"]:
411
+ gold = yaml.dump(row['Gold_Standard_YAML'].get(key, {}), default_flow_style=False)
412
+ machine = yaml.dump(row['Machine_Generated_YAML'].get(key, {}), default_flow_style=False)
413
+ markdown += f"#### {key.capitalize()}\n"
414
+ markdown += "| Type | YAML Content |\n"
415
+ markdown += "|------|--------------|\n"
416
+ markdown += f"| Gold Standard | ```yaml\n{gold}\n``` |\n"
417
+ markdown += f"| Machine Generated | ```yaml\n{machine}\n``` |\n"
418
 
 
 
419
  return markdown
420
+
421
 
422
  def drive_process():
423
  # this is to drive the processing process
 
460
  print(input_data_piece)
461
 
462
  # Fill out a Surveystack submission
463
+ # This isn't accepted by the data
464
  #fill_out_survey(recipe_dict, input_data)
465
 
466
  # Prepare the data for the structured output setup
467
  proc_spec = get_data_ready(recipe_dict, input_data_piece)
468
 
469
  print("PROCESSING SPECIFICATIONS!!!!!!!!!!!!!!!")
470
+ processed_farm_activity_json, processed_interactions_json, processed_trials_json = process_specifications(proc_spec)
471
 
472
 
473
  print("Gold Standard diff and stuff")
474
+ # Get the gold standard for this input_chunk (key = liz_carrot, ben_soybean, wally_squash)
 
475
  gold_standard_json = gold_standards[key]
476
+
477
+ # "liz_carrot": {
478
+ # "planting": liz_carrot_plantings_gold_standard,
479
+ # "interactions": liz_carrot_interactions_gold_standard,
480
+ # "trials": liz_carrot_trials_gold_standard,
481
+ # },
482
+
483
+ gold_standard_planting_json = gold_standard_json["planting"]
484
+ gold_standard_interactions_json = gold_standard_json["interactions"]
485
+ gold_standard_trials_json = gold_standard_json["trials"]
486
+
487
+ # Compare the generated JSON to the gold standard
488
+ differences_planting = list(diff(gold_standard_planting_json, processed_farm_activity_json))
489
+ differences_interactions = list(diff(gold_standard_interactions_json, processed_interactions_json))
490
+ differences_trials = list(diff(gold_standard_trials_json, processed_trials_json))
491
 
492
  print("yaml world")
493
+
494
  # Convert to yaml
495
+ completed_gold_standard_planting_json = sanitize_json_for_yaml(gold_standard_planting_json)
496
+ completed_gold_standard_interactions_json = sanitize_json_for_yaml(gold_standard_interactions_json)
497
+ completed_gold_standard_trials_json = sanitize_json_for_yaml(gold_standard_trials_json)
498
 
499
+ completed_processed_farm_activity_json = sanitize_json_for_yaml(processed_farm_activity_json)
500
+ completed_processed_interactions_json = sanitize_json_for_yaml(processed_interactions_json)
501
+ completed_processed_trials_json = sanitize_json_for_yaml(processed_trials_json)
502
 
503
+ completed_gold_standard_planting_yaml = yaml.dump(completed_gold_standard_planting_json, default_flow_style=False)
504
+ completed_gold_standard_interactions_yaml = yaml.dump(completed_gold_standard_interactions_json, default_flow_style=False)
505
+ completed_gold_standard_trials_yaml = yaml.dump(completed_gold_standard_trials_json, default_flow_style=False)
506
+
507
+ completed_comparison_planting_yaml = yaml.dump(completed_processed_farm_activity_json, default_flow_style=False)
508
+ completed_comparison_interactions_yaml = yaml.dump(completed_processed_interactions_json, default_flow_style=False)
509
+ completed_comparison_trials_yaml = yaml.dump(completed_processed_trials_json, default_flow_style=False)
510
+
511
  try:
512
+ yaml.safe_load(completed_gold_standard_planting_yaml)
513
+ yaml.safe_load(completed_gold_standard_interactions_yaml)
514
+ yaml.safe_load(completed_gold_standard_trials_yaml)
515
+
516
+ yaml.safe_load(completed_comparison_planting_yaml)
517
+ yaml.safe_load(completed_comparison_interactions_yaml)
518
+ yaml.safe_load(completed_comparison_trials_yaml)
519
  print("YAML output is valid!")
520
  except yaml.YAMLError as e:
521
  print("YAML output is invalid:", e)
522
 
523
+
524
+ json_diff = {
525
+ "planting": differences_planting,
526
+ "interactions": differences_interactions,
527
+ "trials": differences_trials
528
+ }
529
+
530
+ gold_standard_json = {
531
+ "planting": completed_gold_standard_planting_json,
532
+ "interactions": completed_gold_standard_interactions_json,
533
+ "trials": completed_gold_standard_trials_json
534
+ }
535
+
536
+ comparison_json = {
537
+ "planting": completed_processed_farm_activity_json,
538
+ "interactions": completed_processed_interactions_json,
539
+ "trials": completed_processed_trials_json
540
+ }
541
+
542
+ gold_standard_yaml = {
543
+ "planting": completed_gold_standard_planting_yaml,
544
+ "interactions": completed_gold_standard_interactions_yaml,
545
+ "trials": completed_gold_standard_trials_yaml
546
+ }
547
+
548
+ comparison_yaml = {
549
+ "planting": completed_comparison_planting_yaml,
550
+ "interactions": completed_comparison_interactions_yaml,
551
+ "trials": completed_comparison_trials_yaml
552
+ }
553
+
554
  recipe_id = recipe_dict.get("recipe_id", "N/A")
555
  output_rows.append({
556
  "Recipe_ID": recipe_id,
 
564
  "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
565
  "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
566
  "Input_Transcript": input_chunks,
567
+ "Gold_Standard_JSON": gold_standard_json,
568
+ "Machine_Generated_JSON": comparison_json,
569
+ "Differences": json_diff,
570
  "Gold_Standard_YAML": gold_standard_yaml,
571
  "Machine_Generated_YAML": comparison_yaml
572
  })