rosemariafontana committed on
Commit
c494d38
·
verified ·
1 Parent(s): a379e6a

Update script_for_automation.py

Browse files
Files changed (1) hide show
  1. script_for_automation.py +25 -31
script_for_automation.py CHANGED
@@ -330,84 +330,78 @@ def get_data_ready(recipe_dict, input_data_piece):
330
  print("DID THAT NOW")
331
  return processed_data
332
 
 
 
 
333
  def generate_markdown_output(df):
334
  # Start the markdown output string
335
  markdown = ""
336
 
337
- # Table for Recipe Fields (recipe_id to prompting_strategy)
338
- markdown += "### Recipe Fields (Basic Information)\n"
 
 
 
 
 
 
 
 
339
  markdown += "| Recipe ID | Testing Strategy | Schema Processing Model | Pre-Processing Strategy | Pre-Processing Text | Pre-Processing Model | Prompting Strategy |\n"
340
  markdown += "|-----------|------------------|-------------------------|--------------------------|---------------------|----------------------|-------------------|\n"
341
 
342
- # Iterate over rows to create the first table (recipe_id to prompting_strategy)
343
  for _, row in df.iterrows():
344
- # Ensure all fields have a consistent length (pad with empty string if None)
345
  markdown += f"| {str(row['Recipe_ID']).ljust(10)} | {str(row['Testing_Strategy_Text']).ljust(20)} | {str(row['Schema_Processing_Model']).ljust(25)} | {str(row['Pre_Processing_Strategy']).ljust(23)} | {str(row['Pre_Processing_Text']).ljust(20)} | {str(row['Pre_Processing_Model']).ljust(20)} | {str(row['Prompting_Strategy']).ljust(25)} |\n"
346
 
347
- # Separate section for Prompts
348
  markdown += "\n### Prompts\n"
349
  markdown += "| Plantings and Fields Prompt | Interactions Prompt | Treatments Prompt |\n"
350
  markdown += "|-----------------------------|---------------------|-------------------|\n"
351
 
352
- # Iterate over rows to create the second table (plantings_and_fields_prompt, interactions_prompt, treatments_prompt)
353
  for _, row in df.iterrows():
354
- # Ensure all fields are of consistent length
355
  markdown += f"| {str(row['Plantings_and_Fields_Prompt']).ljust(30)} | {str(row['Interactions_Prompt']).ljust(20)} | {str(row['Treatments_Prompt']).ljust(20)} |\n"
356
 
357
- # Separate section for Input Transcript
358
- markdown += "\n### Input Transcript\n"
359
- markdown += "Since the input transcript might be very long, it is truncated here for readability:\n"
360
-
361
- # Display a truncated version of the input transcript to avoid long text in the table
362
- for _, row in df.iterrows():
363
- truncated_input = (row['Input_Transcript'][:500] + '...') if len(row['Input_Transcript']) > 500 else row['Input_Transcript']
364
- markdown += f"**Recipe ID {row['Recipe_ID']}**: {truncated_input}\n\n"
365
-
366
- # Side-by-side comparison of Gold Standard and Machine Generated Key-Values
367
  markdown += "\n### Gold Standard vs Machine Generated Key-Values\n"
368
  markdown += "| Key | Gold Standard | Machine Generated |\n"
369
  markdown += "|-----|---------------|-------------------|\n"
370
 
371
- # Iterate over rows to create the comparison table
372
  for _, row in df.iterrows():
373
  markdown += f"| {str(row['Recipe_ID']).ljust(10)} | {str(row['Gold_Standard_Key_Values']).ljust(25)} | {str(row['Machine_Generated_Key_Values']).ljust(25)} |\n"
374
 
375
- # Display differences in a readable format
376
  markdown += "\n### Differences\n"
377
- markdown += "The following differences were found between the gold standard and the machine-generated output:\n"
378
  markdown += "| Key | Difference |\n"
379
  markdown += "|-----|------------|\n"
380
 
381
  for _, row in df.iterrows():
382
- # Assuming 'Differences' is a list of dictionaries with keys and changes
383
  differences = row['Differences']
384
  if isinstance(differences, list):
385
  for diff in differences:
386
- # Ensure that diff has a 'values_changed' key
387
  if isinstance(diff, dict) and 'values_changed' in diff:
388
  for path, change in diff['values_changed'].items():
389
- # Ensure we have both 'old_value' and 'new_value'
390
  if 'old_value' in change and 'new_value' in change:
391
  markdown += f"| {str(path).ljust(20)} | {str(change['old_value']).ljust(20)} -> {str(change['new_value']).ljust(20)} |\n"
392
  else:
393
- print(f"Skipping change at {path} due to missing old or new value")
394
  else:
395
- print(f"Skipping non-dictionary diff or missing 'values_changed' key: {diff}")
396
  else:
397
- print(f"Expected a list for differences, but got: {type(differences)}")
398
 
399
- # Side-by-side YAML comparison for human visual inspection
400
  markdown += "\n### Gold Standard vs Machine Generated YAML\n"
401
  markdown += "| Gold Standard YAML | Machine Generated YAML |\n"
402
  markdown += "|--------------------|------------------------|\n"
403
 
404
- # Add the side-by-side YAML comparison
405
  for _, row in df.iterrows():
406
- markdown += f"| ```yaml\n{str(row['Gold_Standard_YAML']).ljust(50)}\n``` | ```yaml\n{str(row['Machine_Generated_YAML']).ljust(50)}\n``` |\n"
 
 
407
 
408
  return markdown
409
 
410
-
411
  def drive_process():
412
  # this is to drive the processing process
413
  print("We are starting to DRIVE PROCESS")
@@ -481,7 +475,7 @@ def drive_process():
481
  "Plantings_and_Fields_Prompt": recipe_dict.get("plantings_and_fields_prompt", "N/A"),
482
  "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
483
  "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
484
- "Input_Transcript": input_data,
485
  "Gold_Standard_Key_Values": gold_standard_json,
486
  "Machine_Generated_Key_Values": completed_json,
487
  "Differences": differences,
 
330
  print("DID THAT NOW")
331
  return processed_data
332
 
333
+ import yaml
334
+ import json
335
+
336
  def generate_markdown_output(df):
337
  # Start the markdown output string
338
  markdown = ""
339
 
340
+ # Input Transcript Section
341
+ markdown += "### Input Transcript\n"
342
+ markdown += "Since the input transcript might be very long, it is truncated here for readability:\n\n"
343
+
344
+ for _, row in df.iterrows():
345
+ truncated_input = (row['Input_Transcript'][:500] + '...') if len(row['Input_Transcript']) > 500 else row['Input_Transcript']
346
+ markdown += f"**Recipe ID {row['Recipe_ID']}**:\n\n{truncated_input}\n\n"
347
+
348
+ # Recipe Fields Section
349
+ markdown += "\n### Recipe Fields (Basic Information)\n"
350
  markdown += "| Recipe ID | Testing Strategy | Schema Processing Model | Pre-Processing Strategy | Pre-Processing Text | Pre-Processing Model | Prompting Strategy |\n"
351
  markdown += "|-----------|------------------|-------------------------|--------------------------|---------------------|----------------------|-------------------|\n"
352
 
 
353
  for _, row in df.iterrows():
 
354
  markdown += f"| {str(row['Recipe_ID']).ljust(10)} | {str(row['Testing_Strategy_Text']).ljust(20)} | {str(row['Schema_Processing_Model']).ljust(25)} | {str(row['Pre_Processing_Strategy']).ljust(23)} | {str(row['Pre_Processing_Text']).ljust(20)} | {str(row['Pre_Processing_Model']).ljust(20)} | {str(row['Prompting_Strategy']).ljust(25)} |\n"
355
 
356
+ # Prompts Section
357
  markdown += "\n### Prompts\n"
358
  markdown += "| Plantings and Fields Prompt | Interactions Prompt | Treatments Prompt |\n"
359
  markdown += "|-----------------------------|---------------------|-------------------|\n"
360
 
 
361
  for _, row in df.iterrows():
 
362
  markdown += f"| {str(row['Plantings_and_Fields_Prompt']).ljust(30)} | {str(row['Interactions_Prompt']).ljust(20)} | {str(row['Treatments_Prompt']).ljust(20)} |\n"
363
 
364
+ # Side-by-Side Comparison
 
 
 
 
 
 
 
 
 
365
  markdown += "\n### Gold Standard vs Machine Generated Key-Values\n"
366
  markdown += "| Key | Gold Standard | Machine Generated |\n"
367
  markdown += "|-----|---------------|-------------------|\n"
368
 
 
369
  for _, row in df.iterrows():
370
  markdown += f"| {str(row['Recipe_ID']).ljust(10)} | {str(row['Gold_Standard_Key_Values']).ljust(25)} | {str(row['Machine_Generated_Key_Values']).ljust(25)} |\n"
371
 
372
+ # Differences Section
373
  markdown += "\n### Differences\n"
374
+ markdown += "The following differences were found between the gold standard and the machine-generated output:\n\n"
375
  markdown += "| Key | Difference |\n"
376
  markdown += "|-----|------------|\n"
377
 
378
  for _, row in df.iterrows():
 
379
  differences = row['Differences']
380
  if isinstance(differences, list):
381
  for diff in differences:
 
382
  if isinstance(diff, dict) and 'values_changed' in diff:
383
  for path, change in diff['values_changed'].items():
 
384
  if 'old_value' in change and 'new_value' in change:
385
  markdown += f"| {str(path).ljust(20)} | {str(change['old_value']).ljust(20)} -> {str(change['new_value']).ljust(20)} |\n"
386
  else:
387
+ markdown += f"| {str(path).ljust(20)} | (Missing old/new value) |\n"
388
  else:
389
+ markdown += f"| (Invalid diff) | |\n"
390
  else:
391
+ markdown += f"| (No differences) | |\n"
392
 
393
+ # YAML Comparison Section
394
  markdown += "\n### Gold Standard vs Machine Generated YAML\n"
395
  markdown += "| Gold Standard YAML | Machine Generated YAML |\n"
396
  markdown += "|--------------------|------------------------|\n"
397
 
 
398
  for _, row in df.iterrows():
399
+ gold_yaml = yaml.dump(yaml.safe_load(row['Gold_Standard_YAML']), default_flow_style=False)
400
+ machine_yaml = yaml.dump(yaml.safe_load(row['Machine_Generated_YAML']), default_flow_style=False)
401
+ markdown += f"| ```yaml\n{gold_yaml}``` | ```yaml\n{machine_yaml}``` |\n"
402
 
403
  return markdown
404
 
 
405
  def drive_process():
406
  # this is to drive the processing process
407
  print("We are starting to DRIVE PROCESS")
 
475
  "Plantings_and_Fields_Prompt": recipe_dict.get("plantings_and_fields_prompt", "N/A"),
476
  "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
477
  "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
478
+ "Input_Transcript": input_chunks,
479
  "Gold_Standard_Key_Values": gold_standard_json,
480
  "Machine_Generated_Key_Values": completed_json,
481
  "Differences": differences,