Spaces:

our-sci
/

data-translation-experiments

Sleeping

App Files Files Community

rosemariafontana committed on Dec 18, 2024

Commit

c494d38

verified ·

1 Parent(s): a379e6a

Update script_for_automation.py

Browse files

Files changed (1) hide show

script_for_automation.py +25 -31

script_for_automation.py CHANGED Viewed

@@ -330,84 +330,78 @@ def get_data_ready(recipe_dict, input_data_piece):
     print("DID THAT NOW")
     return processed_data
 def generate_markdown_output(df):
     # Start the markdown output string
     markdown = ""
-    # Table for Recipe Fields (recipe_id to prompting_strategy)
-    markdown += "### Recipe Fields (Basic Information)\n"
     markdown += "| Recipe ID | Testing Strategy | Schema Processing Model | Pre-Processing Strategy | Pre-Processing Text | Pre-Processing Model | Prompting Strategy |\n"
     markdown += "|-----------|------------------|-------------------------|--------------------------|---------------------|----------------------|-------------------|\n"
-    # Iterate over rows to create the first table (recipe_id to prompting_strategy)
     for _, row in df.iterrows():
-        # Ensure all fields have a consistent length (pad with empty string if None)
         markdown += f"| {str(row['Recipe_ID']).ljust(10)} | {str(row['Testing_Strategy_Text']).ljust(20)} | {str(row['Schema_Processing_Model']).ljust(25)} | {str(row['Pre_Processing_Strategy']).ljust(23)} | {str(row['Pre_Processing_Text']).ljust(20)} | {str(row['Pre_Processing_Model']).ljust(20)} | {str(row['Prompting_Strategy']).ljust(25)} |\n"
-    # Separate section for Prompts
     markdown += "\n### Prompts\n"
     markdown += "| Plantings and Fields Prompt | Interactions Prompt | Treatments Prompt |\n"
     markdown += "|-----------------------------|---------------------|-------------------|\n"
-    # Iterate over rows to create the second table (plantings_and_fields_prompt, interactions_prompt, treatments_prompt)
     for _, row in df.iterrows():
-        # Ensure all fields are of consistent length
         markdown += f"| {str(row['Plantings_and_Fields_Prompt']).ljust(30)} | {str(row['Interactions_Prompt']).ljust(20)} | {str(row['Treatments_Prompt']).ljust(20)} |\n"
-    # Separate section for Input Transcript
-    markdown += "\n### Input Transcript\n"
-    markdown += "Since the input transcript might be very long, it is truncated here for readability:\n"
-    # Display a truncated version of the input transcript to avoid long text in the table
-    for _, row in df.iterrows():
-        truncated_input = (row['Input_Transcript'][:500] + '...') if len(row['Input_Transcript']) > 500 else row['Input_Transcript']
-        markdown += f"**Recipe ID {row['Recipe_ID']}**: {truncated_input}\n\n"
-    # Side-by-side comparison of Gold Standard and Machine Generated Key-Values
     markdown += "\n### Gold Standard vs Machine Generated Key-Values\n"
     markdown += "| Key | Gold Standard | Machine Generated |\n"
     markdown += "|-----|---------------|-------------------|\n"
-    # Iterate over rows to create the comparison table
     for _, row in df.iterrows():
         markdown += f"| {str(row['Recipe_ID']).ljust(10)} | {str(row['Gold_Standard_Key_Values']).ljust(25)} | {str(row['Machine_Generated_Key_Values']).ljust(25)} |\n"
-    # Display differences in a readable format
     markdown += "\n### Differences\n"
-    markdown += "The following differences were found between the gold standard and the machine-generated output:\n"
     markdown += "| Key | Difference |\n"
     markdown += "|-----|------------|\n"
     for _, row in df.iterrows():
-        # Assuming 'Differences' is a list of dictionaries with keys and changes
         differences = row['Differences']
         if isinstance(differences, list):
             for diff in differences:
-                # Ensure that diff has a 'values_changed' key
                 if isinstance(diff, dict) and 'values_changed' in diff:
                     for path, change in diff['values_changed'].items():
-                        # Ensure we have both 'old_value' and 'new_value'
                         if 'old_value' in change and 'new_value' in change:
                             markdown += f"| {str(path).ljust(20)} | {str(change['old_value']).ljust(20)} -> {str(change['new_value']).ljust(20)} |\n"
                         else:
-                            print(f"Skipping change at {path} due to missing old or new value")
                 else:
-                    print(f"Skipping non-dictionary diff or missing 'values_changed' key: {diff}")
         else:
-            print(f"Expected a list for differences, but got: {type(differences)}")
-    # Side-by-side YAML comparison for human visual inspection
     markdown += "\n### Gold Standard vs Machine Generated YAML\n"
     markdown += "| Gold Standard YAML | Machine Generated YAML |\n"
     markdown += "|--------------------|------------------------|\n"
-    # Add the side-by-side YAML comparison
     for _, row in df.iterrows():
-        markdown += f"| ```yaml\n{str(row['Gold_Standard_YAML']).ljust(50)}\n``` | ```yaml\n{str(row['Machine_Generated_YAML']).ljust(50)}\n``` |\n"
     return markdown
 def drive_process():
     # this is to drive the processing process
     print("We are starting to DRIVE PROCESS")
@@ -481,7 +475,7 @@ def drive_process():
                 "Plantings_and_Fields_Prompt": recipe_dict.get("plantings_and_fields_prompt", "N/A"),
                 "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
                 "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
-                "Input_Transcript": input_data,
                 "Gold_Standard_Key_Values": gold_standard_json,
                 "Machine_Generated_Key_Values": completed_json,
                 "Differences": differences,

     print("DID THAT NOW")
     return processed_data
+import yaml
+import json
 def generate_markdown_output(df):
     """Render the evaluation rows in ``df`` as a single markdown report.

     The report has six sections, in this order: input transcript, recipe
     fields, prompts, gold-vs-machine key-values, differences, and a
     gold-vs-machine YAML comparison.

     Args:
         df: A pandas DataFrame. Each row is read by column name
             (``Recipe_ID``, ``Input_Transcript``, ``Differences``,
             ``Gold_Standard_YAML``, ...); an empty frame yields only the
             section headings and table headers.

     Returns:
         The markdown report as one string.
     """
     parts = [
         "### Input Transcript\n",
         "Since the input transcript might be very long, it is truncated here for readability:\n\n",
     ]
     for _, row in df.iterrows():
         # The transcript may arrive as a list of chunks rather than a string;
         # coerce first so slicing and the '...' suffix always operate on text.
         transcript = str(row['Input_Transcript'])
         if len(transcript) > 500:
             transcript = transcript[:500] + '...'
         parts.append(f"**Recipe ID {row['Recipe_ID']}**:\n\n{transcript}\n\n")
     # Recipe Fields Section
     parts.append("\n### Recipe Fields (Basic Information)\n")
     parts.append("| Recipe ID | Testing Strategy | Schema Processing Model | Pre-Processing Strategy | Pre-Processing Text | Pre-Processing Model | Prompting Strategy |\n")
     parts.append("|-----------|------------------|-------------------------|--------------------------|---------------------|----------------------|-------------------|\n")
     for _, row in df.iterrows():
         parts.append(_md_row([
             (row['Recipe_ID'], 10),
             (row['Testing_Strategy_Text'], 20),
             (row['Schema_Processing_Model'], 25),
             (row['Pre_Processing_Strategy'], 23),
             (row['Pre_Processing_Text'], 20),
             (row['Pre_Processing_Model'], 20),
             (row['Prompting_Strategy'], 25),
         ]))
     # Prompts Section
     parts.append("\n### Prompts\n")
     parts.append("| Plantings and Fields Prompt | Interactions Prompt | Treatments Prompt |\n")
     parts.append("|-----------------------------|---------------------|-------------------|\n")
     for _, row in df.iterrows():
         parts.append(_md_row([
             (row['Plantings_and_Fields_Prompt'], 30),
             (row['Interactions_Prompt'], 20),
             (row['Treatments_Prompt'], 20),
         ]))
     # Side-by-Side Comparison
     parts.append("\n### Gold Standard vs Machine Generated Key-Values\n")
     parts.append("| Key | Gold Standard | Machine Generated |\n")
     parts.append("|-----|---------------|-------------------|\n")
     for _, row in df.iterrows():
         parts.append(_md_row([
             (row['Recipe_ID'], 10),
             (row['Gold_Standard_Key_Values'], 25),
             (row['Machine_Generated_Key_Values'], 25),
         ]))
     # Differences Section
     parts.append("\n### Differences\n")
     parts.append("The following differences were found between the gold standard and the machine-generated output:\n\n")
     parts.append("| Key | Difference |\n")
     parts.append("|-----|------------|\n")
     for _, row in df.iterrows():
         differences = row['Differences']
         if not isinstance(differences, list):
             parts.append("| (No differences) | |\n")
             continue
         for diff in differences:
             if not (isinstance(diff, dict) and 'values_changed' in diff):
                 parts.append("| (Invalid diff) | |\n")
                 continue
             for path, change in diff['values_changed'].items():
                 key_cell = _md_cell(path).ljust(20)
                 # Guard with isinstance: a membership test on a non-dict
                 # change (e.g. an int) would raise instead of being reported.
                 if isinstance(change, dict) and 'old_value' in change and 'new_value' in change:
                     old_cell = _md_cell(change['old_value']).ljust(20)
                     new_cell = _md_cell(change['new_value']).ljust(20)
                     parts.append(f"| {key_cell} | {old_cell} -> {new_cell} |\n")
                 else:
                     parts.append(f"| {key_cell} | (Missing old/new value) |\n")
     # YAML Comparison Section
     # NOTE(review): the fenced blocks below span several lines, which a
     # GitHub-flavoured markdown table row cannot contain; the layout is kept
     # unchanged here - confirm how the consumer renders it.
     parts.append("\n### Gold Standard vs Machine Generated YAML\n")
     parts.append("| Gold Standard YAML | Machine Generated YAML |\n")
     parts.append("|--------------------|------------------------|\n")
     for _, row in df.iterrows():
         gold_yaml = _normalize_yaml(row['Gold_Standard_YAML'])
         machine_yaml = _normalize_yaml(row['Machine_Generated_YAML'])
         parts.append(f"| ```yaml\n{gold_yaml}``` | ```yaml\n{machine_yaml}``` |\n")
     return "".join(parts)
 def _md_cell(value):
     """Return ``value`` as text that is safe inside a single markdown table cell."""
     # A raw '|' starts a new column and a raw newline ends the table row.
     return "<br>".join(str(value).replace("|", "\\|").splitlines())
 def _md_row(cells):
     """Build one table row from ``(value, pad_width)`` pairs."""
     return "| " + " | ".join(_md_cell(value).ljust(width) for value, width in cells) + " |\n"
 def _normalize_yaml(raw):
     """Return ``raw`` re-dumped as block-style YAML, or a safe fallback.

     Missing values (``None``/NaN) become an empty string, already-parsed
     structures are dumped directly, and text that is not valid YAML is
     returned unchanged (newline-terminated) instead of aborting the report.
     """
     if raw is None or (isinstance(raw, float) and raw != raw):
         return ""
     if not isinstance(raw, (str, bytes)):
         return yaml.dump(raw, default_flow_style=False)
     try:
         return yaml.dump(yaml.safe_load(raw), default_flow_style=False)
     except yaml.YAMLError:
         text = raw if isinstance(raw, str) else raw.decode("utf-8", "replace")
         return text if text.endswith("\n") else text + "\n"
 def drive_process():
     # this is to drive the processing process
     print("We are starting to DRIVE PROCESS")
                 "Plantings_and_Fields_Prompt": recipe_dict.get("plantings_and_fields_prompt", "N/A"),
                 "Interactions_Prompt": recipe_dict.get("interactions_prompt", "N/A"),
                 "Treatments_Prompt": recipe_dict.get("treatments_prompt", "N/A"),
+                "Input_Transcript": input_chunks,
                 "Gold_Standard_Key_Values": gold_standard_json,
                 "Machine_Generated_Key_Values": completed_json,
                 "Differences": differences,