Spaces:

flowaicom
/

Flow-Judge-v0.1

Runtime error

App Files Files Community

bergr7f committed on Oct 14, 2024

Commit

b9ee4b2

1 Parent(s): 81602d3

feat: stylistic changes + 1 more example

Browse files

Files changed (3) hide show

app.py +22 -10
examples.py +13 -0
img/flow_judge_banner.png +0 -0

app.py CHANGED Viewed

@@ -75,6 +75,8 @@ def reset_all():
 # Define presets
 EXAMPLES = get_examples()
 HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monospace;">Flow Judge Demo</h1>
 <p align="center" style="font-family: 'Courier New', Courier, monospace;">
@@ -82,7 +84,7 @@ HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monos
     <a href="https://www.flow-ai.com/judge">Technical Report</a> |
     <a href="https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec">Model Weights</a> |
     <a href="https://github.com/flowaicom/lm-evaluation-harness/tree/Flow-Judge-v0.1_evals/lm_eval/tasks/flow_judge_evals">Evaluation Code</a> |
-    <a href="https://github.com/flowaicom/flow-judge/tree/main/examples">Examples</a>
   </strong>
 </p>
@@ -94,17 +96,24 @@ HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monos
 with gr.Blocks() as demo:
     model_downloaded = download_model()
-    gr.HTML(HEADER)
-    gr.Markdown("**Try it out with some examples**")
     with gr.Row():
         with gr.Column(scale=1):
-            preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES[:len(EXAMPLES)//2]]
         with gr.Column(scale=1):
-            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[len(EXAMPLES)//2:]]
     with gr.Row(equal_height=False):
         with gr.Column(scale=1):
-            gr.Markdown("**Inputs**")
             with gr.Group():
                 inputs_task = gr.State([])
                 new_input_name = gr.Textbox(label="Name")
@@ -137,7 +146,8 @@ with gr.Blocks() as demo:
             )
         with gr.Column(scale=1):
-            gr.Markdown("**Output**")
             with gr.Group():
                 with gr.Row(equal_height=True):
                     with gr.Column(min_width=60, scale=2):
@@ -145,9 +155,11 @@ with gr.Blocks() as demo:
                     with gr.Column(scale=9):
                         output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)
-    gr.Markdown("**Evaluation criteria and rubric**")
     with gr.Column(scale=1):
         with gr.Row():
             with gr.Column(scale=1):
                 rubric_items = gr.State([])
@@ -184,7 +196,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=1, variant="panel"):
-            gr.Markdown("**Evaluation**")
             with gr.Group():
                 with gr.Row(equal_height=True):
                     with gr.Column(min_width=15, scale=1):

 # Define presets
 EXAMPLES = get_examples()
+IMAGE_PATH = "./img/flow_judge_banner.png"
 HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monospace;">Flow Judge Demo</h1>
 <p align="center" style="font-family: 'Courier New', Courier, monospace;">
     <a href="https://www.flow-ai.com/judge">Technical Report</a> |
     <a href="https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec">Model Weights</a> |
     <a href="https://github.com/flowaicom/lm-evaluation-harness/tree/Flow-Judge-v0.1_evals/lm_eval/tasks/flow_judge_evals">Evaluation Code</a> |
+    <a href="https://github.com/flowaicom/flow-judge/tree/main/examples">Tutorials</a>
   </strong>
 </p>
 with gr.Blocks() as demo:
     model_downloaded = download_model()
+    with gr.Row(equal_height=False):
+        with gr.Column(scale=2):
+            gr.Image(IMAGE_PATH, show_label=False, interactive=False, show_share_button=False, show_fullscreen_button=False, show_download_button=False)
+        with gr.Column(scale=3):
+            gr.HTML(HEADER)
+    gr.Markdown("# ⚡ **Quickstart Examples**")
     with gr.Row():
         with gr.Column(scale=1):
+            preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES[:len(EXAMPLES)//3]]
+        with gr.Column(scale=1):
+            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[len(EXAMPLES)//3:2*len(EXAMPLES)//3]]
         with gr.Column(scale=1):
+            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[2*len(EXAMPLES)//3:]]
     with gr.Row(equal_height=False):
         with gr.Column(scale=1):
+            gr.Markdown("## **Evaluation task inputs**")
+            gr.Markdown("*<span style='color: gray;'>Define the input names and values for the evaluation task. Inputs are optional if evaluation depends on the output only.</span>*")
             with gr.Group():
                 inputs_task = gr.State([])
                 new_input_name = gr.Textbox(label="Name")
             )
         with gr.Column(scale=1):
+            gr.Markdown("## **Evaluation task output**")
+            gr.Markdown("*<span style='color: gray;'>Define the output name and value for the evaluation task. Output is always required.</span>*")
             with gr.Group():
                 with gr.Row(equal_height=True):
                     with gr.Column(min_width=60, scale=2):
                     with gr.Column(scale=9):
                         output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)
     with gr.Column(scale=1):
+        gr.Markdown("## **Evaluation criteria and rubric**")
+        gr.Markdown("*<span style='color: gray;'>Define the evaluation criteria and rubric for the evaluation task. Supported scoring scales: Binary (0 and 1), 3-Likert and 5-Likert.</span>*\n\n*<span style='color: gray;'>❗You can experiment with other scoring scales. However, performance may vary.</span>*")
         with gr.Row():
             with gr.Column(scale=1):
                 rubric_items = gr.State([])
     with gr.Row():
         with gr.Column(scale=1, variant="panel"):
+            gr.Markdown("# **Evaluation**")
             with gr.Group():
                 with gr.Row(equal_height=True):
                     with gr.Column(min_width=15, scale=1):

examples.py CHANGED Viewed

@@ -66,6 +66,19 @@ EXAMPLES = [
             {"name": "4", "value": "The model frequently comprehends and accurately applies industry-specific language or technical terms, with only minor mistakes or inconsistencies."},
             {"name": "5", "value": "The model perfectly comprehends and applies industry-specific language or technical terms, offering precise, perceptive feedback that demonstrates a comprehensive understanding of the topic."},
         ]
     }
 ]

             {"name": "4", "value": "The model frequently comprehends and accurately applies industry-specific language or technical terms, with only minor mistakes or inconsistencies."},
             {"name": "5", "value": "The model perfectly comprehends and applies industry-specific language or technical terms, offering precise, perceptive feedback that demonstrates a comprehensive understanding of the topic."},
         ]
+    },
+    {
+        "description": "🫠 Response-level Hallucination",
+        "inputs_task": [
+            {"name": "Question", "value": "Which genus of moth in the world's seventh-largest country contains only one species?"},
+            {"name": "Passage", "value": "Indogrammodes is a genus of moths of the Crambidae family. It contains only one species, Indogrammodes pectinicornalis, which is found in India.India, officially the Republic of India (\"Bhārat Gaṇarājya\"), is a country in South Asia. It is the seventh-largest country by area, the second-most populous country (with over 1.2 billion people), and the most populous democracy in the world."},
+        ],
+        "output": {"name": "Answer", "value": "The Indogrammodes genus of moths found in India has only one species."},
+        "evaluation_criteria": "Evaluate whether the information provided in the answer is factually accurate and directly supported by the context given in the document, without any fabricated or hallucinated details.",
+        "rubric": [
+            {"name": "0", "value": "The answer is not supported by the document. It contains inaccuracies, fabrications, or details that are not present in the document."},
+            {"name": "1", "value": "The answer is fully supported by the document. It is factually accurate and all details are directly derived from the document."},
+        ]
     }
 ]

img/flow_judge_banner.png ADDED Viewed