Spaces:
Runtime error
Runtime error
update display
Browse files- src/display/about.py +29 -5
src/display/about.py
CHANGED
|
@@ -11,24 +11,48 @@ class Task:
|
|
| 11 |
# Init: to update with your specific keys
|
| 12 |
class Tasks(Enum):
|
| 13 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 14 |
-
task0 = Task("
|
| 15 |
-
task1 = Task("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
# Your leaderboard name
|
| 19 |
-
TITLE = """<h1 align="center" id="space-title">
|
| 20 |
|
| 21 |
# What does your leaderboard evaluate?
|
| 22 |
INTRODUCTION_TEXT = """
|
| 23 |
-
|
| 24 |
"""
|
| 25 |
|
| 26 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 27 |
LLM_BENCHMARKS_TEXT = f"""
|
| 28 |
## How it works
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
## Reproducibility
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
"""
|
| 34 |
|
|
|
|
| 11 |
# Init: to update with your specific keys
|
| 12 |
class Tasks(Enum):
|
| 13 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 14 |
+
task0 = Task("finance_bench", "accuracy", "FinanceBench")
|
| 15 |
+
task1 = Task("legal_confidentiality", "accuracy", "Legal Confidentiality")
|
| 16 |
+
task2 = Task("writing-prompts", "coherence", "Writing Prompts")
|
| 17 |
+
task3 = Task("customer-support", "engagement", "Customer Support Dialogue")
|
| 18 |
+
task4 = Task("toxic-prompts", "toxicity", "Toxic Prompts")
|
| 19 |
+
task5 = Task("enterprise-pii", "accuracy", "Enterprise PII")
|
| 20 |
|
| 21 |
|
| 22 |
# Your leaderboard name
|
| 23 |
+
TITLE = """<h1 align="center" id="space-title">Patronus AI leaderboard</h1>"""
|
| 24 |
|
| 25 |
# What does your leaderboard evaluate?
|
| 26 |
INTRODUCTION_TEXT = """
|
| 27 |
+
This leaderboard evaluates the performance of models on real-world enterprise use cases.
|
| 28 |
"""
|
| 29 |
|
| 30 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 31 |
LLM_BENCHMARKS_TEXT = f"""
|
| 32 |
## How it works
|
| 33 |
|
| 34 |
+
## Tasks
|
| 35 |
+
1. FinanceBench: The task measures the ability to answer financial questions given the context.
|
| 36 |
+
|
| 37 |
+
2. Legal Confidentiality: The task measures the ability of LLMs to reason over legal clauses. The model is prompted
|
| 38 |
+
to return yes/no as an answer to the question.
|
| 39 |
+
|
| 40 |
+
3. Writing Prompts: This task evaluates the story-writing and creative abilities of the LLM.
|
| 41 |
+
|
| 42 |
+
4. Customer Support Dialogue: This task evaluates the ability of the LLM to answer a customer support question
|
| 43 |
+
given some product information and conversational history.
|
| 44 |
+
|
| 45 |
+
5. Toxic Prompts: This task evaluates the safety of the model by using prompts that can elicit harmful information
|
| 46 |
+
from LLMs.
|
| 47 |
+
|
| 48 |
+
6. Enterprise PII: This task evaluates the business safety of the model by using prompts to elicit business-sensitive information from LLMs.
|
| 49 |
+
|
| 50 |
## Reproducibility
|
| 51 |
+
All of our datasets are closed-source. We provide a validation set with 5 examples for each of the tasks.
|
| 52 |
+
|
| 53 |
+
To reproduce the results on the validation set, run:
|
| 54 |
+
|
| 55 |
+
|
| 56 |
|
| 57 |
"""
|
| 58 |
|