Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Minseok Bae
committed on
Commit
·
156ef43
1
Parent(s):
2864204
Refine the code style
Browse files
- app.py +2 -2
- src/backend/evaluate_model.py +3 -7
- src/backend/manage_requests.py +1 -0
- src/backend/model_operations.py +5 -6
- src/backend/util.py +9 -11
- src/display/about.py +3 -4
- src/envs.py +2 -1
app.py
CHANGED
|
@@ -97,7 +97,7 @@ def filter_models(
|
|
| 97 |
if show_deleted:
|
| 98 |
filtered_df = df
|
| 99 |
else: # Show only still on the hub models
|
| 100 |
-
filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]
|
| 101 |
|
| 102 |
type_emoji = [t[0] for t in type_query]
|
| 103 |
filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
|
@@ -181,7 +181,7 @@ with demo:
|
|
| 181 |
elem_id="leaderboard-table",
|
| 182 |
interactive=False,
|
| 183 |
visible=True,
|
| 184 |
-
column_widths=["2%", "33%"]
|
| 185 |
)
|
| 186 |
|
| 187 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
|
|
|
| 97 |
if show_deleted:
|
| 98 |
filtered_df = df
|
| 99 |
else: # Show only still on the hub models
|
| 100 |
+
filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]
|
| 101 |
|
| 102 |
type_emoji = [t[0] for t in type_query]
|
| 103 |
filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
|
|
|
| 181 |
elem_id="leaderboard-table",
|
| 182 |
interactive=False,
|
| 183 |
visible=True,
|
| 184 |
+
column_widths=["2%", "33%"]
|
| 185 |
)
|
| 186 |
|
| 187 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
src/backend/evaluate_model.py
CHANGED
|
@@ -69,13 +69,11 @@ class Evaluator:
|
|
| 69 |
dict: A dictionary containing evaluation results.
|
| 70 |
"""
|
| 71 |
try:
|
| 72 |
-
|
| 73 |
-
df = pd.read_csv(envs.SAMPLE_DATASET_PATH)
|
| 74 |
generated_summaries_df = self.summary_generator.generate_summaries(df)
|
| 75 |
|
| 76 |
avg_summary_len = self.summary_generator.avg_length
|
| 77 |
answer_rate = self.summary_generator.answer_rate
|
| 78 |
-
# error_rate = self.summary_generator.error_rate
|
| 79 |
|
| 80 |
hallucination_scores = self.eval_model.evaluate_hallucination(
|
| 81 |
generated_summaries_df)
|
|
@@ -83,16 +81,14 @@ class Evaluator:
|
|
| 83 |
hallucination_rate = self.eval_model.hallucination_rate
|
| 84 |
|
| 85 |
results = util.format_results(model_name=self.model, revision=self.revision,
|
| 86 |
-
precision=self.precision,
|
| 87 |
factual_consistency_rate=factual_consistency_rate,
|
| 88 |
hallucination_rate=hallucination_rate,
|
| 89 |
answer_rate=answer_rate,
|
| 90 |
avg_summary_len=avg_summary_len)
|
| 91 |
-
|
| 92 |
return results
|
| 93 |
except FileNotFoundError:
|
| 94 |
-
|
| 95 |
-
logging.error(f"File not found: {envs.SAMPLE_DATASET_PATH}")
|
| 96 |
raise
|
| 97 |
except Exception as e:
|
| 98 |
logging.error(f"Error during evaluation: {e}")
|
|
|
|
| 69 |
dict: A dictionary containing evaluation results.
|
| 70 |
"""
|
| 71 |
try:
|
| 72 |
+
df = pd.read_csv(envs.DATASET_PATH)
|
|
|
|
| 73 |
generated_summaries_df = self.summary_generator.generate_summaries(df)
|
| 74 |
|
| 75 |
avg_summary_len = self.summary_generator.avg_length
|
| 76 |
answer_rate = self.summary_generator.answer_rate
|
|
|
|
| 77 |
|
| 78 |
hallucination_scores = self.eval_model.evaluate_hallucination(
|
| 79 |
generated_summaries_df)
|
|
|
|
| 81 |
hallucination_rate = self.eval_model.hallucination_rate
|
| 82 |
|
| 83 |
results = util.format_results(model_name=self.model, revision=self.revision,
|
| 84 |
+
precision=self.precision,
|
| 85 |
factual_consistency_rate=factual_consistency_rate,
|
| 86 |
hallucination_rate=hallucination_rate,
|
| 87 |
answer_rate=answer_rate,
|
| 88 |
avg_summary_len=avg_summary_len)
|
|
|
|
| 89 |
return results
|
| 90 |
except FileNotFoundError:
|
| 91 |
+
logging.error(f"File not found: {envs.DATASET_PATH}")
|
|
|
|
| 92 |
raise
|
| 93 |
except Exception as e:
|
| 94 |
logging.error(f"Error during evaluation: {e}")
|
src/backend/manage_requests.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import Optional
|
|
| 6 |
|
| 7 |
from huggingface_hub import HfApi, snapshot_download
|
| 8 |
|
|
|
|
| 9 |
@dataclass
|
| 10 |
class EvalRequest:
|
| 11 |
model: str
|
|
|
|
| 6 |
|
| 7 |
from huggingface_hub import HfApi, snapshot_download
|
| 8 |
|
| 9 |
+
|
| 10 |
@dataclass
|
| 11 |
class EvalRequest:
|
| 12 |
model: str
|
src/backend/model_operations.py
CHANGED
|
@@ -105,11 +105,11 @@ class SummaryGenerator:
|
|
| 105 |
for index, row in df.iterrows():
|
| 106 |
_source = row['text']
|
| 107 |
_dataset = row['dataset']
|
| 108 |
-
|
| 109 |
system_prompt = envs.SYSTEM_PROMPT
|
| 110 |
user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
| 111 |
|
| 112 |
-
while True:
|
| 113 |
try:
|
| 114 |
_summary = generate_summary(self.model, system_prompt,
|
| 115 |
user_prompt, self.api_base)
|
|
@@ -129,7 +129,7 @@ class SummaryGenerator:
|
|
| 129 |
summary.append(_summary)
|
| 130 |
source.append(_source)
|
| 131 |
dataset.append(_dataset)
|
| 132 |
-
|
| 133 |
time.sleep(1)
|
| 134 |
|
| 135 |
self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
|
|
@@ -199,10 +199,9 @@ class EvaluationModel:
|
|
| 199 |
Returns:
|
| 200 |
list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
|
| 201 |
"""
|
| 202 |
-
|
| 203 |
-
generated_summaries = np.array(summaries_df['summary'])
|
| 204 |
try:
|
| 205 |
-
scores = self.model.predict(
|
| 206 |
self.scores = scores
|
| 207 |
return self.scores
|
| 208 |
except Exception as e:
|
|
|
|
| 105 |
for index, row in df.iterrows():
|
| 106 |
_source = row['text']
|
| 107 |
_dataset = row['dataset']
|
| 108 |
+
|
| 109 |
system_prompt = envs.SYSTEM_PROMPT
|
| 110 |
user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
| 111 |
|
| 112 |
+
while True:
|
| 113 |
try:
|
| 114 |
_summary = generate_summary(self.model, system_prompt,
|
| 115 |
user_prompt, self.api_base)
|
|
|
|
| 129 |
summary.append(_summary)
|
| 130 |
source.append(_source)
|
| 131 |
dataset.append(_dataset)
|
| 132 |
+
|
| 133 |
time.sleep(1)
|
| 134 |
|
| 135 |
self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
|
|
|
|
| 199 |
Returns:
|
| 200 |
list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
|
| 201 |
"""
|
| 202 |
+
source_summary_pairs = util.create_pairs(summaries_df)
|
|
|
|
| 203 |
try:
|
| 204 |
+
scores = self.model.predict(source_summary_pairs)
|
| 205 |
self.scores = scores
|
| 206 |
return self.scores
|
| 207 |
except Exception as e:
|
src/backend/util.py
CHANGED
|
@@ -1,23 +1,21 @@
|
|
| 1 |
-
def
|
| 2 |
"""
|
| 3 |
-
|
| 4 |
|
| 5 |
Args:
|
| 6 |
-
|
| 7 |
|
| 8 |
Returns:
|
| 9 |
-
|
| 10 |
"""
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
-
return
|
| 15 |
-
You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described:'
|
| 16 |
-
Passage:\n {source_passage}
|
| 17 |
-
"""
|
| 18 |
|
| 19 |
|
| 20 |
-
def format_results(model_name: str, revision: str, precision: str,
|
| 21 |
factual_consistency_rate: float, hallucination_rate: float,
|
| 22 |
answer_rate: float, avg_summary_len: float) -> dict:
|
| 23 |
"""
|
|
|
|
| 1 |
+
def create_pairs(df):
|
| 2 |
"""
|
| 3 |
+
Creates pairs of source and summary from the dataframe.
|
| 4 |
|
| 5 |
Args:
|
| 6 |
+
df (DataFrame): The dataframe containing source and summary columns.
|
| 7 |
|
| 8 |
Returns:
|
| 9 |
+
list: A list of pairs [source, summary].
|
| 10 |
"""
|
| 11 |
+
pairs = []
|
| 12 |
+
for _, row in df.iterrows():
|
| 13 |
+
pairs.append([row['source'], row['summary']])
|
| 14 |
|
| 15 |
+
return pairs
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
+
def format_results(model_name: str, revision: str, precision: str,
|
| 19 |
factual_consistency_rate: float, hallucination_rate: float,
|
| 20 |
answer_rate: float, avg_summary_len: float) -> dict:
|
| 21 |
"""
|
src/display/about.py
CHANGED
|
@@ -9,15 +9,14 @@ class Task:
|
|
| 9 |
|
| 10 |
|
| 11 |
class Tasks(Enum):
|
| 12 |
-
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 13 |
-
hallucination_rate = Task("hallucination_rate",
|
| 14 |
"hallucination_rate", "Hallucination Rate")
|
| 15 |
accuracy = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate")
|
| 16 |
answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
|
| 17 |
-
average_summary_length = Task("average_summary_length",
|
| 18 |
"average_summary_length", "Average Summary Length")
|
| 19 |
# error_rate = Task("error_rate", "error_rate", "Error Rate")
|
| 20 |
-
|
| 21 |
|
| 22 |
|
| 23 |
# Your leaderboard name
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
class Tasks(Enum):
|
| 12 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 13 |
+
hallucination_rate = Task("hallucination_rate",
|
| 14 |
"hallucination_rate", "Hallucination Rate")
|
| 15 |
accuracy = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate")
|
| 16 |
answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
|
| 17 |
+
average_summary_length = Task("average_summary_length",
|
| 18 |
"average_summary_length", "Average Summary Length")
|
| 19 |
# error_rate = Task("error_rate", "error_rate", "Error Rate")
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
# Your leaderboard name
|
src/envs.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
| 2 |
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
|
|
|
| 5 |
# replace this with our token
|
| 6 |
TOKEN = os.environ.get("HF_TOKEN", None)
|
| 7 |
|
|
@@ -21,7 +22,7 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
|
| 21 |
DEVICE = "cpu"
|
| 22 |
API = HfApi(token=TOKEN)
|
| 23 |
|
| 24 |
-
|
| 25 |
SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
|
| 26 |
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
| 27 |
|
|
|
|
| 2 |
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
+
|
| 6 |
# replace this with our token
|
| 7 |
TOKEN = os.environ.get("HF_TOKEN", None)
|
| 8 |
|
|
|
|
| 22 |
DEVICE = "cpu"
|
| 23 |
API = HfApi(token=TOKEN)
|
| 24 |
|
| 25 |
+
DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
|
| 26 |
SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
|
| 27 |
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
| 28 |
|