|
|
import os |
|
|
import yaml |
|
|
import json |
|
|
import pandas as pd |
|
|
|
|
|
import evaluate |
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
def load_pipeline(model_path, device=0):
    """Build a Hugging Face summarization pipeline from a local checkpoint.

    Args:
        model_path: Path to the model checkpoint directory.
        device: Device index passed through to `transformers.pipeline`
            (0 = first GPU, -1 = CPU). Defaults to 0 to preserve the
            original hard-coded behavior.

    Returns:
        A callable summarization pipeline.
    """
    summarizer = pipeline("summarization", model=model_path, device=device)
    return summarizer
|
|
|
|
|
def infernece(pipeline, eval_data):
    """Run the summarization pipeline over every row of the eval set.

    NOTE(review): the misspelled name ('infernece') and the `pipeline`
    parameter (which shadows the imported `transformers.pipeline`) are
    kept intact for caller compatibility.

    Args:
        pipeline: Callable summarizer; given a prompt string it returns a
            list of dicts, each with a 'summary_text' key.
        eval_data: DataFrame with 'original' and 'compressed' columns.

    Returns:
        Dict with parallel lists: 'original' (source sentences),
        'compressed' (reference summaries), 'predictions' (model outputs).
    """
    prompt = "summarize the following sentence:"
    originals = eval_data['original'].tolist()
    references = eval_data['compressed'].tolist()
    # Prompt is prepended directly (no separator) to match training format.
    predictions = [
        pipeline(prompt + sentence)[0]['summary_text']
        for sentence in originals
    ]
    return {"original": originals, "compressed": references, "predictions": predictions}
|
|
|
|
|
def compute_performace(eval_data):
    """Score model predictions against reference compressions with ROUGE.

    NOTE(review): name typo ('performace') kept for caller compatibility.

    Args:
        eval_data: Mapping with 'predictions' (model outputs) and
            'compressed' (reference summaries) as parallel lists.

    Returns:
        Dict of ROUGE metric names to scores, as produced by
        `evaluate.load('rouge').compute(...)`.
    """
    # The original assigned these columns to two extra unused locals;
    # pass them straight to rouge.compute instead.
    rouge = evaluate.load('rouge')
    results = rouge.compute(
        predictions=eval_data['predictions'],
        references=eval_data['compressed'],
    )
    print(results)
    return results
|
|
|
|
|
def get_latest_checkpoint(checkpoint_dir):
    """Return the name of the highest-numbered 'checkpoint-N' subdirectory.

    Args:
        checkpoint_dir: Directory containing 'checkpoint-<step>' subdirs
            (the layout produced by `transformers.Trainer`).

    Returns:
        The subdirectory name (e.g. 'checkpoint-500'), not a full path.

    Raises:
        ValueError: If no 'checkpoint-' subdirectory exists (previously
            surfaced as an opaque `max() arg is an empty sequence`).
    """
    subdirs = [
        name
        for name in os.listdir(checkpoint_dir)
        if os.path.isdir(os.path.join(checkpoint_dir, name))
        and name.startswith("checkpoint-")
    ]
    if not subdirs:
        raise ValueError(f"no 'checkpoint-*' subdirectories found in {checkpoint_dir}")
    # Compare by step number, not lexicographically ('checkpoint-9' < 'checkpoint-10').
    checkpoint_numbers = [int(subdir.split("-")[1]) for subdir in subdirs]
    return "checkpoint-" + str(max(checkpoint_numbers))
|
|
|
|
|
if __name__ == "__main__":
    # Load project configuration (paths, checkpoint location, output dir).
    with open("config.yaml", "r") as f:
        config = yaml.safe_load(f)

    # NOTE(review): eval() executes arbitrary code from config.yaml.
    # Acceptable only because the config is a trusted local file; consider
    # storing a plain path string instead.
    PROJECT_DIR = eval(config["SENTENCE_COMPRESSION"]["PROJECT_DIR"])
    data_dir = os.path.join(PROJECT_DIR, config["SENTENCE_COMPRESSION"]["DATA"]["CLEAN_DATA"])

    # Resolve the most recent Trainer checkpoint under the configured model path.
    model_checkpoint = config["SENTENCE_COMPRESSION"]["INFERENCE"]["MODEL_PATH"]
    latest_checkpoint = get_latest_checkpoint(os.path.join(PROJECT_DIR, model_checkpoint))
    model_path = os.path.join(PROJECT_DIR, model_checkpoint, latest_checkpoint)
    pipeline = load_pipeline(model_path)

    # Run inference over the evaluation split.
    eval_data = pd.read_csv(os.path.join(data_dir, 'eval_data.csv'))
    eval_data_res = infernece(pipeline, eval_data)

    # Persist per-example predictions and aggregate ROUGE scores.
    output_dir = os.path.join(PROJECT_DIR, config["SENTENCE_COMPRESSION"]["OUTPUT"]["RESULT"])
    os.makedirs(output_dir, exist_ok=True)
    eval_res_df = pd.DataFrame(eval_data_res)
    eval_res_df.to_csv(os.path.join(output_dir, "eval_result.csv"), index=False)

    result = compute_performace(eval_data_res)
    # Context manager replaces the original leaked file handle.
    with open(os.path.join(output_dir, "performance.json"), "w") as f:
        json.dump(result, f, indent=4)
|
|
|