|
|
|
|
|
""" |
|
|
Benchmark Runner for Summarizer-Standard Model |
|
|
|
|
|
Evaluates summarization performance using ROUGE scores, semantic similarity, |
|
|
latency, and model size metrics. |
|
|
""" |
|
|
|
|
|
import json |
|
|
import time |
|
|
import yaml |
|
|
import argparse |
|
|
import requests |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
import numpy as np |
|
|
import re |
|
|
|
|
|
class SummarizerStandardBenchmarkRunner: |
|
|
def __init__(self, config_path: str): |
|
|
self.config = self._load_config(config_path) |
|
|
self.results = { |
|
|
"model": "Summarizer-Standard", |
|
|
"timestamp": datetime.now().isoformat(), |
|
|
"datasets": {}, |
|
|
"overall_metrics": {} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
def _load_config(self, config_path: str) -> dict: |
|
|
with open(config_path, 'r') as f: |
|
|
return yaml.safe_load(f) |
|
|
|
|
|
def _load_dataset(self, dataset_path: str, sample_size: int) -> list: |
|
|
dataset_file = Path(dataset_path) |
|
|
if not dataset_file.exists(): |
|
|
print(f"β οΈ Dataset not found: {dataset_file}") |
|
|
return [] |
|
|
|
|
|
with open(dataset_file, 'r') as f: |
|
|
data = [json.loads(line) for line in f] |
|
|
|
|
|
return data[:sample_size] |
|
|
|
|
|
def _call_model(self, text: str) -> tuple: |
|
|
instruction = self.config["datasets"][0]["instruction"] |
|
|
prompt = f"{instruction}\n\nInput: {text}\n\nSummary:" |
|
|
|
|
|
payload = { |
|
|
"prompt": prompt, |
|
|
"max_tokens": self.config["model"]["max_tokens"], |
|
|
"temperature": self.config["model"]["temperature"] |
|
|
} |
|
|
|
|
|
headers = {'Content-Type': 'application/json'} |
|
|
start_time = time.time() |
|
|
|
|
|
try: |
|
|
response = requests.post( |
|
|
f"{self.config['model']['base_url']}/completion", |
|
|
json=payload, headers=headers, timeout=self.config["model"]["timeout"] |
|
|
) |
|
|
latency = time.time() - start_time |
|
|
|
|
|
if response.status_code == 200: |
|
|
return response.json()["content"], latency |
|
|
else: |
|
|
return f"Error: {response.status_code}", latency |
|
|
except Exception as e: |
|
|
return f"Error: {e}", time.time() - start_time |
|
|
|
|
|
def _calculate_rouge_scores(self, predicted: str, expected: str) -> dict: |
|
|
"""Calculate simple ROUGE-style n-gram overlap scores""" |
|
|
def get_ngrams(text, n): |
|
|
words = re.findall(r'\b\w+\b', text.lower()) |
|
|
return set([tuple(words[i:i+n]) for i in range(len(words)-n+1)]) |
|
|
|
|
|
pred_words = re.findall(r'\b\w+\b', predicted.lower()) |
|
|
exp_words = re.findall(r'\b\w+\b', expected.lower()) |
|
|
|
|
|
|
|
|
pred_1grams = set(pred_words) |
|
|
exp_1grams = set(exp_words) |
|
|
rouge1_prec = len(pred_1grams & exp_1grams) / max(len(pred_1grams), 1) |
|
|
rouge1_rec = len(pred_1grams & exp_1grams) / max(len(exp_1grams), 1) |
|
|
rouge1 = 2 * rouge1_prec * rouge1_rec / max(rouge1_prec + rouge1_rec, 1e-10) |
|
|
|
|
|
|
|
|
pred_2grams = get_ngrams(predicted, 2) |
|
|
exp_2grams = get_ngrams(expected, 2) |
|
|
rouge2_prec = len(pred_2grams & exp_2grams) / max(len(pred_2grams), 1) |
|
|
rouge2_rec = len(pred_2grams & exp_2grams) / max(len(exp_2grams), 1) |
|
|
rouge2 = 2 * rouge2_prec * rouge2_rec / max(rouge2_prec + rouge2_rec, 1e-10) |
|
|
|
|
|
|
|
|
|
|
|
rougeL = len(pred_1grams & exp_1grams) / max(len(exp_1grams), 1) |
|
|
|
|
|
return { |
|
|
'rouge1': rouge1, |
|
|
'rouge2': rouge2, |
|
|
'rougeL': rougeL |
|
|
} |
|
|
|
|
|
def _calculate_semantic_similarity(self, text1: str, text2: str) -> float: |
|
|
"""Calculate simple word overlap similarity (Jaccard similarity)""" |
|
|
try: |
|
|
words1 = set(re.findall(r'\b\w+\b', text1.lower())) |
|
|
words2 = set(re.findall(r'\b\w+\b', text2.lower())) |
|
|
|
|
|
if not words1 and not words2: |
|
|
return 1.0 |
|
|
if not words1 or not words2: |
|
|
return 0.0 |
|
|
|
|
|
intersection = len(words1 & words2) |
|
|
union = len(words1 | words2) |
|
|
return intersection / union |
|
|
except Exception as e: |
|
|
print(f"Warning: Similarity calculation failed: {e}") |
|
|
return 0.0 |
|
|
|
|
|
def _calculate_compression_ratio(self, input_text: str, summary: str) -> float: |
|
|
"""Calculate compression ratio (summary length / input length)""" |
|
|
input_words = len(input_text.split()) |
|
|
summary_words = len(summary.split()) |
|
|
return summary_words / max(input_words, 1) |
|
|
|
|
|
def _run_dataset_benchmark(self, dataset_name: str, dataset_config: dict) -> dict: |
|
|
print(f"π Running benchmark on {dataset_name}...") |
|
|
|
|
|
dataset = self._load_dataset(dataset_config["file"], dataset_config["sample_size"]) |
|
|
if not dataset: |
|
|
return {"error": f"No data found for {dataset_name}"} |
|
|
|
|
|
results = { |
|
|
"sample_count": len(dataset), |
|
|
"rouge1_scores": [], |
|
|
"rouge2_scores": [], |
|
|
"rougeL_scores": [], |
|
|
"semantic_similarity": [], |
|
|
"compression_ratios": [], |
|
|
"latency_ms": [], |
|
|
"successful_predictions": 0, |
|
|
"examples": [] |
|
|
} |
|
|
|
|
|
for i, item in enumerate(dataset): |
|
|
if i % 10 == 0: |
|
|
print(f" Processing sample {i+1}/{len(dataset)}") |
|
|
|
|
|
input_text = item[dataset_config["input_field"]] |
|
|
expected_summary = item[dataset_config["expected_field"]] |
|
|
|
|
|
|
|
|
predicted_summary, latency = self._call_model(input_text) |
|
|
|
|
|
if not predicted_summary.startswith("Error"): |
|
|
results["successful_predictions"] += 1 |
|
|
|
|
|
|
|
|
rouge_scores = self._calculate_rouge_scores(predicted_summary, expected_summary) |
|
|
semantic_sim = self._calculate_semantic_similarity(predicted_summary, expected_summary) |
|
|
compression = self._calculate_compression_ratio(input_text, predicted_summary) |
|
|
|
|
|
|
|
|
results["rouge1_scores"].append(rouge_scores['rouge1']) |
|
|
results["rouge2_scores"].append(rouge_scores['rouge2']) |
|
|
results["rougeL_scores"].append(rouge_scores['rougeL']) |
|
|
results["semantic_similarity"].append(semantic_sim) |
|
|
results["compression_ratios"].append(compression) |
|
|
results["latency_ms"].append(latency * 1000) |
|
|
|
|
|
|
|
|
if len(results["examples"]) < 5: |
|
|
results["examples"].append({ |
|
|
"input": input_text[:200] + "..." if len(input_text) > 200 else input_text, |
|
|
"expected": expected_summary, |
|
|
"predicted": predicted_summary, |
|
|
"rouge1": rouge_scores['rouge1'], |
|
|
"semantic_similarity": semantic_sim, |
|
|
"compression_ratio": compression |
|
|
}) |
|
|
|
|
|
|
|
|
if results["successful_predictions"] > 0: |
|
|
results["averages"] = { |
|
|
"rouge1": np.mean(results["rouge1_scores"]), |
|
|
"rouge2": np.mean(results["rouge2_scores"]), |
|
|
"rougeL": np.mean(results["rougeL_scores"]), |
|
|
"semantic_similarity": np.mean(results["semantic_similarity"]), |
|
|
"compression_ratio": np.mean(results["compression_ratios"]), |
|
|
"latency_ms": np.mean(results["latency_ms"]) |
|
|
} |
|
|
else: |
|
|
results["averages"] = { |
|
|
"rouge1": 0.0, |
|
|
"rouge2": 0.0, |
|
|
"rougeL": 0.0, |
|
|
"semantic_similarity": 0.0, |
|
|
"compression_ratio": 0.0, |
|
|
"latency_ms": 0.0 |
|
|
} |
|
|
|
|
|
print(f"β
{dataset_name} completed") |
|
|
return results |
|
|
|
|
|
def run_benchmarks(self): |
|
|
print("π Starting Summarizer-Standard Benchmark Suite") |
|
|
print("=" * 60) |
|
|
print("Evaluating summarization quality with ROUGE and semantic metrics") |
|
|
print() |
|
|
|
|
|
|
|
|
try: |
|
|
response = requests.get(f"{self.config['model']['base_url']}/health", timeout=10) |
|
|
if response.status_code == 200: |
|
|
print("β
Summarizer-Standard server is running") |
|
|
else: |
|
|
print(f"β Server returned status {response.status_code}") |
|
|
return |
|
|
except Exception as e: |
|
|
print(f"β Cannot connect to Summarizer-Standard server: {e}") |
|
|
print("Make sure to start the model server first:") |
|
|
print(" cd summarizer_standard_model.app/Contents/Resources && ./run_server") |
|
|
return |
|
|
|
|
|
|
|
|
for dataset_config in self.config["datasets"]: |
|
|
dataset_name = dataset_config["name"] |
|
|
results = self._run_dataset_benchmark(dataset_name, dataset_config) |
|
|
self.results["datasets"][dataset_name] = results |
|
|
|
|
|
|
|
|
self._calculate_overall_metrics() |
|
|
self._save_results() |
|
|
self._create_benchmarks_txt() |
|
|
|
|
|
def _calculate_overall_metrics(self): |
|
|
all_rouge1 = [] |
|
|
all_rouge2 = [] |
|
|
all_rougeL = [] |
|
|
all_semantic = [] |
|
|
all_compression = [] |
|
|
all_latency = [] |
|
|
total_samples = 0 |
|
|
|
|
|
for dataset_results in self.results["datasets"].values(): |
|
|
if "averages" in dataset_results: |
|
|
all_rouge1.append(dataset_results["averages"]["rouge1"]) |
|
|
all_rouge2.append(dataset_results["averages"]["rouge2"]) |
|
|
all_rougeL.append(dataset_results["averages"]["rougeL"]) |
|
|
all_semantic.append(dataset_results["averages"]["semantic_similarity"]) |
|
|
all_compression.append(dataset_results["averages"]["compression_ratio"]) |
|
|
all_latency.append(dataset_results["averages"]["latency_ms"]) |
|
|
total_samples += dataset_results["sample_count"] |
|
|
|
|
|
self.results["overall_metrics"] = { |
|
|
"avg_rouge1": np.mean(all_rouge1) if all_rouge1 else 0, |
|
|
"avg_rouge2": np.mean(all_rouge2) if all_rouge2 else 0, |
|
|
"avg_rougeL": np.mean(all_rougeL) if all_rougeL else 0, |
|
|
"avg_semantic_similarity": np.mean(all_semantic) if all_semantic else 0, |
|
|
"avg_compression_ratio": np.mean(all_compression) if all_compression else 0, |
|
|
"avg_latency_ms": np.mean(all_latency) if all_latency else 0, |
|
|
"model_size_gb": self.config["output"]["model_size_gb"], |
|
|
"total_samples": total_samples |
|
|
} |
|
|
|
|
|
def _save_results(self): |
|
|
results_dir = Path("results") |
|
|
results_dir.mkdir(exist_ok=True) |
|
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
results_file = results_dir / f"summarizer_standard_benchmark_{timestamp}.json" |
|
|
|
|
|
with open(results_file, 'w') as f: |
|
|
json.dump(self.results, f, indent=2) |
|
|
|
|
|
print(f"π Detailed results saved to: {results_file}") |
|
|
|
|
|
def _create_benchmarks_txt(self): |
|
|
"""Create the benchmarks.txt file with all results""" |
|
|
benchmarks_content = [] |
|
|
benchmarks_content.append("="*80) |
|
|
benchmarks_content.append("SUMMARIZER-STANDARD MODEL BENCHMARK RESULTS") |
|
|
benchmarks_content.append("="*80) |
|
|
benchmarks_content.append("") |
|
|
benchmarks_content.append("π EXECUTIVE SUMMARY") |
|
|
benchmarks_content.append("-"*50) |
|
|
benchmarks_content.append(f"Benchmark Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") |
|
|
benchmarks_content.append(f"Model: {self.results['model']}") |
|
|
benchmarks_content.append(f"Dataset: CNN/DailyMail Sample") |
|
|
benchmarks_content.append(f"Total Samples: {self.results['overall_metrics']['total_samples']}") |
|
|
benchmarks_content.append(f"Model Size: {self.results['overall_metrics']['model_size_gb']:.3f} GB") |
|
|
benchmarks_content.append("") |
|
|
|
|
|
overall = self.results['overall_metrics'] |
|
|
benchmarks_content.append("π― OVERALL PERFORMANCE METRICS") |
|
|
benchmarks_content.append("-"*50) |
|
|
benchmarks_content.append(f" ROUGE-1 Score: {overall['avg_rouge1']:.3f}") |
|
|
benchmarks_content.append(f" ROUGE-2 Score: {overall['avg_rouge2']:.3f}") |
|
|
benchmarks_content.append(f" ROUGE-L Score: {overall['avg_rougeL']:.3f}") |
|
|
benchmarks_content.append(f" Semantic Similarity: {overall['avg_semantic_similarity']:.3f}") |
|
|
benchmarks_content.append(f" Compression Ratio: {overall['avg_compression_ratio']:.3f}") |
|
|
benchmarks_content.append(f" Average Latency: {overall['avg_latency_ms']:.1f}ms") |
|
|
benchmarks_content.append("") |
|
|
|
|
|
|
|
|
benchmarks_content.append("π DATASET BREAKDOWN") |
|
|
benchmarks_content.append("-"*50) |
|
|
|
|
|
for dataset_name, dataset_results in self.results["datasets"].items(): |
|
|
if "averages" in dataset_results: |
|
|
benchmarks_content.append("") |
|
|
benchmarks_content.append(f"πΉ {dataset_name.upper().replace('_', ' ')}") |
|
|
benchmarks_content.append(f" Samples: {dataset_results['sample_count']}") |
|
|
avg = dataset_results["averages"] |
|
|
benchmarks_content.append(f" ROUGE-1: {avg['rouge1']:.3f}") |
|
|
benchmarks_content.append(f" ROUGE-2: {avg['rouge2']:.3f}") |
|
|
benchmarks_content.append(f" ROUGE-L: {avg['rougeL']:.3f}") |
|
|
benchmarks_content.append(f" Semantic Similarity: {avg['semantic_similarity']:.3f}") |
|
|
benchmarks_content.append(f" Compression Ratio: {avg['compression_ratio']:.3f}") |
|
|
benchmarks_content.append(f" Latency: {avg['latency_ms']:.1f}ms") |
|
|
|
|
|
|
|
|
if "examples" in dataset_results and dataset_results["examples"]: |
|
|
benchmarks_content.append("") |
|
|
benchmarks_content.append(" π SAMPLE OUTPUTS:") |
|
|
for i, example in enumerate(dataset_results["examples"][:3]): |
|
|
benchmarks_content.append(f" Example {i+1}:") |
|
|
benchmarks_content.append(f" Input: {example['input']}") |
|
|
benchmarks_content.append(f" Expected: {example['expected']}") |
|
|
benchmarks_content.append(f" Predicted: {example['predicted']}") |
|
|
benchmarks_content.append(f" ROUGE-1: {example['rouge1']:.3f}, Similarity: {example['semantic_similarity']:.3f}") |
|
|
benchmarks_content.append("") |
|
|
|
|
|
benchmarks_content.append("") |
|
|
benchmarks_content.append("π METRICS EXPLANATION") |
|
|
benchmarks_content.append("-"*50) |
|
|
benchmarks_content.append("β’ ROUGE-1: Unigram (word) overlap between predicted and expected summaries") |
|
|
benchmarks_content.append("β’ ROUGE-2: Bigram (2-word) overlap between predicted and expected summaries") |
|
|
benchmarks_content.append("β’ ROUGE-L: Longest Common Subsequence overlap") |
|
|
benchmarks_content.append("β’ Semantic Similarity: Word overlap similarity (Jaccard coefficient)") |
|
|
benchmarks_content.append("β’ Compression Ratio: Summary length Γ· Input length (0.1-0.8 is ideal)") |
|
|
benchmarks_content.append("β’ Latency: Response time in milliseconds (lower = faster)") |
|
|
benchmarks_content.append("") |
|
|
benchmarks_content.append("π INTERPRETING SCORES:") |
|
|
benchmarks_content.append("β’ ROUGE scores > 0.5 are considered good, > 0.3 acceptable") |
|
|
benchmarks_content.append("β’ Current scores indicate the model is not performing summarization effectively") |
|
|
benchmarks_content.append("β’ The model generates very short outputs that miss key information") |
|
|
benchmarks_content.append("") |
|
|
benchmarks_content.append("="*80) |
|
|
|
|
|
|
|
|
with open("benchmarks.txt", "w") as f: |
|
|
f.write("\n".join(benchmarks_content)) |
|
|
|
|
|
print("π Results summary saved to: benchmarks.txt") |
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and run the benchmark suite.

    Fix: the completion message was a corrupted (mojibake-split) string
    literal spanning two source lines; restored to a single valid string.
    """
    parser = argparse.ArgumentParser(description="Run Summarizer-Standard benchmarks")
    parser.add_argument("--config", default="benchmark_config.yaml", help="Config file")
    args = parser.parse_args()

    try:
        runner = SummarizerStandardBenchmarkRunner(args.config)
        runner.run_benchmarks()
        print("\n✅ Benchmarking completed! Results saved to benchmarks.txt")
    except Exception as e:
        # Surface the failure with a full traceback instead of dying silently.
        print(f"❌ Benchmark failed: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
|
# Script entry point: run the benchmark CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|