"""
Evaluation metrics for TTP/IOC extraction and narrative quality.
M6 — Evaluation & validation.
"""
from typing import Any


def extract_entities_from_text(text: str) -> set[str]:
    """Extract TTPs, CVEs, and generic IOCs from text for evaluation."""
    # Reuse the rule-based IOC extractor from preprocessing so that
    # predictions and gold entities are produced by the same patterns.
    from .preprocessing import extract_iocs

    return set(extract_iocs(text))
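
# Illustrative usage (a sketch; the exact strings returned depend on the
# patterns implemented in preprocessing.extract_iocs, assumed here to match
# CVE identifiers and ATT&CK technique IDs):
#   >>> extract_entities_from_text("Initial access via CVE-2021-44228, then T1059 execution.")
#   {'CVE-2021-44228', 'T1059'}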


def precision_recall_f1(pred: set[str], gold: set[str]) -> dict[str, float]:
    """Compute precision, recall, and F1 for entity extraction."""
    if not pred and not gold:
        # Both sides empty: a correctly empty prediction counts as perfect.
        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}
    tp = len(pred & gold)
    p = tp / len(pred) if pred else 0.0
    r = tp / len(gold) if gold else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
    return {"precision": round(p, 4), "recall": round(r, 4), "f1": round(f1, 4)}


def rouge_n(reference: str, hypothesis: str, n: int = 2) -> dict[str, float]:
    """ROUGE-N over distinct n-grams: precision, recall, and F1."""
    ref_tokens = reference.lower().split()
    hyp_tokens = hypothesis.lower().split()

    def ngrams(tokens: list[str], n: int) -> set[tuple[str, ...]]:
        # Distinct n-grams only; repeats are not counted, unlike the
        # count-based overlap in the original ROUGE definition.
        return {tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)}

    ref_ng = ngrams(ref_tokens, n)
    hyp_ng = ngrams(hyp_tokens, n)
    if not ref_ng:
        return {"rouge_precision": 0.0, "rouge_recall": 0.0, "rouge_f1": 0.0}
    overlap = len(ref_ng & hyp_ng)
    prec = overlap / len(hyp_ng) if hyp_ng else 0.0
    rec = overlap / len(ref_ng)
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
    return {
        "rouge_precision": round(prec, 4),
        "rouge_recall": round(rec, 4),
        "rouge_f1": round(f1, 4),
    }
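
# Illustrative usage (a minimal sketch with made-up sentences): each text has
# three bigrams and they share exactly one, so precision = recall = 1/3:
#   >>> rouge_n("the actor used mimikatz", "the actor deployed mimikatz", n=2)
#   {'rouge_precision': 0.3333, 'rouge_recall': 0.3333, 'rouge_f1': 0.3333}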


def bleu_simple(reference: str, hypothesis: str) -> float:
    """Simplified BLEU: unigram precision (no clipping, no brevity penalty)."""
    ref_words = reference.lower().split()
    hyp_words = hypothesis.lower().split()
    if not hyp_words:
        return 0.0
    # Membership test, not clipped counts: a word repeated in the hypothesis
    # is credited every time it appears anywhere in the reference.
    matches = sum(1 for w in hyp_words if w in ref_words)
    return round(matches / len(hyp_words), 4)
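
# Illustrative usage (a minimal sketch): three of the four hypothesis words
# appear in the reference, giving a unigram precision of 3/4:
#   >>> bleu_simple("attacker ran powershell script", "attacker ran cmd script")
#   0.75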


def evaluate_extraction(pred_entities: list[str], gold_entities: list[str]) -> dict[str, float]:
    """Evaluate entity extraction: precision, recall, F1."""
    return precision_recall_f1(set(pred_entities), set(gold_entities))


def evaluate_narrative(reference: str, hypothesis: str) -> dict[str, float]:
    """Evaluate narrative quality: ROUGE-2, BLEU."""
    r2 = rouge_n(reference, hypothesis, n=2)
    bleu = bleu_simple(reference, hypothesis)
    return {
        "rouge2_recall": r2["rouge_recall"],
        "rouge2_f1": r2["rouge_f1"],
        "bleu_simple": bleu,
    }
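
# Illustrative usage (a minimal sketch, reusing the made-up sentences above;
# the hypothesis shares one of three bigrams and three of four unigrams):
#   >>> evaluate_narrative("the actor used mimikatz", "the actor deployed mimikatz")
#   {'rouge2_recall': 0.3333, 'rouge2_f1': 0.3333, 'bleu_simple': 0.75}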


def run_eval_on_dataset(csv_path: str) -> dict[str, Any]:
    """
    Run evaluation on a CSV with input_text and target_report columns.

    Uses rule-based extraction for predictions; gold entities are taken
    from target_report with the same extraction rules.
    """
    import pandas as pd

    # Lazy import: the stub generator is only needed for evaluation runs.
    from .inference import generate_stub_report

    df = pd.read_csv(csv_path, encoding="utf-8")
    all_p: list[float] = []
    all_r: list[float] = []
    all_f1: list[float] = []
    all_rouge: list[float] = []
    all_bleu: list[float] = []
    # Evaluate at most the first 100 rows.
    for _, row in df.head(100).iterrows():
        inp = row.get("input_text", "")
        gold_report = str(row.get("target_report", ""))
        report = generate_stub_report(inp)
        pred = set(report.get("ttps", []) + report.get("cves", []) + report.get("iocs", []))
        gold = extract_entities_from_text(gold_report)
        m = precision_recall_f1(pred, gold)
        all_p.append(m["precision"])
        all_r.append(m["recall"])
        all_f1.append(m["f1"])
        nr = evaluate_narrative(gold_report, report.get("executive_summary", ""))
        all_rouge.append(nr["rouge2_f1"])
        all_bleu.append(nr["bleu_simple"])

    def avg(x: list[float]) -> float:
        return round(sum(x) / len(x), 4) if x else 0.0

    return {
        "extraction": {"precision": avg(all_p), "recall": avg(all_r), "f1": avg(all_f1)},
        "narrative": {"rouge2_f1": avg(all_rouge), "bleu_simple": avg(all_bleu)},
        "n_samples": len(all_p),
    }
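
# Illustrative invocation (a sketch; the metric values below are made up and
# only show the shape of the returned dict):
#   >>> run_eval_on_dataset("data/val.csv")
#   {'extraction': {'precision': 0.71, 'recall': 0.64, 'f1': 0.67},
#    'narrative': {'rouge2_f1': 0.18, 'bleu_simple': 0.31},
#    'n_samples': 100}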


if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--data", default="data/val.csv", help="CSV with input_text and target_report columns")
    args = ap.parse_args()
    results = run_eval_on_dataset(args.data)
    print("Evaluation results:", results)