Spaces:
Running on Zero
Running on Zero
| """Validate VERIS classification output against the official enumeration values. | |
| Takes a VERIS classification dict (with actor, action, asset, attribute keys) | |
| and checks that all values match the enumerations defined in enums.py. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
| try: | |
| from veris_classifier.enums import ( | |
| ACTION_ERROR_VARIETY, | |
| ACTION_HACKING_VARIETY, | |
| ACTION_MALWARE_VARIETY, | |
| ACTION_MISUSE_VARIETY, | |
| ACTION_PHYSICAL_VARIETY, | |
| ACTION_SOCIAL_VARIETY, | |
| ACTOR_EXTERNAL_VARIETY, | |
| ACTOR_INTERNAL_VARIETY, | |
| ACTOR_MOTIVE, | |
| ASSET_VARIETY, | |
| ATTRIBUTE_AVAILABILITY_VARIETY, | |
| ATTRIBUTE_CONFIDENTIALITY_DATA_VARIETY, | |
| ATTRIBUTE_INTEGRITY_VARIETY, | |
| DATA_DISCLOSURE, | |
| ) | |
| except ModuleNotFoundError: | |
| from src.veris_classifier.enums import ( | |
| ACTION_ERROR_VARIETY, | |
| ACTION_HACKING_VARIETY, | |
| ACTION_MALWARE_VARIETY, | |
| ACTION_MISUSE_VARIETY, | |
| ACTION_PHYSICAL_VARIETY, | |
| ACTION_SOCIAL_VARIETY, | |
| ACTOR_EXTERNAL_VARIETY, | |
| ACTOR_INTERNAL_VARIETY, | |
| ACTOR_MOTIVE, | |
| ASSET_VARIETY, | |
| ATTRIBUTE_AVAILABILITY_VARIETY, | |
| ATTRIBUTE_CONFIDENTIALITY_DATA_VARIETY, | |
| ATTRIBUTE_INTEGRITY_VARIETY, | |
| DATA_DISCLOSURE, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Lookup tables | |
| # --------------------------------------------------------------------------- | |
# Actor categories accepted as keys of the ``actor`` section.
VALID_ACTOR_TYPES = {"external", "internal", "partner"}

# Allowed ``variety`` values for each actor category.
ACTOR_VARIETY_BY_TYPE: dict[str, set[str]] = dict(
    external=set(ACTOR_EXTERNAL_VARIETY),
    internal=set(ACTOR_INTERNAL_VARIETY),
    # VERIS does not define a separate partner variety list; partner entries
    # typically have an empty variety or reuse generic values.
    partner=set(),
)

# Motives are shared by every actor category.
VALID_MOTIVES: set[str] = set(ACTOR_MOTIVE)

# Action categories accepted as keys of the ``action`` section.
VALID_ACTION_TYPES = {
    "malware",
    "hacking",
    "social",
    "misuse",
    "physical",
    "error",
    "environmental",
}

# Allowed ``variety`` values for each action category.
ACTION_VARIETY_BY_TYPE: dict[str, set[str]] = dict(
    malware=set(ACTION_MALWARE_VARIETY),
    hacking=set(ACTION_HACKING_VARIETY),
    social=set(ACTION_SOCIAL_VARIETY),
    misuse=set(ACTION_MISUSE_VARIETY),
    physical=set(ACTION_PHYSICAL_VARIETY),
    error=set(ACTION_ERROR_VARIETY),
    # environmental has no variety list defined in our enums.
    environmental=set(),
)

# Asset varieties come from a single flat list.
VALID_ASSET_VARIETIES: set[str] = set(ASSET_VARIETY)

# Attribute categories accepted as keys of the ``attribute`` section.
VALID_ATTRIBUTE_TYPES = {"confidentiality", "integrity", "availability"}

# Allowed ``variety`` values for the attribute categories that carry one.
# Confidentiality is absent here: it is described by the data variety and
# data disclosure sets below instead.
ATTRIBUTE_VARIETY_BY_TYPE: dict[str, set[str]] = dict(
    integrity=set(ATTRIBUTE_INTEGRITY_VARIETY),
    availability=set(ATTRIBUTE_AVAILABILITY_VARIETY),
)
VALID_DATA_VARIETY: set[str] = set(ATTRIBUTE_CONFIDENTIALITY_DATA_VARIETY)
VALID_DATA_DISCLOSURE: set[str] = set(DATA_DISCLOSURE)

# Threshold for the "too many unknowns" warning: a warning is raised when
# strictly more than this fraction of all counted values is "Unknown".
_UNKNOWN_WARNING_THRESHOLD = 0.5
| # --------------------------------------------------------------------------- | |
| # Result container | |
| # --------------------------------------------------------------------------- | |
@dataclass
class ValidationResult:
    """Outcome of validating a VERIS classification dict.

    Attributes
    ----------
    valid:
        ``True`` until the first error is recorded; warnings never change it.
    errors:
        Messages describing values that failed validation.
    warnings:
        Advisory messages that do not affect ``valid``.
    """

    valid: bool = True
    # ``field(default_factory=list)`` only produces a fresh list per instance
    # when the class is a dataclass; without the decorator these attributes
    # would be ``Field`` objects and ``.append`` would fail.
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    def _add_error(self, msg: str) -> None:
        """Record *msg* as an error and mark the result invalid."""
        self.errors.append(msg)
        self.valid = False

    def _add_warning(self, msg: str) -> None:
        """Record *msg* as a warning; validity is left untouched."""
        self.warnings.append(msg)
| # --------------------------------------------------------------------------- | |
| # Internal helpers | |
| # --------------------------------------------------------------------------- | |
def _check_list_values(
    values: list[str],
    valid_set: set[str],
    label: str,
    result: ValidationResult,
) -> int:
    """Validate each value in *values* against *valid_set*.

    An empty *valid_set* disables membership checking, so every value is
    accepted; "Unknown" values are still counted in that case.  Values that
    cannot be hashed (for example a nested list or dict) are reported as
    invalid instead of letting the set lookup raise ``TypeError``.

    Parameters
    ----------
    values:
        The values to check.
    valid_set:
        Allowed values, or an empty set to skip validation.
    label:
        Prefix used for the error messages added to *result*.
    result:
        Receives one error per invalid value.

    Returns
    -------
    int
        The count of "Unknown" values encountered.
    """
    unknown_count = 0
    for val in values:
        if val == "Unknown":
            unknown_count += 1
        if not valid_set:
            # Nothing to validate against for this label.
            continue
        try:
            is_valid = val in valid_set
        except TypeError:
            # Unhashable values can never be members of a set.
            is_valid = False
        if not is_valid:
            result._add_error(f"{label}: invalid value '{val}'")
    return unknown_count
def _ensure_list(obj: Any) -> list:
    """Return *obj* as a list.

    A list is returned unchanged (the same object), ``None`` becomes an
    empty list, and any other value is wrapped in a one-element list.
    """
    if obj is None:
        return []
    return obj if isinstance(obj, list) else [obj]
| # --------------------------------------------------------------------------- | |
| # Section validators | |
| # --------------------------------------------------------------------------- | |
def _validate_actor(actor: dict, result: ValidationResult) -> int:
    """Validate the ``actor`` section and count its "Unknown" values.

    Each key must be a known actor type mapping to a dict; its ``variety``
    and ``motive`` entries are checked against the matching lookup sets.
    Problems are recorded on *result*.

    Returns the number of "Unknown" values found, or 0 when *actor* is not
    a dict.
    """
    if not isinstance(actor, dict):
        result._add_error("actor: expected a dict")
        return 0

    unknowns = 0
    for kind, details in actor.items():
        if kind not in VALID_ACTOR_TYPES:
            result._add_error(f"actor: unknown actor type '{kind}'")
            continue
        if not isinstance(details, dict):
            result._add_error(f"actor.{kind}: expected a dict")
            continue

        # Variety is checked per actor type; motive uses one shared set.
        checks = (
            ("variety", ACTOR_VARIETY_BY_TYPE.get(kind, set())),
            ("motive", VALID_MOTIVES),
        )
        for key, allowed in checks:
            unknowns += _check_list_values(
                _ensure_list(details.get(key, [])),
                allowed,
                f"actor.{kind}.{key}",
                result,
            )
    return unknowns
def _validate_action(action: dict, result: ValidationResult) -> int:
    """Validate the ``action`` section and count its "Unknown" values.

    Each key must be a known action type mapping to a dict.  ``variety``
    entries are checked against the per-type lookup set; ``vector`` entries
    only need to be non-empty strings.  Problems are recorded on *result*.

    Returns the number of "Unknown" values found, or 0 when *action* is not
    a dict.
    """
    if not isinstance(action, dict):
        result._add_error("action: expected a dict")
        return 0

    unknowns = 0
    for kind, details in action.items():
        if kind not in VALID_ACTION_TYPES:
            result._add_error(f"action: unknown action type '{kind}'")
            continue
        if not isinstance(details, dict):
            result._add_error(f"action.{kind}: expected a dict")
            continue

        unknowns += _check_list_values(
            _ensure_list(details.get("variety", [])),
            ACTION_VARIETY_BY_TYPE.get(kind, set()),
            f"action.{kind}.variety",
            result,
        )

        # Vectors are present in the dataset but not yet enumerated in
        # enums.py, so any non-empty string is accepted to avoid false
        # positives.
        vectors = _ensure_list(details.get("vector", []))
        for entry in vectors:
            is_text = isinstance(entry, str) and bool(entry.strip())
            if not is_text:
                result._add_error(
                    f"action.{kind}.vector: "
                    f"expected non-empty string, got '{entry}'"
                )
        unknowns += sum(1 for entry in vectors if entry == "Unknown")
    return unknowns
def _validate_asset(asset: dict, result: ValidationResult) -> int:
    """Validate the ``asset`` section and count its "Unknown" values.

    The section's ``variety`` entries are checked against the asset variety
    set; problems are recorded on *result*.

    Returns the number of "Unknown" values found, or 0 when *asset* is not
    a dict.
    """
    if isinstance(asset, dict):
        varieties = _ensure_list(asset.get("variety", []))
        return _check_list_values(
            varieties,
            VALID_ASSET_VARIETIES,
            "asset.variety",
            result,
        )

    result._add_error("asset: expected a dict")
    return 0
def _validate_attribute(attribute: dict, result: ValidationResult) -> int:
    """Validate the ``attribute`` section.  Returns total unknown count.

    Each key must be a known attribute type mapping to a dict.  For
    ``confidentiality`` the optional ``data_disclosure`` value and the
    ``data_variety`` entries are checked; for the other types the
    ``variety`` entries are checked against the per-type lookup set.
    Problems are recorded on *result*.

    An unhashable ``data_disclosure`` (for example a list) is reported as an
    invalid value instead of letting the set lookup raise ``TypeError``.

    Returns 0 when *attribute* is not a dict.
    """
    unknown_count = 0
    if not isinstance(attribute, dict):
        result._add_error("attribute: expected a dict")
        return 0

    for attr_type, info in attribute.items():
        if attr_type not in VALID_ATTRIBUTE_TYPES:
            result._add_error(
                f"attribute: unknown attribute type '{attr_type}'"
            )
            continue
        if not isinstance(info, dict):
            result._add_error(f"attribute.{attr_type}: expected a dict")
            continue

        if attr_type == "confidentiality":
            # --- data_disclosure ---
            disclosure = info.get("data_disclosure")
            if disclosure is not None:
                try:
                    known = disclosure in VALID_DATA_DISCLOSURE
                except TypeError:
                    # Unhashable values can never be members of a set.
                    known = False
                if not known:
                    result._add_error(
                        f"attribute.confidentiality.data_disclosure: "
                        f"invalid value '{disclosure}'"
                    )
                if disclosure == "Unknown":
                    unknown_count += 1

            # --- data_variety ---
            data_variety = _ensure_list(info.get("data_variety", []))
            unknown_count += _check_list_values(
                data_variety, VALID_DATA_VARIETY,
                "attribute.confidentiality.data_variety", result,
            )
        else:
            # integrity / availability
            variety = _ensure_list(info.get("variety", []))
            valid_set = ATTRIBUTE_VARIETY_BY_TYPE.get(attr_type, set())
            unknown_count += _check_list_values(
                variety, valid_set,
                f"attribute.{attr_type}.variety", result,
            )
    return unknown_count
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
def validate_classification(classification: dict) -> ValidationResult:
    """Validate a VERIS classification dict against known enumerations.

    A section that is present but not a dict is reported as an error by its
    section validator and contributes no values to the "Unknown" ratio; it
    no longer causes an ``AttributeError`` while counting values.

    Parameters
    ----------
    classification:
        A dict with top-level keys ``actor``, ``action``, ``asset``, and
        ``attribute``, following the structure produced by the classifier
        and used in the training dataset.

    Returns
    -------
    ValidationResult
        Contains ``valid`` (bool), ``errors`` (list of str), and
        ``warnings`` (list of str).  Missing top-level keys and a high
        share of "Unknown" values produce warnings, not errors.
    """
    result = ValidationResult()
    if not isinstance(classification, dict):
        result._add_error("classification must be a dict")
        return result

    # Track totals for the "too many unknowns" warning.
    total_values = 0
    total_unknowns = 0

    # --- Required top-level keys ---
    required_keys = {"actor", "action", "asset", "attribute"}
    missing = required_keys.difference(classification)
    if missing:
        result._add_warning(
            f"missing top-level keys: {', '.join(sorted(missing))}"
        )

    # --- Actor ---
    actor = classification.get("actor")
    if actor is not None:
        total_unknowns += _validate_actor(actor, result)
        # A non-dict section was already reported by the validator and has
        # no entries to count.
        if isinstance(actor, dict):
            for info in actor.values():
                if isinstance(info, dict):
                    total_values += len(_ensure_list(info.get("variety", [])))
                    total_values += len(_ensure_list(info.get("motive", [])))

    # --- Action ---
    action = classification.get("action")
    if action is not None:
        total_unknowns += _validate_action(action, result)
        if isinstance(action, dict):
            for info in action.values():
                if isinstance(info, dict):
                    total_values += len(_ensure_list(info.get("variety", [])))
                    total_values += len(_ensure_list(info.get("vector", [])))

    # --- Asset ---
    asset = classification.get("asset")
    if asset is not None:
        total_unknowns += _validate_asset(asset, result)
        if isinstance(asset, dict):
            total_values += len(_ensure_list(asset.get("variety", [])))

    # --- Attribute ---
    attribute = classification.get("attribute")
    if attribute is not None:
        total_unknowns += _validate_attribute(attribute, result)
        if isinstance(attribute, dict):
            for attr_type, info in attribute.items():
                if not isinstance(info, dict):
                    continue
                if attr_type == "confidentiality":
                    total_values += len(
                        _ensure_list(info.get("data_variety", []))
                    )
                    # data_disclosure is a single value, counted when set.
                    if info.get("data_disclosure") is not None:
                        total_values += 1
                else:
                    total_values += len(
                        _ensure_list(info.get("variety", []))
                    )

    # --- Unknown saturation warning ---
    if total_values > 0:
        unknown_ratio = total_unknowns / total_values
        if unknown_ratio > _UNKNOWN_WARNING_THRESHOLD:
            result._add_warning(
                f"high ratio of 'Unknown' values: "
                f"{total_unknowns}/{total_values} "
                f"({unknown_ratio:.0%})"
            )
    return result