Upload 5 files
Browse files- src/aibom-generator/enhanced_extractor.py +876 -0
- src/aibom-generator/field_registry.json +737 -0
- src/aibom-generator/field_registry_manager.py +648 -0
- src/aibom-generator/generator.py +442 -35
- src/aibom-generator/utils.py +335 -141
src/aibom-generator/enhanced_extractor.py
ADDED
|
@@ -0,0 +1,876 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AI SBOM Generator
|
| 4 |
+
|
| 5 |
+
This module provides a fully configurable enhanced data extraction system that
|
| 6 |
+
automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
|
| 7 |
+
It includes comprehensive logging, fallback mechanisms, and confidence tracking.
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Automatically discovers all fields from the registry (field_registry.json)
|
| 11 |
+
- Attempts extraction for every registry field
|
| 12 |
+
- Provides detailed logging for each field attempt
|
| 13 |
+
- Graceful error handling for individual field failures
|
| 14 |
+
- Maintains backward compatibility with existing code
|
| 15 |
+
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
import logging
|
| 20 |
+
import re
|
| 21 |
+
import requests
|
| 22 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 23 |
+
from enum import Enum
|
| 24 |
+
from dataclasses import dataclass, field
|
| 25 |
+
from datetime import datetime
|
| 26 |
+
from urllib.parse import urlparse, urljoin
|
| 27 |
+
import time
|
| 28 |
+
|
| 29 |
+
# Import existing dependencies
|
| 30 |
+
from huggingface_hub import HfApi, ModelCard, hf_hub_download
|
| 31 |
+
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
|
| 32 |
+
|
| 33 |
+
# Import field registry manager (field_registry_manager.py)
|
| 34 |
+
try:
|
| 35 |
+
from .field_registry_manager import get_field_registry_manager
|
| 36 |
+
REGISTRY_AVAILABLE = True
|
| 37 |
+
except ImportError:
|
| 38 |
+
try:
|
| 39 |
+
from field_registry_manager import get_field_registry_manager
|
| 40 |
+
REGISTRY_AVAILABLE = True
|
| 41 |
+
except ImportError:
|
| 42 |
+
REGISTRY_AVAILABLE = False
|
| 43 |
+
print("β οΈ Field registry manager not available, falling back to legacy extraction")
|
| 44 |
+
|
| 45 |
+
# Configure logging for this module
|
| 46 |
+
logger = logging.getLogger(__name__)
|
| 47 |
+
|
| 48 |
+
class DataSource(Enum):
    """Enumeration of data sources for provenance tracking"""
    HF_API = "huggingface_api"                    # returned directly by the Hugging Face API
    MODEL_CARD = "model_card_yaml"                # YAML frontmatter of the model card
    README_TEXT = "readme_text"                   # free text of the README, via regex matching
    CONFIG_FILE = "config_file"                   # config.json / tokenizer_config.json
    REPOSITORY_FILES = "repository_files"         # other files in the model repository
    EXTERNAL_REFERENCE = "external_reference"     # generated URLs (repo, file tree, datasets)
    INTELLIGENT_DEFAULT = "intelligent_default"   # inferred from the model id / conventions
    PLACEHOLDER = "placeholder"                   # fallback value; no real data was found
    REGISTRY_DRIVEN = "registry_driven"           # produced by the registry-driven pipeline
|
| 59 |
+
|
| 60 |
+
class ConfidenceLevel(Enum):
    """Confidence levels for extracted data"""
    HIGH = "high"      # Direct API data, official sources
    MEDIUM = "medium"  # Inferred from reliable patterns
    LOW = "low"        # Weak inference or pattern matching
    NONE = "none"      # Placeholder values
|
| 66 |
+
|
| 67 |
+
@dataclass
class ExtractionResult:
    """Container for extraction results with full provenance"""
    value: Any                   # the extracted value (None when extraction failed)
    source: DataSource           # where the value came from
    confidence: ConfidenceLevel  # how trustworthy the value is
    extraction_method: str       # name of the strategy that produced the value
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; switching to
    # datetime.now(timezone.utc) would also change the ISO string (adds "+00:00") —
    # confirm downstream consumers before changing.
    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    # Strategies attempted before the final one (intended for fallback/failure cases).
    fallback_chain: List[str] = field(default_factory=list)

    def __str__(self):
        return f"{self.value} (source: {self.source.value}, confidence: {self.confidence.value})"
|
| 79 |
+
|
| 80 |
+
class EnhancedExtractor:
|
| 81 |
+
"""
|
| 82 |
+
Registry-integrated enhanced extractor that automatically picks up new fields
|
| 83 |
+
from the JSON registry (field_registry.json) without requiring code changes.
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
    def __init__(self, hf_api: Optional[HfApi] = None, field_registry_manager=None):
        """
        Initialize the enhanced extractor with registry integration
        (field_registry.json via field_registry_manager.py).

        Args:
            hf_api: Optional HuggingFace API instance (will create if not provided)
            field_registry_manager: Optional registry manager instance; when omitted,
                one is obtained from get_field_registry_manager() if available
        """
        self.hf_api = hf_api or HfApi()
        # Per-field ExtractionResult provenance, reset for every extract_metadata() run.
        self.extraction_results = {}

        # Initialize registry manager (field_registry_manager.py).
        # Failures here are non-fatal: the extractor degrades to legacy mode.
        self.registry_manager = field_registry_manager
        if not self.registry_manager and REGISTRY_AVAILABLE:
            try:
                self.registry_manager = get_field_registry_manager()
                logger.info("✅ Registry manager initialized successfully")
            except Exception as e:
                logger.warning(f"⚠️ Could not initialize registry manager: {e}")
                self.registry_manager = None

        # Load registry fields; an empty dict later triggers the legacy path.
        self.registry_fields = {}
        if self.registry_manager:
            try:
                registry = self.registry_manager.registry
                self.registry_fields = registry.get('fields', {})
                logger.info(f"✅ Loaded {len(self.registry_fields)} fields from registry")
            except Exception as e:
                logger.error(f"❌ Error loading registry fields: {e}")
                self.registry_fields = {}

        # Configure logging (adds a stream handler if none is configured yet).
        self._setup_logging()

        # Compile regex patterns for text extraction (populates self.patterns).
        self._compile_patterns()

        logger.info(f"Enhanced extractor initialized (registry-driven: {bool(self.registry_fields)})")
|
| 125 |
+
|
| 126 |
+
def _setup_logging(self):
|
| 127 |
+
"""Setup logging configuration for detailed extraction tracking"""
|
| 128 |
+
# Ensure a logger that will show in HF Spaces
|
| 129 |
+
if not logger.handlers:
|
| 130 |
+
handler = logging.StreamHandler()
|
| 131 |
+
formatter = logging.Formatter(
|
| 132 |
+
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 133 |
+
)
|
| 134 |
+
handler.setFormatter(formatter)
|
| 135 |
+
logger.addHandler(handler)
|
| 136 |
+
logger.setLevel(logging.INFO)
|
| 137 |
+
|
| 138 |
+
def _compile_patterns(self):
|
| 139 |
+
"""Compile regex patterns for text extraction"""
|
| 140 |
+
self.patterns = {
|
| 141 |
+
'license': [
|
| 142 |
+
r'license[:\s]+([a-zA-Z0-9\-\.]+)',
|
| 143 |
+
r'licensed under[:\s]+([a-zA-Z0-9\-\.]+)',
|
| 144 |
+
r'released under[:\s]+([a-zA-Z0-9\-\.]+)',
|
| 145 |
+
],
|
| 146 |
+
'datasets': [
|
| 147 |
+
r'trained on[:\s]+([a-zA-Z0-9\-\_\/]+)',
|
| 148 |
+
r'dataset[:\s]+([a-zA-Z0-9\-\_\/]+)',
|
| 149 |
+
r'using[:\s]+([a-zA-Z0-9\-\_\/]+)\s+dataset',
|
| 150 |
+
],
|
| 151 |
+
'metrics': [
|
| 152 |
+
r'([a-zA-Z]+)[:\s]+([0-9\.]+)',
|
| 153 |
+
r'achieves[:\s]+([0-9\.]+)[:\s]+([a-zA-Z]+)',
|
| 154 |
+
],
|
| 155 |
+
'model_type': [
|
| 156 |
+
r'model type[:\s]+([a-zA-Z0-9\-]+)',
|
| 157 |
+
r'architecture[:\s]+([a-zA-Z0-9\-]+)',
|
| 158 |
+
],
|
| 159 |
+
'energy': [
|
| 160 |
+
r'energy[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
| 161 |
+
r'power[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
| 162 |
+
r'consumption[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
| 163 |
+
],
|
| 164 |
+
'limitations': [
|
| 165 |
+
r'limitation[s]?[:\s]+([^\.]+)',
|
| 166 |
+
r'known issue[s]?[:\s]+([^\.]+)',
|
| 167 |
+
r'constraint[s]?[:\s]+([^\.]+)',
|
| 168 |
+
],
|
| 169 |
+
'safety': [
|
| 170 |
+
r'safety[:\s]+([^\.]+)',
|
| 171 |
+
r'risk[s]?[:\s]+([^\.]+)',
|
| 172 |
+
r'bias[:\s]+([^\.]+)',
|
| 173 |
+
]
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
# Compile all patterns
|
| 177 |
+
for category, pattern_list in self.patterns.items():
|
| 178 |
+
self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
|
| 179 |
+
|
| 180 |
+
    def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Main extraction method with full registry integration.

        This method automatically discovers all fields from the registry and attempts
        to extract them without requiring code changes when new fields are added.
        Falls back to the legacy layered extraction when no registry is loaded.

        Args:
            model_id: Hugging Face model identifier
            model_info: Model information from HF API
            model_card: Model card object from HF

        Returns:
            Dictionary of extracted metadata (keys whose value is None are dropped)
        """
        logger.info(f"🔍 Starting registry-driven extraction for model: {model_id}")

        # Reset per-model provenance tracking before a new run.
        self.extraction_results = {}
        metadata = {}

        if self.registry_fields:
            # Registry-driven extraction
            logger.info(f"📋 Registry-driven mode: Attempting extraction for {len(self.registry_fields)} fields")
            metadata = self._registry_driven_extraction(model_id, model_info, model_card)
        else:
            # Fallback to legacy extraction
            logger.warning("⚠️ Registry not available, falling back to legacy extraction")
            metadata = self._legacy_extraction(model_id, model_info, model_card)

        # Log extraction summary
        self._log_extraction_summary(model_id, metadata)

        # Return metadata in the same format as original method
        # (None values indicate failed extractions and are filtered out).
        return {k: v for k, v in metadata.items() if v is not None}
|
| 215 |
+
|
| 216 |
+
    def _registry_driven_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Registry-driven extraction that automatically processes all registry fields.

        Downloads the README and config files once up front (shared context),
        then runs the per-field strategy chain for every field declared in the
        registry. Individual field failures are logged and skipped so one bad
        field never aborts the whole run.
        """
        metadata = {}

        # Prepare extraction context — fetched once, reused for every field.
        extraction_context = {
            'model_id': model_id,
            'model_info': model_info,
            'model_card': model_card,
            'readme_content': self._get_readme_content(model_card, model_id),
            'config_data': self._download_and_parse_config(model_id, "config.json"),
            'tokenizer_config': self._download_and_parse_config(model_id, "tokenizer_config.json")
        }

        # Process each field from the registry
        successful_extractions = 0
        failed_extractions = 0

        for field_name, field_config in self.registry_fields.items():
            try:
                logger.info(f"🔧 Attempting extraction for field: {field_name}")

                # Extract field using registry configuration
                extracted_value = self._extract_registry_field(field_name, field_config, extraction_context)

                if extracted_value is not None:
                    metadata[field_name] = extracted_value
                    successful_extractions += 1
                    logger.info(f"✅ Successfully extracted {field_name}: {extracted_value}")
                else:
                    failed_extractions += 1
                    logger.info(f"❌ Failed to extract {field_name}")

            except Exception as e:
                failed_extractions += 1
                logger.error(f"❌ Error extracting {field_name}: {e}")
                # Continue with other fields - individual failures don't stop the process
                continue

        logger.info(f"📊 Registry extraction complete: {successful_extractions} successful, {failed_extractions} failed")

        # Add external references (repo / file tree / commit / dataset URLs).
        metadata.update(self._generate_external_references(model_id, metadata))

        return metadata
|
| 263 |
+
|
| 264 |
+
def _extract_registry_field(self, field_name: str, field_config: Dict[str, Any], context: Dict[str, Any]) -> Any:
|
| 265 |
+
"""
|
| 266 |
+
Extract a single field based on its registry configuration.
|
| 267 |
+
|
| 268 |
+
This method uses multiple extraction strategies in order of preference:
|
| 269 |
+
1. Direct API extraction
|
| 270 |
+
2. Model card YAML extraction
|
| 271 |
+
3. Text pattern matching
|
| 272 |
+
4. Intelligent inference
|
| 273 |
+
5. Fallback values
|
| 274 |
+
"""
|
| 275 |
+
extraction_methods = []
|
| 276 |
+
|
| 277 |
+
# Strategy 1: Direct API extraction
|
| 278 |
+
api_value = self._try_api_extraction(field_name, context)
|
| 279 |
+
if api_value is not None:
|
| 280 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 281 |
+
value=api_value,
|
| 282 |
+
source=DataSource.HF_API,
|
| 283 |
+
confidence=ConfidenceLevel.HIGH,
|
| 284 |
+
extraction_method="api_direct"
|
| 285 |
+
)
|
| 286 |
+
extraction_methods.append("api_direct")
|
| 287 |
+
return api_value
|
| 288 |
+
|
| 289 |
+
# Strategy 2: Model card YAML extraction
|
| 290 |
+
yaml_value = self._try_model_card_extraction(field_name, context)
|
| 291 |
+
if yaml_value is not None:
|
| 292 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 293 |
+
value=yaml_value,
|
| 294 |
+
source=DataSource.MODEL_CARD,
|
| 295 |
+
confidence=ConfidenceLevel.HIGH,
|
| 296 |
+
extraction_method="model_card_yaml"
|
| 297 |
+
)
|
| 298 |
+
extraction_methods.append("model_card_yaml")
|
| 299 |
+
return yaml_value
|
| 300 |
+
|
| 301 |
+
# Strategy 3: Configuration file extraction
|
| 302 |
+
config_value = self._try_config_extraction(field_name, context)
|
| 303 |
+
if config_value is not None:
|
| 304 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 305 |
+
value=config_value,
|
| 306 |
+
source=DataSource.CONFIG_FILE,
|
| 307 |
+
confidence=ConfidenceLevel.HIGH,
|
| 308 |
+
extraction_method="config_file"
|
| 309 |
+
)
|
| 310 |
+
extraction_methods.append("config_file")
|
| 311 |
+
return config_value
|
| 312 |
+
|
| 313 |
+
# Strategy 4: Text pattern extraction
|
| 314 |
+
text_value = self._try_text_pattern_extraction(field_name, context)
|
| 315 |
+
if text_value is not None:
|
| 316 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 317 |
+
value=text_value,
|
| 318 |
+
source=DataSource.README_TEXT,
|
| 319 |
+
confidence=ConfidenceLevel.MEDIUM,
|
| 320 |
+
extraction_method="text_pattern"
|
| 321 |
+
)
|
| 322 |
+
extraction_methods.append("text_pattern")
|
| 323 |
+
return text_value
|
| 324 |
+
|
| 325 |
+
# Strategy 5: Intelligent inference
|
| 326 |
+
inferred_value = self._try_intelligent_inference(field_name, context)
|
| 327 |
+
if inferred_value is not None:
|
| 328 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 329 |
+
value=inferred_value,
|
| 330 |
+
source=DataSource.INTELLIGENT_DEFAULT,
|
| 331 |
+
confidence=ConfidenceLevel.MEDIUM,
|
| 332 |
+
extraction_method="intelligent_inference"
|
| 333 |
+
)
|
| 334 |
+
extraction_methods.append("intelligent_inference")
|
| 335 |
+
return inferred_value
|
| 336 |
+
|
| 337 |
+
# Strategy 6: Fallback value (if configured)
|
| 338 |
+
fallback_value = self._try_fallback_value(field_name, field_config)
|
| 339 |
+
if fallback_value is not None:
|
| 340 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 341 |
+
value=fallback_value,
|
| 342 |
+
source=DataSource.PLACEHOLDER,
|
| 343 |
+
confidence=ConfidenceLevel.NONE,
|
| 344 |
+
extraction_method="fallback_placeholder",
|
| 345 |
+
fallback_chain=extraction_methods
|
| 346 |
+
)
|
| 347 |
+
return fallback_value
|
| 348 |
+
|
| 349 |
+
# No extraction successful
|
| 350 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 351 |
+
value=None,
|
| 352 |
+
source=DataSource.PLACEHOLDER,
|
| 353 |
+
confidence=ConfidenceLevel.NONE,
|
| 354 |
+
extraction_method="extraction_failed",
|
| 355 |
+
fallback_chain=extraction_methods
|
| 356 |
+
)
|
| 357 |
+
return None
|
| 358 |
+
|
| 359 |
+
def _try_api_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 360 |
+
"""Try to extract field from HuggingFace API data"""
|
| 361 |
+
model_info = context.get('model_info')
|
| 362 |
+
if not model_info:
|
| 363 |
+
return None
|
| 364 |
+
|
| 365 |
+
# Field mapping for API extraction
|
| 366 |
+
api_mappings = {
|
| 367 |
+
'author': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
| 368 |
+
'name': lambda info: getattr(info, 'modelId', context['model_id']).split('/')[-1],
|
| 369 |
+
'tags': lambda info: getattr(info, 'tags', []),
|
| 370 |
+
'pipeline_tag': lambda info: getattr(info, 'pipeline_tag', None),
|
| 371 |
+
'downloads': lambda info: getattr(info, 'downloads', 0),
|
| 372 |
+
'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
|
| 373 |
+
'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
| 374 |
+
'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
|
| 375 |
+
'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main"
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
if field_name in api_mappings:
|
| 379 |
+
try:
|
| 380 |
+
return api_mappings[field_name](model_info)
|
| 381 |
+
except Exception as e:
|
| 382 |
+
logger.debug(f"API extraction failed for {field_name}: {e}")
|
| 383 |
+
return None
|
| 384 |
+
|
| 385 |
+
return None
|
| 386 |
+
|
| 387 |
+
def _try_model_card_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 388 |
+
"""Try to extract field from model card YAML frontmatter"""
|
| 389 |
+
model_card = context.get('model_card')
|
| 390 |
+
if not model_card or not hasattr(model_card, 'data') or not model_card.data:
|
| 391 |
+
return None
|
| 392 |
+
|
| 393 |
+
try:
|
| 394 |
+
card_data = model_card.data.to_dict() if hasattr(model_card.data, 'to_dict') else {}
|
| 395 |
+
|
| 396 |
+
# Field mapping for model card extraction
|
| 397 |
+
card_mappings = {
|
| 398 |
+
'license': 'license',
|
| 399 |
+
'language': 'language',
|
| 400 |
+
'library_name': 'library_name',
|
| 401 |
+
'base_model': 'base_model',
|
| 402 |
+
'datasets': 'datasets',
|
| 403 |
+
'description': ['model_summary', 'description'],
|
| 404 |
+
'typeOfModel': 'model_type',
|
| 405 |
+
'licenses': 'license' # Alternative mapping
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
if field_name in card_mappings:
|
| 409 |
+
mapping = card_mappings[field_name]
|
| 410 |
+
if isinstance(mapping, list):
|
| 411 |
+
# Try multiple keys
|
| 412 |
+
for key in mapping:
|
| 413 |
+
value = card_data.get(key)
|
| 414 |
+
if value:
|
| 415 |
+
return value
|
| 416 |
+
else:
|
| 417 |
+
# Single key
|
| 418 |
+
return card_data.get(mapping)
|
| 419 |
+
|
| 420 |
+
# Direct field name lookup
|
| 421 |
+
return card_data.get(field_name)
|
| 422 |
+
|
| 423 |
+
except Exception as e:
|
| 424 |
+
logger.debug(f"Model card extraction failed for {field_name}: {e}")
|
| 425 |
+
return None
|
| 426 |
+
|
| 427 |
+
def _try_config_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 428 |
+
"""Try to extract field from configuration files"""
|
| 429 |
+
config_data = context.get('config_data')
|
| 430 |
+
tokenizer_config = context.get('tokenizer_config')
|
| 431 |
+
|
| 432 |
+
# Config file mappings
|
| 433 |
+
config_mappings = {
|
| 434 |
+
'model_type': ('config_data', 'model_type'),
|
| 435 |
+
'architectures': ('config_data', 'architectures'),
|
| 436 |
+
'vocab_size': ('config_data', 'vocab_size'),
|
| 437 |
+
'tokenizer_class': ('tokenizer_config', 'tokenizer_class'),
|
| 438 |
+
'typeOfModel': ('config_data', 'model_type')
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
if field_name in config_mappings:
|
| 442 |
+
config_type, config_key = config_mappings[field_name]
|
| 443 |
+
config_source = context.get(config_type)
|
| 444 |
+
if config_source:
|
| 445 |
+
return config_source.get(config_key)
|
| 446 |
+
|
| 447 |
+
return None
|
| 448 |
+
|
| 449 |
+
def _try_text_pattern_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 450 |
+
"""Try to extract field using text pattern matching"""
|
| 451 |
+
readme_content = context.get('readme_content')
|
| 452 |
+
if not readme_content:
|
| 453 |
+
return None
|
| 454 |
+
|
| 455 |
+
# Pattern mappings for different fields
|
| 456 |
+
pattern_mappings = {
|
| 457 |
+
'license': 'license',
|
| 458 |
+
'datasets': 'datasets',
|
| 459 |
+
'energyConsumption': 'energy',
|
| 460 |
+
'limitation': 'limitations',
|
| 461 |
+
'safetyRiskAssessment': 'safety',
|
| 462 |
+
'model_type': 'model_type'
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
if field_name in pattern_mappings:
|
| 466 |
+
pattern_key = pattern_mappings[field_name]
|
| 467 |
+
if pattern_key in self.patterns:
|
| 468 |
+
matches = self._find_pattern_matches(readme_content, self.patterns[pattern_key])
|
| 469 |
+
if matches:
|
| 470 |
+
return matches[0] if len(matches) == 1 else matches
|
| 471 |
+
|
| 472 |
+
return None
|
| 473 |
+
|
| 474 |
+
def _try_intelligent_inference(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 475 |
+
"""Try to infer field value from other available data"""
|
| 476 |
+
model_id = context['model_id']
|
| 477 |
+
|
| 478 |
+
# Intelligent inference rules
|
| 479 |
+
inference_rules = {
|
| 480 |
+
'author': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
|
| 481 |
+
'suppliedBy': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
|
| 482 |
+
'name': lambda: model_id.split('/')[-1],
|
| 483 |
+
'primaryPurpose': lambda: 'text-generation', # Default for most HF models
|
| 484 |
+
'typeOfModel': lambda: 'transformer', # Default for most HF models
|
| 485 |
+
'downloadLocation': lambda: f"https://huggingface.co/{model_id}/tree/main",
|
| 486 |
+
'bomFormat': lambda: 'CycloneDX',
|
| 487 |
+
'specVersion': lambda: '1.6',
|
| 488 |
+
'serialNumber': lambda: f"urn:uuid:{model_id.replace('/', '-')}",
|
| 489 |
+
'version': lambda: '1.0.0'
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
if field_name in inference_rules:
|
| 493 |
+
try:
|
| 494 |
+
return inference_rules[field_name]()
|
| 495 |
+
except Exception as e:
|
| 496 |
+
logger.debug(f"Intelligent inference failed for {field_name}: {e}")
|
| 497 |
+
return None
|
| 498 |
+
|
| 499 |
+
return None
|
| 500 |
+
|
| 501 |
+
def _try_fallback_value(self, field_name: str, field_config: Dict[str, Any]) -> Any:
|
| 502 |
+
"""Try to get fallback value from field configuration"""
|
| 503 |
+
# Check if field config has fallback value
|
| 504 |
+
if isinstance(field_config, dict):
|
| 505 |
+
fallback = field_config.get('fallback_value')
|
| 506 |
+
if fallback:
|
| 507 |
+
return fallback
|
| 508 |
+
|
| 509 |
+
# Standard fallback values for common fields
|
| 510 |
+
standard_fallbacks = {
|
| 511 |
+
'license': 'NOASSERTION',
|
| 512 |
+
'description': 'No description available',
|
| 513 |
+
'version': '1.0.0',
|
| 514 |
+
'bomFormat': 'CycloneDX',
|
| 515 |
+
'specVersion': '1.6'
|
| 516 |
+
}
|
| 517 |
+
|
| 518 |
+
return standard_fallbacks.get(field_name)
|
| 519 |
+
|
| 520 |
+
    def _legacy_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Fallback to legacy extraction when registry is not available.
        This maintains backward compatibility.

        Runs the five historical extraction layers in order; later layers
        overwrite earlier ones on key collisions (dict.update semantics),
        and layers 4 and 5 see the metadata accumulated so far.
        """
        logger.info("🔄 Executing legacy extraction mode")
        metadata = {}

        # Execute legacy extraction layers
        metadata.update(self._layer1_structured_api(model_id, model_info, model_card))
        metadata.update(self._layer2_repository_files(model_id))
        metadata.update(self._layer3_stp_extraction(model_card, model_id))
        metadata.update(self._layer4_external_references(model_id, metadata))
        metadata.update(self._layer5_intelligent_defaults(model_id, metadata))

        return metadata
|
| 536 |
+
|
| 537 |
+
def _generate_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 538 |
+
"""Generate external references for the model"""
|
| 539 |
+
external_refs = []
|
| 540 |
+
|
| 541 |
+
# Model repository
|
| 542 |
+
repo_url = f"https://huggingface.co/{model_id}"
|
| 543 |
+
external_refs.append({
|
| 544 |
+
"type": "website",
|
| 545 |
+
"url": repo_url,
|
| 546 |
+
"comment": "Model repository"
|
| 547 |
+
})
|
| 548 |
+
|
| 549 |
+
# Model files
|
| 550 |
+
files_url = f"https://huggingface.co/{model_id}/tree/main"
|
| 551 |
+
external_refs.append({
|
| 552 |
+
"type": "distribution",
|
| 553 |
+
"url": files_url,
|
| 554 |
+
"comment": "Model files"
|
| 555 |
+
})
|
| 556 |
+
|
| 557 |
+
# Commit URL if available
|
| 558 |
+
if 'commit' in metadata:
|
| 559 |
+
commit_url = f"https://huggingface.co/{model_id}/commit/{metadata['commit']}"
|
| 560 |
+
external_refs.append({
|
| 561 |
+
"type": "vcs",
|
| 562 |
+
"url": commit_url,
|
| 563 |
+
"comment": "Specific commit"
|
| 564 |
+
})
|
| 565 |
+
|
| 566 |
+
# Dataset references
|
| 567 |
+
if 'datasets' in metadata:
|
| 568 |
+
datasets = metadata['datasets']
|
| 569 |
+
if isinstance(datasets, list):
|
| 570 |
+
for dataset in datasets:
|
| 571 |
+
if isinstance(dataset, str):
|
| 572 |
+
dataset_url = f"https://huggingface.co/datasets/{dataset}"
|
| 573 |
+
external_refs.append({
|
| 574 |
+
"type": "distribution",
|
| 575 |
+
"url": dataset_url,
|
| 576 |
+
"comment": f"Training dataset: {dataset}"
|
| 577 |
+
})
|
| 578 |
+
|
| 579 |
+
result = {'external_references': external_refs}
|
| 580 |
+
|
| 581 |
+
self.extraction_results['external_references'] = ExtractionResult(
|
| 582 |
+
value=external_refs,
|
| 583 |
+
source=DataSource.EXTERNAL_REFERENCE,
|
| 584 |
+
confidence=ConfidenceLevel.HIGH,
|
| 585 |
+
extraction_method="url_generation"
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
+
return result
|
| 589 |
+
|
| 590 |
+
# Legacy methods for backward compatibility
|
| 591 |
+
def _layer1_structured_api(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
    """Legacy Layer 1: Enhanced structured data extraction from HF API and model card."""
    logger.info("π Executing Legacy Layer 1: Enhanced Structured API Extraction")
    metadata: Dict[str, Any] = {}

    # First pass: structured fields exposed by the HF API model_info object.
    if model_info:
        try:
            # Author falls back to the namespace portion of the model id.
            owner = getattr(model_info, "author", None)
            if not owner or not owner.strip():
                segments = model_id.split("/")
                owner = segments[0] if len(segments) > 1 else "unknown"

            metadata['author'] = owner
            metadata['name'] = getattr(model_info, "modelId", model_id).split("/")[-1]
            metadata['tags'] = getattr(model_info, "tags", [])
            metadata['pipeline_tag'] = getattr(model_info, "pipeline_tag", None)
            metadata['downloads'] = getattr(model_info, "downloads", 0)

            # Abbreviate the commit SHA to the conventional 7 characters.
            sha = getattr(model_info, "sha", None)
            if sha:
                metadata['commit'] = sha[:7]

        except Exception as e:
            logger.error(f"β Legacy Layer 1: Error extracting from model_info: {e}")

    # Second pass: structured YAML front-matter of the model card, if present.
    if model_card and hasattr(model_card, "data") and model_card.data:
        try:
            card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}

            metadata['license'] = card_data.get("license")
            metadata['language'] = card_data.get("language")
            metadata['library_name'] = card_data.get("library_name")
            metadata['base_model'] = card_data.get("base_model")
            metadata['datasets'] = card_data.get("datasets")
            metadata['description'] = card_data.get("model_summary") or card_data.get("description")

        except Exception as e:
            logger.error(f"β Legacy Layer 1: Error extracting from model card: {e}")

    # Standard AI metadata defaults expected by downstream SBOM assembly.
    metadata["primaryPurpose"] = metadata.get("pipeline_tag", "text-generation")
    metadata["suppliedBy"] = metadata.get("author", "unknown")
    metadata["typeOfModel"] = "transformer"

    return metadata
|
| 640 |
+
|
| 641 |
+
def _layer2_repository_files(self, model_id: str) -> Dict[str, Any]:
    """Legacy Layer 2: Repository file analysis"""
    logger.info("π§ Executing Legacy Layer 2: Repository File Analysis")
    metadata: Dict[str, Any] = {}

    try:
        # Architecture details live in config.json.
        model_config = self._download_and_parse_config(model_id, "config.json")
        if model_config:
            metadata['model_type'] = model_config.get("model_type")
            metadata['architectures'] = model_config.get("architectures", [])
            metadata['vocab_size'] = model_config.get("vocab_size")

        # The tokenizer class comes from tokenizer_config.json.
        tok_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
        if tok_config:
            metadata['tokenizer_class'] = tok_config.get("tokenizer_class")

    except Exception as e:
        logger.warning(f"β οΈ Legacy Layer 2: Could not analyze repository files: {e}")

    return metadata
|
| 661 |
+
|
| 662 |
+
def _layer3_stp_extraction(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
    """Legacy Layer 3: Smart Text Parsing"""
    logger.info("π Executing Legacy Layer 3: Smart Text Parsing")
    metadata: Dict[str, Any] = {}

    try:
        # Parse the README free text for license/dataset/metric mentions.
        readme = self._get_readme_content(model_card, model_id)
        if readme:
            metadata.update(self._extract_from_text(readme))
    except Exception as e:
        logger.warning(f"β οΈ Legacy Layer 3: Error in Smart Text Parsing: {e}")

    return metadata
|
| 676 |
+
|
| 677 |
+
def _layer4_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Legacy Layer 4: External reference generation"""
    # Thin delegate kept for backward compatibility with the layered API.
    logger.info("π Executing Legacy Layer 4: External Reference Generation")
    return self._generate_external_references(model_id, metadata)
|
| 681 |
+
|
| 682 |
+
def _layer5_intelligent_defaults(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Legacy Layer 5: Intelligent default generation"""
    logger.info("π§ Executing Legacy Layer 5: Intelligent Default Generation")

    # Derive the author from the model-id namespace when absent or empty.
    if not metadata.get('author'):
        segments = model_id.split("/")
        metadata['author'] = segments[0] if len(segments) > 1 else "unknown"

    # SPDX convention for "no license asserted".
    if not metadata.get('license'):
        metadata['license'] = "NOASSERTION"

    return metadata
|
| 694 |
+
|
| 695 |
+
# Utility methods
|
| 696 |
+
def _download_and_parse_config(self, model_id: str, filename: str) -> Optional[Dict[str, Any]]:
    """Download and parse a JSON configuration file from the model repository.

    Args:
        model_id: Hugging Face repository identifier.
        filename: Name of the JSON file to fetch (e.g. "config.json").

    Returns:
        The parsed JSON dict, or None when the file is absent or unparseable.
    """
    try:
        file_path = hf_hub_download(repo_id=model_id, filename=filename)
        with open(file_path, 'r') as f:
            return json.load(f)
    except (RepositoryNotFoundError, EntryNotFoundError, json.JSONDecodeError) as e:
        # Expected failure modes (missing repo/file, malformed JSON): log quietly.
        # Fix: include the actual filename in the message (was the literal
        # "(unknown)"), so failures are diagnosable per file.
        logger.debug(f"Could not download/parse {filename}: {e}")
        return None
    except Exception as e:
        logger.warning(f"Unexpected error downloading {filename}: {e}")
        return None
|
| 708 |
+
|
| 709 |
+
def _get_readme_content(self, model_card: Optional[ModelCard], model_id: str) -> Optional[str]:
    """Get README content from the model card or by downloading README.md.

    Args:
        model_card: Already-loaded model card, if any.
        model_id: Hugging Face repository identifier used for the fallback download.

    Returns:
        README text, or None when neither source yields content.
    """
    try:
        # Prefer the already-loaded model card text. Fix: only return it when
        # it is actually non-empty — previously a card with a None/empty
        # .content short-circuited without ever attempting the download.
        if model_card and getattr(model_card, 'content', None):
            return model_card.content

        readme_path = hf_hub_download(repo_id=model_id, filename="README.md")
        with open(readme_path, 'r', encoding='utf-8') as f:
            return f.read()

    except Exception as e:
        logger.debug(f"Could not get README content: {e}")
        return None
|
| 722 |
+
|
| 723 |
+
def _extract_from_text(self, text: str) -> Dict[str, Any]:
    """Extract structured information (license, datasets, metrics) from free text."""
    metadata: Dict[str, Any] = {}

    # License: keep only the first candidate found.
    license_hits = self._find_pattern_matches(text, self.patterns['license'])
    if license_hits:
        metadata['license_from_text'] = license_hits[0]

    # Datasets: keep every distinct mention.
    dataset_hits = self._find_pattern_matches(text, self.patterns['datasets'])
    if dataset_hits:
        metadata['datasets_from_text'] = dataset_hits

    # Performance metrics (accuracy, f1, bleu, rouge).
    metrics = self._extract_metrics(text)
    if metrics:
        metadata['performance_metrics'] = metrics

    return metadata
|
| 743 |
+
|
| 744 |
+
def _find_pattern_matches(self, text: str, patterns: List[re.Pattern]) -> List[str]:
|
| 745 |
+
"""Find matches for a list of regex patterns in text"""
|
| 746 |
+
matches = []
|
| 747 |
+
for pattern in patterns:
|
| 748 |
+
found = pattern.findall(text)
|
| 749 |
+
matches.extend(found)
|
| 750 |
+
return list(set(matches)) # Remove duplicates
|
| 751 |
+
|
| 752 |
+
def _extract_metrics(self, text: str) -> Dict[str, float]:
|
| 753 |
+
"""Extract performance metrics from text"""
|
| 754 |
+
metrics = {}
|
| 755 |
+
|
| 756 |
+
metric_patterns = [
|
| 757 |
+
r'accuracy[:\s]+([0-9\.]+)',
|
| 758 |
+
r'f1[:\s]+([0-9\.]+)',
|
| 759 |
+
r'bleu[:\s]+([0-9\.]+)',
|
| 760 |
+
r'rouge[:\s]+([0-9\.]+)',
|
| 761 |
+
]
|
| 762 |
+
|
| 763 |
+
for pattern_str in metric_patterns:
|
| 764 |
+
pattern = re.compile(pattern_str, re.IGNORECASE)
|
| 765 |
+
matches = pattern.findall(text)
|
| 766 |
+
if matches:
|
| 767 |
+
metric_name = pattern_str.split('[')[0]
|
| 768 |
+
try:
|
| 769 |
+
metrics[metric_name] = float(matches[0])
|
| 770 |
+
except ValueError:
|
| 771 |
+
continue
|
| 772 |
+
|
| 773 |
+
return metrics
|
| 774 |
+
|
| 775 |
+
def _log_extraction_summary(self, model_id: str, metadata: Dict[str, Any]):
    """Log a human-readable summary of the extraction run (coverage, confidence, sources)."""
    separator = "=" * 60
    logger.info(separator)
    logger.info(f"π REGISTRY-DRIVEN EXTRACTION SUMMARY FOR: {model_id}")
    logger.info(separator)

    if self.registry_fields:
        logger.info(f"π Registry fields available: {len(self.registry_fields)}")
        logger.info(f"π Total fields extracted: {len(self.extraction_results)}")

        # Tally extracted fields by confidence level and by data source.
        confidence_counts: Dict[str, int] = {}
        source_counts: Dict[str, int] = {}
        for result in self.extraction_results.values():
            level = result.confidence.value
            origin = result.source.value
            confidence_counts[level] = confidence_counts.get(level, 0) + 1
            source_counts[origin] = source_counts.get(origin, 0) + 1

        logger.info("π Confidence distribution:")
        for level, tally in confidence_counts.items():
            logger.info(f" {level}: {tally} fields")

        logger.info("π Source distribution:")
        for origin, tally in source_counts.items():
            logger.info(f" {origin}: {tally} fields")

        # Percentage of registry fields that were actually extracted.
        extracted = set(self.extraction_results.keys())
        expected = set(self.registry_fields.keys())
        coverage = len(extracted & expected) / len(expected) * 100
        logger.info(f"π Registry field coverage: {coverage:.1f}%")

        # Call out any registry fields we could not extract.
        remaining = expected - extracted
        if remaining:
            logger.info(f"β Missing registry fields: {', '.join(sorted(remaining))}")
    else:
        logger.info(f"π Legacy extraction mode: {len(metadata)} fields extracted")

    logger.info(separator)
|
| 817 |
+
|
| 818 |
+
def get_extraction_results(self) -> Dict[str, ExtractionResult]:
    """Return a shallow copy of the per-field extraction results with provenance."""
    # Copy so callers cannot mutate the extractor's internal state.
    return dict(self.extraction_results)
|
| 821 |
+
|
| 822 |
+
|
| 823 |
+
# Convenience function for drop-in replacement
|
| 824 |
+
def extract_enhanced_metadata(model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard], hf_api: Optional[HfApi] = None) -> Dict[str, Any]:
    """
    Drop-in replacement function for _extract_structured_metadata with registry integration.

    This function automatically picks up new fields from the registry without code changes.

    Args:
        model_id: Hugging Face model identifier
        model_info: Model information from HF API
        model_card: Model card object from HF
        hf_api: Optional HuggingFace API instance

    Returns:
        Dictionary of extracted metadata
    """
    # One-shot convenience wrapper around the class-based API.
    return EnhancedExtractor(hf_api).extract_metadata(model_id, model_info, model_card)
|
| 841 |
+
|
| 842 |
+
|
| 843 |
+
if __name__ == "__main__":
    # Manual smoke test for the registry-integrated enhanced extractor.
    import sys

    # Allow overriding the model on the command line; default to a known model.
    test_model_id = sys.argv[1] if len(sys.argv) > 1 else "deepseek-ai/DeepSeek-R1"

    print(f"Testing registry-integrated enhanced extractor with model: {test_model_id}")

    # Initialize HF API
    hf_api = HfApi()

    try:
        # Fetch the model's API record and card, then run the extractor.
        model_info = hf_api.model_info(test_model_id)
        model_card = ModelCard.load(test_model_id)

        extractor = EnhancedExtractor(hf_api)
        metadata = extractor.extract_metadata(test_model_id, model_info, model_card)

        print(f"\nExtracted {len(metadata)} metadata fields:")
        for key, value in metadata.items():
            print(f" {key}: {value}")

        print(f"\nExtraction results with provenance:")
        for field, result in extractor.get_extraction_results().items():
            print(f" {field}: {result}")

    except Exception as e:
        print(f"Error testing extractor: {e}")
|
| 876 |
+
|
src/aibom-generator/field_registry.json
ADDED
|
@@ -0,0 +1,737 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"registry_metadata": {
|
| 3 |
+
"description": "Field registry for configurable AI SBOM generation and scoring"
|
| 4 |
+
},
|
| 5 |
+
"scoring_config": {
|
| 6 |
+
"tier_weights": {
|
| 7 |
+
"critical": 3,
|
| 8 |
+
"important": 2,
|
| 9 |
+
"supplementary": 1
|
| 10 |
+
},
|
| 11 |
+
"category_weights": {
|
| 12 |
+
"required_fields": 20,
|
| 13 |
+
"metadata": 20,
|
| 14 |
+
"component_basic": 20,
|
| 15 |
+
"component_model_card": 30,
|
| 16 |
+
"external_references": 10
|
| 17 |
+
},
|
| 18 |
+
"scoring_profiles": {
|
| 19 |
+
"basic": {
|
| 20 |
+
"description": "Minimal fields required for identification",
|
| 21 |
+
"required_categories": ["required_fields", "component_basic"],
|
| 22 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
|
| 23 |
+
"minimum_score": 40,
|
| 24 |
+
"weight_multiplier": 1.0
|
| 25 |
+
},
|
| 26 |
+
"standard": {
|
| 27 |
+
"description": "Comprehensive fields for proper documentation",
|
| 28 |
+
"required_categories": ["required_fields", "metadata", "component_basic"],
|
| 29 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy"],
|
| 30 |
+
"minimum_score": 70,
|
| 31 |
+
"weight_multiplier": 1.0
|
| 32 |
+
},
|
| 33 |
+
"advanced": {
|
| 34 |
+
"description": "Extensive documentation for maximum transparency",
|
| 35 |
+
"required_categories": ["required_fields", "metadata", "component_basic", "component_model_card", "external_references"],
|
| 36 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy", "type", "purl", "description", "licenses", "hyperparameter", "limitation", "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
|
| 37 |
+
"minimum_score": 85,
|
| 38 |
+
"weight_multiplier": 1.0
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"algorithm_config": {
|
| 42 |
+
"type": "weighted_sum",
|
| 43 |
+
"max_score": 100,
|
| 44 |
+
"normalization": "category_based",
|
| 45 |
+
"penalty_for_missing_critical": 0.5,
|
| 46 |
+
"bonus_for_complete_categories": 0.1
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"aibom_config": {
|
| 50 |
+
"structure_template": "cyclonedx_1.6",
|
| 51 |
+
"generator_info": {
|
| 52 |
+
"name": "aetheris-aibom-generator",
|
| 53 |
+
"version": "1.0",
|
| 54 |
+
"manufacturer": "Aetheris AI"
|
| 55 |
+
},
|
| 56 |
+
"generation_rules": {
|
| 57 |
+
"include_metadata_properties": true,
|
| 58 |
+
"include_model_card": true,
|
| 59 |
+
"include_external_references": true,
|
| 60 |
+
"include_dependencies": true
|
| 61 |
+
},
|
| 62 |
+
"validation_rules": {
|
| 63 |
+
"require_critical_fields": true,
|
| 64 |
+
"validate_jsonpath_expressions": true,
|
| 65 |
+
"enforce_cyclonedx_schema": true
|
| 66 |
+
}
|
| 67 |
+
},
|
| 68 |
+
"fields": {
|
| 69 |
+
"bomFormat": {
|
| 70 |
+
"tier": "critical",
|
| 71 |
+
"weight": 4.0,
|
| 72 |
+
"category": "required_fields",
|
| 73 |
+
"description": "Format identifier for the SBOM",
|
| 74 |
+
"jsonpath": "$.bomFormat",
|
| 75 |
+
"aibom_generation": {
|
| 76 |
+
"location": "$.bomFormat",
|
| 77 |
+
"rule": "always_include",
|
| 78 |
+
"source_fields": ["bomFormat"],
|
| 79 |
+
"validation": "required",
|
| 80 |
+
"data_type": "string"
|
| 81 |
+
},
|
| 82 |
+
"scoring": {
|
| 83 |
+
"points": 4.0,
|
| 84 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 85 |
+
"category_contribution": 0.2
|
| 86 |
+
},
|
| 87 |
+
"validation_message": {
|
| 88 |
+
"missing": "Missing critical field: bomFormat - essential for SBOM identification",
|
| 89 |
+
"recommendation": "Ensure bomFormat is set to 'CycloneDX'"
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"specVersion": {
|
| 93 |
+
"tier": "critical",
|
| 94 |
+
"weight": 4.0,
|
| 95 |
+
"category": "required_fields",
|
| 96 |
+
"description": "CycloneDX specification version",
|
| 97 |
+
"jsonpath": "$.specVersion",
|
| 98 |
+
"aibom_generation": {
|
| 99 |
+
"location": "$.specVersion",
|
| 100 |
+
"rule": "always_include",
|
| 101 |
+
"source_fields": ["specVersion"],
|
| 102 |
+
"validation": "required",
|
| 103 |
+
"data_type": "string"
|
| 104 |
+
},
|
| 105 |
+
"scoring": {
|
| 106 |
+
"points": 4.0,
|
| 107 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 108 |
+
"category_contribution": 0.2
|
| 109 |
+
},
|
| 110 |
+
"validation_message": {
|
| 111 |
+
"missing": "Missing critical field: specVersion - required for CycloneDX compliance",
|
| 112 |
+
"recommendation": "Set specVersion to '1.6' for CycloneDX 1.6 compliance"
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"serialNumber": {
|
| 116 |
+
"tier": "critical",
|
| 117 |
+
"weight": 4.0,
|
| 118 |
+
"category": "required_fields",
|
| 119 |
+
"description": "Unique identifier for this SBOM instance",
|
| 120 |
+
"jsonpath": "$.serialNumber",
|
| 121 |
+
"aibom_generation": {
|
| 122 |
+
"location": "$.serialNumber",
|
| 123 |
+
"rule": "always_include",
|
| 124 |
+
"source_fields": ["serialNumber"],
|
| 125 |
+
"validation": "required",
|
| 126 |
+
"data_type": "string"
|
| 127 |
+
},
|
| 128 |
+
"scoring": {
|
| 129 |
+
"points": 4.0,
|
| 130 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 131 |
+
"category_contribution": 0.2
|
| 132 |
+
},
|
| 133 |
+
"validation_message": {
|
| 134 |
+
"missing": "Missing critical field: serialNumber - unique identifier required",
|
| 135 |
+
"recommendation": "Generate a UUID for the SBOM instance"
|
| 136 |
+
}
|
| 137 |
+
},
|
| 138 |
+
"version": {
|
| 139 |
+
"tier": "critical",
|
| 140 |
+
"weight": 4.0,
|
| 141 |
+
"category": "required_fields",
|
| 142 |
+
"description": "Version of this SBOM document",
|
| 143 |
+
"jsonpath": "$.version",
|
| 144 |
+
"aibom_generation": {
|
| 145 |
+
"location": "$.version",
|
| 146 |
+
"rule": "always_include",
|
| 147 |
+
"source_fields": ["version"],
|
| 148 |
+
"validation": "required",
|
| 149 |
+
"data_type": "integer"
|
| 150 |
+
},
|
| 151 |
+
"scoring": {
|
| 152 |
+
"points": 4.0,
|
| 153 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 154 |
+
"category_contribution": 0.2
|
| 155 |
+
},
|
| 156 |
+
"validation_message": {
|
| 157 |
+
"missing": "Missing critical field: version - document version required",
|
| 158 |
+
"recommendation": "Set version to 1 for initial SBOM generation"
|
| 159 |
+
}
|
| 160 |
+
},
|
| 161 |
+
"primaryPurpose": {
|
| 162 |
+
"tier": "critical",
|
| 163 |
+
"weight": 4.0,
|
| 164 |
+
"category": "metadata",
|
| 165 |
+
"description": "Primary purpose or task of the AI model",
|
| 166 |
+
"jsonpath": "$.metadata.properties[?(@.name=='primaryPurpose')].value",
|
| 167 |
+
"aibom_generation": {
|
| 168 |
+
"location": "$.metadata.properties",
|
| 169 |
+
"rule": "include_if_available",
|
| 170 |
+
"source_fields": ["primaryPurpose", "pipeline_tag", "ai:task"],
|
| 171 |
+
"validation": "recommended",
|
| 172 |
+
"data_type": "string"
|
| 173 |
+
},
|
| 174 |
+
"scoring": {
|
| 175 |
+
"points": 4.0,
|
| 176 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 177 |
+
"category_contribution": 0.2
|
| 178 |
+
},
|
| 179 |
+
"validation_message": {
|
| 180 |
+
"missing": "Missing critical field: primaryPurpose - essential for understanding model intent",
|
| 181 |
+
"recommendation": "Add the primary task or purpose of the AI model"
|
| 182 |
+
}
|
| 183 |
+
},
|
| 184 |
+
"suppliedBy": {
|
| 185 |
+
"tier": "critical",
|
| 186 |
+
"weight": 4.0,
|
| 187 |
+
"category": "metadata",
|
| 188 |
+
"description": "Organization or individual that supplied the model",
|
| 189 |
+
"jsonpath": "$.metadata.properties[?(@.name=='suppliedBy')].value",
|
| 190 |
+
"aibom_generation": {
|
| 191 |
+
"location": "$.metadata.properties",
|
| 192 |
+
"rule": "include_if_available",
|
| 193 |
+
"source_fields": ["suppliedBy", "author", "publisher"],
|
| 194 |
+
"validation": "recommended",
|
| 195 |
+
"data_type": "string"
|
| 196 |
+
},
|
| 197 |
+
"scoring": {
|
| 198 |
+
"points": 4.0,
|
| 199 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 200 |
+
"category_contribution": 0.2
|
| 201 |
+
},
|
| 202 |
+
"validation_message": {
|
| 203 |
+
"missing": "Missing critical field: suppliedBy - supplier identification required",
|
| 204 |
+
"recommendation": "Add the organization or individual who provided the model"
|
| 205 |
+
}
|
| 206 |
+
},
|
| 207 |
+
"standardCompliance": {
|
| 208 |
+
"tier": "supplementary",
|
| 209 |
+
"weight": 1.0,
|
| 210 |
+
"category": "metadata",
|
| 211 |
+
"description": "Standards or regulations the model complies with",
|
| 212 |
+
"jsonpath": "$.metadata.properties[?(@.name=='standardCompliance')].value",
|
| 213 |
+
"aibom_generation": {
|
| 214 |
+
"location": "$.metadata.properties",
|
| 215 |
+
"rule": "include_if_available",
|
| 216 |
+
"source_fields": ["standardCompliance", "compliance"],
|
| 217 |
+
"validation": "optional",
|
| 218 |
+
"data_type": "string"
|
| 219 |
+
},
|
| 220 |
+
"scoring": {
|
| 221 |
+
"points": 1.0,
|
| 222 |
+
"required_for_profiles": ["advanced"],
|
| 223 |
+
"category_contribution": 0.05
|
| 224 |
+
},
|
| 225 |
+
"validation_message": {
|
| 226 |
+
"missing": "Missing supplementary field: standardCompliance - compliance information helpful",
|
| 227 |
+
"recommendation": "Add any relevant standards or regulations the model complies with"
|
| 228 |
+
}
|
| 229 |
+
},
|
| 230 |
+
"domain": {
|
| 231 |
+
"tier": "supplementary",
|
| 232 |
+
"weight": 1.0,
|
| 233 |
+
"category": "metadata",
|
| 234 |
+
"description": "Domain or field of application",
|
| 235 |
+
"jsonpath": "$.metadata.properties[?(@.name=='domain')].value",
|
| 236 |
+
"aibom_generation": {
|
| 237 |
+
"location": "$.metadata.properties",
|
| 238 |
+
"rule": "include_if_available",
|
| 239 |
+
"source_fields": ["domain", "field", "application_area"],
|
| 240 |
+
"validation": "optional",
|
| 241 |
+
"data_type": "string"
|
| 242 |
+
},
|
| 243 |
+
"scoring": {
|
| 244 |
+
"points": 1.0,
|
| 245 |
+
"required_for_profiles": ["advanced"],
|
| 246 |
+
"category_contribution": 0.05
|
| 247 |
+
},
|
| 248 |
+
"validation_message": {
|
| 249 |
+
"missing": "Missing supplementary field: domain - application domain helpful for context",
|
| 250 |
+
"recommendation": "Add the domain or field where this model is typically applied"
|
| 251 |
+
}
|
| 252 |
+
},
|
| 253 |
+
"autonomyType": {
|
| 254 |
+
"tier": "supplementary",
|
| 255 |
+
"weight": 1.0,
|
| 256 |
+
"category": "metadata",
|
| 257 |
+
"description": "Level of autonomy or human involvement required",
|
| 258 |
+
"jsonpath": "$.metadata.properties[?(@.name=='autonomyType')].value",
|
| 259 |
+
"aibom_generation": {
|
| 260 |
+
"location": "$.metadata.properties",
|
| 261 |
+
"rule": "include_if_available",
|
| 262 |
+
"source_fields": ["autonomyType", "autonomy_level"],
|
| 263 |
+
"validation": "optional",
|
| 264 |
+
"data_type": "string"
|
| 265 |
+
},
|
| 266 |
+
"scoring": {
|
| 267 |
+
"points": 1.0,
|
| 268 |
+
"required_for_profiles": ["advanced"],
|
| 269 |
+
"category_contribution": 0.05
|
| 270 |
+
},
|
| 271 |
+
"validation_message": {
|
| 272 |
+
"missing": "Missing supplementary field: autonomyType - autonomy level information helpful",
|
| 273 |
+
"recommendation": "Add information about the level of human oversight required"
|
| 274 |
+
}
|
| 275 |
+
},
|
| 276 |
+
"name": {
|
| 277 |
+
"tier": "critical",
|
| 278 |
+
"weight": 4.0,
|
| 279 |
+
"category": "component_basic",
|
| 280 |
+
"description": "Name of the AI model component",
|
| 281 |
+
"jsonpath": "$.components[0].name",
|
| 282 |
+
"aibom_generation": {
|
| 283 |
+
"location": "$.components[0].name",
|
| 284 |
+
"rule": "always_include",
|
| 285 |
+
"source_fields": ["name", "model_name"],
|
| 286 |
+
"validation": "required",
|
| 287 |
+
"data_type": "string"
|
| 288 |
+
},
|
| 289 |
+
"scoring": {
|
| 290 |
+
"points": 4.0,
|
| 291 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 292 |
+
"category_contribution": 0.2
|
| 293 |
+
},
|
| 294 |
+
"validation_message": {
|
| 295 |
+
"missing": "Missing critical field: name - essential for model identification",
|
| 296 |
+
"recommendation": "Add a descriptive name for the model"
|
| 297 |
+
}
|
| 298 |
+
},
|
| 299 |
+
"type": {
|
| 300 |
+
"tier": "important",
|
| 301 |
+
"weight": 3.0,
|
| 302 |
+
"category": "component_basic",
|
| 303 |
+
"description": "Type of component (machine-learning-model)",
|
| 304 |
+
"jsonpath": "$.components[0].type",
|
| 305 |
+
"aibom_generation": {
|
| 306 |
+
"location": "$.components[0].type",
|
| 307 |
+
"rule": "always_include",
|
| 308 |
+
"source_fields": ["type"],
|
| 309 |
+
"validation": "required",
|
| 310 |
+
"data_type": "string"
|
| 311 |
+
},
|
| 312 |
+
"scoring": {
|
| 313 |
+
"points": 3.0,
|
| 314 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 315 |
+
"category_contribution": 0.15
|
| 316 |
+
},
|
| 317 |
+
"validation_message": {
|
| 318 |
+
"missing": "Missing important field: type - component type classification needed",
|
| 319 |
+
"recommendation": "Set type to 'machine-learning-model' for AI models"
|
| 320 |
+
}
|
| 321 |
+
},
|
| 322 |
+
"purl": {
|
| 323 |
+
"tier": "important",
|
| 324 |
+
"weight": 3.0,
|
| 325 |
+
"category": "component_basic",
|
| 326 |
+
"description": "Package URL identifier",
|
| 327 |
+
"jsonpath": "$.components[0].purl",
|
| 328 |
+
"aibom_generation": {
|
| 329 |
+
"location": "$.components[0].purl",
|
| 330 |
+
"rule": "include_if_available",
|
| 331 |
+
"source_fields": ["purl", "package_url"],
|
| 332 |
+
"validation": "recommended",
|
| 333 |
+
"data_type": "string"
|
| 334 |
+
},
|
| 335 |
+
"scoring": {
|
| 336 |
+
"points": 3.0,
|
| 337 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 338 |
+
"category_contribution": 0.15
|
| 339 |
+
},
|
| 340 |
+
"validation_message": {
|
| 341 |
+
"missing": "Missing important field: purl - package URL for identification",
|
| 342 |
+
"recommendation": "Add a Package URL (PURL) for the model"
|
| 343 |
+
}
|
| 344 |
+
},
|
| 345 |
+
"description": {
|
| 346 |
+
"tier": "important",
|
| 347 |
+
"weight": 3.0,
|
| 348 |
+
"category": "component_basic",
|
| 349 |
+
"description": "Description of the AI model",
|
| 350 |
+
"jsonpath": "$.components[0].description",
|
| 351 |
+
"aibom_generation": {
|
| 352 |
+
"location": "$.components[0].description",
|
| 353 |
+
"rule": "include_if_available",
|
| 354 |
+
"source_fields": ["description", "summary"],
|
| 355 |
+
"validation": "recommended",
|
| 356 |
+
"data_type": "string"
|
| 357 |
+
},
|
| 358 |
+
"scoring": {
|
| 359 |
+
"points": 3.0,
|
| 360 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 361 |
+
"category_contribution": 0.15
|
| 362 |
+
},
|
| 363 |
+
"validation_message": {
|
| 364 |
+
"missing": "Missing important field: description - model description helpful for understanding",
|
| 365 |
+
"recommendation": "Add a clear description of what the model does"
|
| 366 |
+
}
|
| 367 |
+
},
|
| 368 |
+
"licenses": {
|
| 369 |
+
"tier": "important",
|
| 370 |
+
"weight": 3.0,
|
| 371 |
+
"category": "component_basic",
|
| 372 |
+
"description": "License information for the model",
|
| 373 |
+
"jsonpath": "$.components[0].licenses",
|
| 374 |
+
"aibom_generation": {
|
| 375 |
+
"location": "$.components[0].licenses",
|
| 376 |
+
"rule": "include_if_available",
|
| 377 |
+
"source_fields": ["licenses", "license"],
|
| 378 |
+
"validation": "recommended",
|
| 379 |
+
"data_type": "array"
|
| 380 |
+
},
|
| 381 |
+
"scoring": {
|
| 382 |
+
"points": 3.0,
|
| 383 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 384 |
+
"category_contribution": 0.15
|
| 385 |
+
},
|
| 386 |
+
"validation_message": {
|
| 387 |
+
"missing": "Missing important field: licenses - license information important for compliance",
|
| 388 |
+
"recommendation": "Add license information for the model"
|
| 389 |
+
}
|
| 390 |
+
},
|
| 391 |
+
"energyConsumption": {
|
| 392 |
+
"tier": "important",
|
| 393 |
+
"weight": 2.0,
|
| 394 |
+
"category": "component_model_card",
|
| 395 |
+
"description": "Energy consumption information",
|
| 396 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyConsumption')].value",
|
| 397 |
+
"aibom_generation": {
|
| 398 |
+
"location": "$.metadata.properties",
|
| 399 |
+
"rule": "include_if_available",
|
| 400 |
+
"source_fields": ["energyConsumption", "energy_usage"],
|
| 401 |
+
"validation": "optional",
|
| 402 |
+
"data_type": "string"
|
| 403 |
+
},
|
| 404 |
+
"scoring": {
|
| 405 |
+
"points": 2.0,
|
| 406 |
+
"required_for_profiles": ["advanced"],
|
| 407 |
+
"category_contribution": 0.067
|
| 408 |
+
},
|
| 409 |
+
"validation_message": {
|
| 410 |
+
"missing": "Missing important field: energyConsumption - energy usage information helpful for sustainability",
|
| 411 |
+
"recommendation": "Add information about the model's energy consumption"
|
| 412 |
+
}
|
| 413 |
+
},
|
| 414 |
+
"hyperparameter": {
|
| 415 |
+
"tier": "important",
|
| 416 |
+
"weight": 2.0,
|
| 417 |
+
"category": "component_model_card",
|
| 418 |
+
"description": "Key hyperparameters used in training",
|
| 419 |
+
"jsonpath": "$.metadata.properties[?(@.name=='hyperparameter')].value",
|
| 420 |
+
"aibom_generation": {
|
| 421 |
+
"location": "$.metadata.properties",
|
| 422 |
+
"rule": "include_if_available",
|
| 423 |
+
"source_fields": ["hyperparameter", "hyperparameters", "training_params"],
|
| 424 |
+
"validation": "optional",
|
| 425 |
+
"data_type": "string"
|
| 426 |
+
},
|
| 427 |
+
"scoring": {
|
| 428 |
+
"points": 2.0,
|
| 429 |
+
"required_for_profiles": ["advanced"],
|
| 430 |
+
"category_contribution": 0.067
|
| 431 |
+
},
|
| 432 |
+
"validation_message": {
|
| 433 |
+
"missing": "Missing important field: hyperparameter - training configuration helpful for reproducibility",
|
| 434 |
+
"recommendation": "Add key hyperparameters used during model training"
|
| 435 |
+
}
|
| 436 |
+
},
|
| 437 |
+
"limitation": {
|
| 438 |
+
"tier": "important",
|
| 439 |
+
"weight": 2.0,
|
| 440 |
+
"category": "component_model_card",
|
| 441 |
+
"description": "Known limitations of the model",
|
| 442 |
+
"jsonpath": "$.metadata.properties[?(@.name=='limitation')].value",
|
| 443 |
+
"aibom_generation": {
|
| 444 |
+
"location": "$.metadata.properties",
|
| 445 |
+
"rule": "include_if_available",
|
| 446 |
+
"source_fields": ["limitation", "limitations", "known_issues"],
|
| 447 |
+
"validation": "optional",
|
| 448 |
+
"data_type": "string"
|
| 449 |
+
},
|
| 450 |
+
"scoring": {
|
| 451 |
+
"points": 2.0,
|
| 452 |
+
"required_for_profiles": ["advanced"],
|
| 453 |
+
"category_contribution": 0.067
|
| 454 |
+
},
|
| 455 |
+
"validation_message": {
|
| 456 |
+
"missing": "Missing important field: limitation - known limitations important for responsible use",
|
| 457 |
+
"recommendation": "Add information about known limitations or constraints"
|
| 458 |
+
}
|
| 459 |
+
},
|
| 460 |
+
"safetyRiskAssessment": {
|
| 461 |
+
"tier": "important",
|
| 462 |
+
"weight": 2.0,
|
| 463 |
+
"category": "component_model_card",
|
| 464 |
+
"description": "Safety and risk assessment information",
|
| 465 |
+
"jsonpath": "$.metadata.properties[?(@.name=='safetyRiskAssessment')].value",
|
| 466 |
+
"aibom_generation": {
|
| 467 |
+
"location": "$.metadata.properties",
|
| 468 |
+
"rule": "include_if_available",
|
| 469 |
+
"source_fields": ["safetyRiskAssessment", "safety_assessment", "risk_analysis"],
|
| 470 |
+
"validation": "optional",
|
| 471 |
+
"data_type": "string"
|
| 472 |
+
},
|
| 473 |
+
"scoring": {
|
| 474 |
+
"points": 2.0,
|
| 475 |
+
"required_for_profiles": ["advanced"],
|
| 476 |
+
"category_contribution": 0.067
|
| 477 |
+
},
|
| 478 |
+
"validation_message": {
|
| 479 |
+
"missing": "Missing important field: safetyRiskAssessment - safety assessment important for responsible deployment",
|
| 480 |
+
"recommendation": "Add safety and risk assessment information"
|
| 481 |
+
}
|
| 482 |
+
},
|
| 483 |
+
"typeOfModel": {
|
| 484 |
+
"tier": "important",
|
| 485 |
+
"weight": 2.0,
|
| 486 |
+
"category": "component_model_card",
|
| 487 |
+
"description": "Type or architecture of the model",
|
| 488 |
+
"jsonpath": "$.metadata.properties[?(@.name=='typeOfModel')].value",
|
| 489 |
+
"aibom_generation": {
|
| 490 |
+
"location": "$.metadata.properties",
|
| 491 |
+
"rule": "include_if_available",
|
| 492 |
+
"source_fields": ["typeOfModel", "model_type", "architecture"],
|
| 493 |
+
"validation": "recommended",
|
| 494 |
+
"data_type": "string"
|
| 495 |
+
},
|
| 496 |
+
"scoring": {
|
| 497 |
+
"points": 2.0,
|
| 498 |
+
"required_for_profiles": ["advanced"],
|
| 499 |
+
"category_contribution": 0.067
|
| 500 |
+
},
|
| 501 |
+
"validation_message": {
|
| 502 |
+
"missing": "Missing important field: typeOfModel - model architecture information helpful",
|
| 503 |
+
"recommendation": "Add the type or architecture of the model (e.g., Transformer, CNN)"
|
| 504 |
+
}
|
| 505 |
+
},
|
| 506 |
+
"modelExplainability": {
|
| 507 |
+
"tier": "supplementary",
|
| 508 |
+
"weight": 1.0,
|
| 509 |
+
"category": "component_model_card",
|
| 510 |
+
"description": "Information about model explainability",
|
| 511 |
+
"jsonpath": "$.metadata.properties[?(@.name=='modelExplainability')].value",
|
| 512 |
+
"aibom_generation": {
|
| 513 |
+
"location": "$.metadata.properties",
|
| 514 |
+
"rule": "include_if_available",
|
| 515 |
+
"source_fields": ["modelExplainability", "explainability", "interpretability"],
|
| 516 |
+
"validation": "optional",
|
| 517 |
+
"data_type": "string"
|
| 518 |
+
},
|
| 519 |
+
"scoring": {
|
| 520 |
+
"points": 1.0,
|
| 521 |
+
"required_for_profiles": ["advanced"],
|
| 522 |
+
"category_contribution": 0.033
|
| 523 |
+
},
|
| 524 |
+
"validation_message": {
|
| 525 |
+
"missing": "Missing supplementary field: modelExplainability - explainability information helpful for transparency",
|
| 526 |
+
"recommendation": "Add information about model explainability or interpretability features"
|
| 527 |
+
}
|
| 528 |
+
},
|
| 529 |
+
"energyQuantity": {
|
| 530 |
+
"tier": "supplementary",
|
| 531 |
+
"weight": 1.0,
|
| 532 |
+
"category": "component_model_card",
|
| 533 |
+
"description": "Quantitative energy consumption data",
|
| 534 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyQuantity')].value",
|
| 535 |
+
"aibom_generation": {
|
| 536 |
+
"location": "$.metadata.properties",
|
| 537 |
+
"rule": "include_if_available",
|
| 538 |
+
"source_fields": ["energyQuantity", "energy_amount"],
|
| 539 |
+
"validation": "optional",
|
| 540 |
+
"data_type": "number"
|
| 541 |
+
},
|
| 542 |
+
"scoring": {
|
| 543 |
+
"points": 1.0,
|
| 544 |
+
"required_for_profiles": ["advanced"],
|
| 545 |
+
"category_contribution": 0.033
|
| 546 |
+
},
|
| 547 |
+
"validation_message": {
|
| 548 |
+
"missing": "Missing supplementary field: energyQuantity - quantitative energy data helpful for sustainability metrics",
|
| 549 |
+
"recommendation": "Add specific energy consumption quantities"
|
| 550 |
+
}
|
| 551 |
+
},
|
| 552 |
+
"energyUnit": {
|
| 553 |
+
"tier": "supplementary",
|
| 554 |
+
"weight": 1.0,
|
| 555 |
+
"category": "component_model_card",
|
| 556 |
+
"description": "Unit of measurement for energy consumption",
|
| 557 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyUnit')].value",
|
| 558 |
+
"aibom_generation": {
|
| 559 |
+
"location": "$.metadata.properties",
|
| 560 |
+
"rule": "include_if_available",
|
| 561 |
+
"source_fields": ["energyUnit", "energy_unit"],
|
| 562 |
+
"validation": "optional",
|
| 563 |
+
"data_type": "string"
|
| 564 |
+
},
|
| 565 |
+
"scoring": {
|
| 566 |
+
"points": 1.0,
|
| 567 |
+
"required_for_profiles": ["advanced"],
|
| 568 |
+
"category_contribution": 0.033
|
| 569 |
+
},
|
| 570 |
+
"validation_message": {
|
| 571 |
+
"missing": "Missing supplementary field: energyUnit - energy measurement unit helpful for standardization",
|
| 572 |
+
"recommendation": "Add the unit of measurement for energy consumption (e.g., kWh, Joules)"
|
| 573 |
+
}
|
| 574 |
+
},
|
| 575 |
+
"informationAboutTraining": {
|
| 576 |
+
"tier": "supplementary",
|
| 577 |
+
"weight": 1.0,
|
| 578 |
+
"category": "component_model_card",
|
| 579 |
+
"description": "Information about the training process",
|
| 580 |
+
"jsonpath": "$.metadata.properties[?(@.name=='informationAboutTraining')].value",
|
| 581 |
+
"aibom_generation": {
|
| 582 |
+
"location": "$.metadata.properties",
|
| 583 |
+
"rule": "include_if_available",
|
| 584 |
+
"source_fields": ["informationAboutTraining", "training_info", "training_details"],
|
| 585 |
+
"validation": "optional",
|
| 586 |
+
"data_type": "string"
|
| 587 |
+
},
|
| 588 |
+
"scoring": {
|
| 589 |
+
"points": 1.0,
|
| 590 |
+
"required_for_profiles": ["advanced"],
|
| 591 |
+
"category_contribution": 0.033
|
| 592 |
+
},
|
| 593 |
+
"validation_message": {
|
| 594 |
+
"missing": "Missing supplementary field: informationAboutTraining - training details helpful for understanding model development",
|
| 595 |
+
"recommendation": "Add information about the training process and methodology"
|
| 596 |
+
}
|
| 597 |
+
},
|
| 598 |
+
"informationAboutApplication": {
|
| 599 |
+
"tier": "supplementary",
|
| 600 |
+
"weight": 1.0,
|
| 601 |
+
"category": "component_model_card",
|
| 602 |
+
"description": "Information about intended applications",
|
| 603 |
+
"jsonpath": "$.metadata.properties[?(@.name=='informationAboutApplication')].value",
|
| 604 |
+
"aibom_generation": {
|
| 605 |
+
"location": "$.metadata.properties",
|
| 606 |
+
"rule": "include_if_available",
|
| 607 |
+
"source_fields": ["informationAboutApplication", "application_info", "intended_use"],
|
| 608 |
+
"validation": "optional",
|
| 609 |
+
"data_type": "string"
|
| 610 |
+
},
|
| 611 |
+
"scoring": {
|
| 612 |
+
"points": 1.0,
|
| 613 |
+
"required_for_profiles": ["advanced"],
|
| 614 |
+
"category_contribution": 0.033
|
| 615 |
+
},
|
| 616 |
+
"validation_message": {
|
| 617 |
+
"missing": "Missing supplementary field: informationAboutApplication - application guidance helpful for proper usage",
|
| 618 |
+
"recommendation": "Add information about intended applications and use cases"
|
| 619 |
+
}
|
| 620 |
+
},
|
| 621 |
+
"metric": {
|
| 622 |
+
"tier": "supplementary",
|
| 623 |
+
"weight": 1.0,
|
| 624 |
+
"category": "component_model_card",
|
| 625 |
+
"description": "Performance metrics and evaluation results",
|
| 626 |
+
"jsonpath": "$.metadata.properties[?(@.name=='metric')].value",
|
| 627 |
+
"aibom_generation": {
|
| 628 |
+
"location": "$.metadata.properties",
|
| 629 |
+
"rule": "include_if_available",
|
| 630 |
+
"source_fields": ["metric", "metrics", "performance"],
|
| 631 |
+
"validation": "optional",
|
| 632 |
+
"data_type": "string"
|
| 633 |
+
},
|
| 634 |
+
"scoring": {
|
| 635 |
+
"points": 1.0,
|
| 636 |
+
"required_for_profiles": ["advanced"],
|
| 637 |
+
"category_contribution": 0.033
|
| 638 |
+
},
|
| 639 |
+
"validation_message": {
|
| 640 |
+
"missing": "Missing supplementary field: metric - performance metrics helpful for evaluation",
|
| 641 |
+
"recommendation": "Add performance metrics and evaluation results"
|
| 642 |
+
}
|
| 643 |
+
},
|
| 644 |
+
"metricDecisionThreshold": {
|
| 645 |
+
"tier": "supplementary",
|
| 646 |
+
"weight": 1.0,
|
| 647 |
+
"category": "component_model_card",
|
| 648 |
+
"description": "Decision thresholds for metrics",
|
| 649 |
+
"jsonpath": "$.metadata.properties[?(@.name=='metricDecisionThreshold')].value",
|
| 650 |
+
"aibom_generation": {
|
| 651 |
+
"location": "$.metadata.properties",
|
| 652 |
+
"rule": "include_if_available",
|
| 653 |
+
"source_fields": ["metricDecisionThreshold", "decision_threshold", "threshold"],
|
| 654 |
+
"validation": "optional",
|
| 655 |
+
"data_type": "number"
|
| 656 |
+
},
|
| 657 |
+
"scoring": {
|
| 658 |
+
"points": 1.0,
|
| 659 |
+
"required_for_profiles": ["advanced"],
|
| 660 |
+
"category_contribution": 0.033
|
| 661 |
+
},
|
| 662 |
+
"validation_message": {
|
| 663 |
+
"missing": "Missing supplementary field: metricDecisionThreshold - decision thresholds helpful for operational guidance",
|
| 664 |
+
"recommendation": "Add decision thresholds for performance metrics"
|
| 665 |
+
}
|
| 666 |
+
},
|
| 667 |
+
"modelDataPreprocessing": {
|
| 668 |
+
"tier": "supplementary",
|
| 669 |
+
"weight": 1.0,
|
| 670 |
+
"category": "component_model_card",
|
| 671 |
+
"description": "Data preprocessing information",
|
| 672 |
+
"jsonpath": "$.metadata.properties[?(@.name=='modelDataPreprocessing')].value",
|
| 673 |
+
"aibom_generation": {
|
| 674 |
+
"location": "$.metadata.properties",
|
| 675 |
+
"rule": "include_if_available",
|
| 676 |
+
"source_fields": ["modelDataPreprocessing", "data_preprocessing", "preprocessing"],
|
| 677 |
+
"validation": "optional",
|
| 678 |
+
"data_type": "string"
|
| 679 |
+
},
|
| 680 |
+
"scoring": {
|
| 681 |
+
"points": 1.0,
|
| 682 |
+
"required_for_profiles": ["advanced"],
|
| 683 |
+
"category_contribution": 0.033
|
| 684 |
+
},
|
| 685 |
+
"validation_message": {
|
| 686 |
+
"missing": "Missing supplementary field: modelDataPreprocessing - preprocessing details helpful for reproducibility",
|
| 687 |
+
"recommendation": "Add information about data preprocessing steps"
|
| 688 |
+
}
|
| 689 |
+
},
|
| 690 |
+
"useSensitivePersonalInformation": {
|
| 691 |
+
"tier": "supplementary",
|
| 692 |
+
"weight": 1.0,
|
| 693 |
+
"category": "component_model_card",
|
| 694 |
+
"description": "Information about use of sensitive personal data",
|
| 695 |
+
"jsonpath": "$.metadata.properties[?(@.name=='useSensitivePersonalInformation')].value",
|
| 696 |
+
"aibom_generation": {
|
| 697 |
+
"location": "$.metadata.properties",
|
| 698 |
+
"rule": "include_if_available",
|
| 699 |
+
"source_fields": ["useSensitivePersonalInformation", "sensitive_data", "personal_data"],
|
| 700 |
+
"validation": "optional",
|
| 701 |
+
"data_type": "boolean"
|
| 702 |
+
},
|
| 703 |
+
"scoring": {
|
| 704 |
+
"points": 1.0,
|
| 705 |
+
"required_for_profiles": ["advanced"],
|
| 706 |
+
"category_contribution": 0.033
|
| 707 |
+
},
|
| 708 |
+
"validation_message": {
|
| 709 |
+
"missing": "Missing supplementary field: useSensitivePersonalInformation - privacy information important for compliance",
|
| 710 |
+
"recommendation": "Add information about use of sensitive or personal data"
|
| 711 |
+
}
|
| 712 |
+
},
|
| 713 |
+
"downloadLocation": {
|
| 714 |
+
"tier": "critical",
|
| 715 |
+
"weight": 4.0,
|
| 716 |
+
"category": "external_references",
|
| 717 |
+
"description": "Location where the model can be downloaded",
|
| 718 |
+
"jsonpath": "$.externalReferences[0].url",
|
| 719 |
+
"aibom_generation": {
|
| 720 |
+
"location": "$.externalReferences",
|
| 721 |
+
"rule": "include_if_available",
|
| 722 |
+
"source_fields": ["downloadLocation", "download_url", "repository_url"],
|
| 723 |
+
"validation": "recommended",
|
| 724 |
+
"data_type": "string"
|
| 725 |
+
},
|
| 726 |
+
"scoring": {
|
| 727 |
+
"points": 4.0,
|
| 728 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 729 |
+
"category_contribution": 1.0
|
| 730 |
+
},
|
| 731 |
+
"validation_message": {
|
| 732 |
+
"missing": "Missing critical field: downloadLocation - download location essential for model access",
|
| 733 |
+
"recommendation": "Add the URL where the model can be downloaded or accessed"
|
| 734 |
+
}
|
| 735 |
+
}
|
| 736 |
+
}
|
| 737 |
+
}
|
src/aibom-generator/field_registry_manager.py
ADDED
|
@@ -0,0 +1,648 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Field Registry Manager for AI SBOM Generator
|
| 3 |
+
Combines registry loading, configuration generation, and field detection functionality
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 10 |
+
from functools import lru_cache
|
| 11 |
+
|
| 12 |
+
class FieldRegistryManager:
|
| 13 |
+
"""
|
| 14 |
+
Field registry manager that handles:
|
| 15 |
+
1. Registry loading and validation
|
| 16 |
+
2. Configuration generation for utils.py compatibility
|
| 17 |
+
3. Field detection and JSONPath parsing
|
| 18 |
+
4. AIBOM completeness analysis
|
| 19 |
+
5. Scoring calculations
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
def __init__(self, registry_path: Optional[str] = None):
|
| 23 |
+
"""
|
| 24 |
+
Initialize the field registry manager
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
registry_path: Path to the field registry JSON file. If None, auto-detects.
|
| 28 |
+
"""
|
| 29 |
+
if registry_path is None:
|
| 30 |
+
# Auto-detect registry path relative to this file
|
| 31 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 32 |
+
registry_path = os.path.join(current_dir, "field_registry.json")
|
| 33 |
+
|
| 34 |
+
self.registry_path = registry_path
|
| 35 |
+
self.registry = self._load_registry()
|
| 36 |
+
|
| 37 |
+
# Cache for performance
|
| 38 |
+
self._field_classification = None
|
| 39 |
+
self._completeness_profiles = None
|
| 40 |
+
self._validation_messages = None
|
| 41 |
+
self._scoring_weights = None
|
| 42 |
+
|
| 43 |
+
def _load_registry(self) -> Dict[str, Any]:
|
| 44 |
+
"""Load the complete field registry from JSON file"""
|
| 45 |
+
try:
|
| 46 |
+
with open(self.registry_path, 'r', encoding='utf-8') as f:
|
| 47 |
+
registry = json.load(f)
|
| 48 |
+
|
| 49 |
+
# Validate basic structure
|
| 50 |
+
required_sections = ["fields"]
|
| 51 |
+
missing_sections = [section for section in required_sections if section not in registry]
|
| 52 |
+
|
| 53 |
+
if missing_sections:
|
| 54 |
+
raise ValueError(f"Registry missing required sections: {missing_sections}")
|
| 55 |
+
|
| 56 |
+
# Validate fields structure
|
| 57 |
+
fields = registry.get('fields', {})
|
| 58 |
+
if not fields:
|
| 59 |
+
raise ValueError("Registry 'fields' section is empty")
|
| 60 |
+
|
| 61 |
+
print(f"β
Field registry loaded: {len(fields)} fields from {self.registry_path}")
|
| 62 |
+
return registry
|
| 63 |
+
|
| 64 |
+
except FileNotFoundError:
|
| 65 |
+
raise FileNotFoundError(f"Field registry not found at: {self.registry_path}")
|
| 66 |
+
except json.JSONDecodeError as e:
|
| 67 |
+
raise ValueError(f"Invalid JSON in field registry: {e}")
|
| 68 |
+
except Exception as e:
|
| 69 |
+
raise Exception(f"Failed to load field registry: {e}")
|
| 70 |
+
|
| 71 |
+
# =============================================================================
|
| 72 |
+
# CONFIGURATION GENERATION
|
| 73 |
+
# =============================================================================
|
| 74 |
+
|
| 75 |
+
@lru_cache(maxsize=1)
|
| 76 |
+
def get_scoring_config(self) -> Dict[str, Any]:
|
| 77 |
+
"""Get scoring configuration from registry"""
|
| 78 |
+
return self.registry.get('scoring_config', {})
|
| 79 |
+
|
| 80 |
+
@lru_cache(maxsize=1)
|
| 81 |
+
def get_aibom_config(self) -> Dict[str, Any]:
|
| 82 |
+
"""Get AIBOM generation configuration from registry"""
|
| 83 |
+
return self.registry.get('aibom_config', {})
|
| 84 |
+
|
| 85 |
+
@lru_cache(maxsize=1)
|
| 86 |
+
def get_field_definitions(self) -> Dict[str, Any]:
|
| 87 |
+
"""Get all field definitions from registry"""
|
| 88 |
+
return self.registry.get('fields', {})
|
| 89 |
+
|
| 90 |
+
def generate_field_classification(self) -> Dict[str, Any]:
|
| 91 |
+
"""
|
| 92 |
+
Generate FIELD_CLASSIFICATION dictionary from registry
|
| 93 |
+
"""
|
| 94 |
+
if self._field_classification is not None:
|
| 95 |
+
return self._field_classification
|
| 96 |
+
|
| 97 |
+
fields = self.get_field_definitions()
|
| 98 |
+
classification = {}
|
| 99 |
+
|
| 100 |
+
for field_name, field_config in fields.items():
|
| 101 |
+
classification[field_name] = {
|
| 102 |
+
"tier": field_config.get("tier", "supplementary"),
|
| 103 |
+
"weight": field_config.get("weight", 1),
|
| 104 |
+
"category": field_config.get("category", "unknown")
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
self._field_classification = classification
|
| 108 |
+
return classification
|
| 109 |
+
|
| 110 |
+
def generate_completeness_profiles(self) -> Dict[str, Any]:
|
| 111 |
+
"""
|
| 112 |
+
Generate COMPLETENESS_PROFILES dictionary from registry
|
| 113 |
+
"""
|
| 114 |
+
if self._completeness_profiles is not None:
|
| 115 |
+
return self._completeness_profiles
|
| 116 |
+
|
| 117 |
+
scoring_config = self.get_scoring_config()
|
| 118 |
+
profiles = scoring_config.get("scoring_profiles", {})
|
| 119 |
+
|
| 120 |
+
# Convert to utils.py format
|
| 121 |
+
completeness_profiles = {}
|
| 122 |
+
for profile_name, profile_config in profiles.items():
|
| 123 |
+
completeness_profiles[profile_name] = {
|
| 124 |
+
"description": profile_config.get("description", f"{profile_name.title()} completeness profile"),
|
| 125 |
+
"required_fields": profile_config.get("required_fields", []),
|
| 126 |
+
"minimum_score": profile_config.get("minimum_score", 50)
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
# Fallback profiles if none defined in registry
|
| 130 |
+
if not completeness_profiles:
|
| 131 |
+
completeness_profiles = {
|
| 132 |
+
"basic": {
|
| 133 |
+
"description": "Minimal fields required for identification",
|
| 134 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
|
| 135 |
+
"minimum_score": 40
|
| 136 |
+
},
|
| 137 |
+
"standard": {
|
| 138 |
+
"description": "Comprehensive fields for proper documentation",
|
| 139 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 140 |
+
"downloadLocation", "primaryPurpose", "suppliedBy"],
|
| 141 |
+
"minimum_score": 70
|
| 142 |
+
},
|
| 143 |
+
"advanced": {
|
| 144 |
+
"description": "Extensive documentation for maximum transparency",
|
| 145 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 146 |
+
"downloadLocation", "primaryPurpose", "suppliedBy",
|
| 147 |
+
"type", "purl", "description", "licenses", "hyperparameter", "limitation",
|
| 148 |
+
"energyConsumption", "safetyRiskAssessment", "typeOfModel"],
|
| 149 |
+
"minimum_score": 85
|
| 150 |
+
}
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
self._completeness_profiles = completeness_profiles
|
| 154 |
+
return completeness_profiles
|
| 155 |
+
|
| 156 |
+
def generate_validation_messages(self) -> Dict[str, Any]:
|
| 157 |
+
"""
|
| 158 |
+
Generate VALIDATION_MESSAGES dictionary from registry
|
| 159 |
+
"""
|
| 160 |
+
if self._validation_messages is not None:
|
| 161 |
+
return self._validation_messages
|
| 162 |
+
|
| 163 |
+
fields = self.get_field_definitions()
|
| 164 |
+
validation_messages = {}
|
| 165 |
+
|
| 166 |
+
for field_name, field_config in fields.items():
|
| 167 |
+
validation_msg = field_config.get("validation_message", {})
|
| 168 |
+
if validation_msg:
|
| 169 |
+
validation_messages[field_name] = {
|
| 170 |
+
"missing": validation_msg.get("missing", f"Missing field: {field_name}"),
|
| 171 |
+
"recommendation": validation_msg.get("recommendation", f"Consider adding {field_name} field")
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
self._validation_messages = validation_messages
|
| 175 |
+
return validation_messages
|
| 176 |
+
|
| 177 |
+
def get_configurable_scoring_weights(self) -> Dict[str, Any]:
|
| 178 |
+
"""Get configurable scoring weights from registry"""
|
| 179 |
+
if self._scoring_weights is not None:
|
| 180 |
+
return self._scoring_weights
|
| 181 |
+
|
| 182 |
+
scoring_config = self.get_scoring_config()
|
| 183 |
+
|
| 184 |
+
weights = {
|
| 185 |
+
"tier_weights": scoring_config.get("tier_weights", {
|
| 186 |
+
"critical": 3,
|
| 187 |
+
"important": 2,
|
| 188 |
+
"supplementary": 1
|
| 189 |
+
}),
|
| 190 |
+
"category_weights": scoring_config.get("category_weights", {
|
| 191 |
+
"required_fields": 20,
|
| 192 |
+
"metadata": 20,
|
| 193 |
+
"component_basic": 20,
|
| 194 |
+
"component_model_card": 30,
|
| 195 |
+
"external_references": 10
|
| 196 |
+
}),
|
| 197 |
+
"algorithm_config": scoring_config.get("algorithm_config", {
|
| 198 |
+
"type": "weighted_sum",
|
| 199 |
+
"max_score": 100,
|
| 200 |
+
"normalization": "category_based"
|
| 201 |
+
})
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
self._scoring_weights = weights
|
| 205 |
+
return weights
|
| 206 |
+
|
| 207 |
+
# =============================================================================
|
| 208 |
+
# FIELD DETECTION
|
| 209 |
+
# =============================================================================
|
| 210 |
+
|
| 211 |
+
def _get_nested_value(self, data: dict, path: str) -> Tuple[bool, Any]:
|
| 212 |
+
"""
|
| 213 |
+
Get value from nested dictionary using dot notation and array filters
|
| 214 |
+
Supports paths like: $.components[0].name, $.metadata.properties[?(@.name=='primaryPurpose')].value
|
| 215 |
+
"""
|
| 216 |
+
try:
|
| 217 |
+
# Remove leading $. if present
|
| 218 |
+
if path.startswith('$.'):
|
| 219 |
+
path = path[2:]
|
| 220 |
+
|
| 221 |
+
# Handle special JSONPath-like syntax for property arrays
|
| 222 |
+
if '[?(@.name==' in path:
|
| 223 |
+
return self._handle_property_array_path(data, path)
|
| 224 |
+
|
| 225 |
+
# Split path and traverse
|
| 226 |
+
parts = self._split_path(path)
|
| 227 |
+
current = data
|
| 228 |
+
|
| 229 |
+
for part in parts:
|
| 230 |
+
if '[' in part and ']' in part:
|
| 231 |
+
# Handle array access like components[0]
|
| 232 |
+
key, index_str = part.split('[')
|
| 233 |
+
index = int(index_str.rstrip(']'))
|
| 234 |
+
|
| 235 |
+
if key and key in current:
|
| 236 |
+
current = current[key]
|
| 237 |
+
|
| 238 |
+
if isinstance(current, list) and 0 <= index < len(current):
|
| 239 |
+
current = current[index]
|
| 240 |
+
else:
|
| 241 |
+
return False, None
|
| 242 |
+
else:
|
| 243 |
+
# Regular key access
|
| 244 |
+
if isinstance(current, dict) and part in current:
|
| 245 |
+
current = current[part]
|
| 246 |
+
else:
|
| 247 |
+
return False, None
|
| 248 |
+
|
| 249 |
+
# Check if value is meaningful
|
| 250 |
+
if current is not None and current != "" and current != []:
|
| 251 |
+
return True, current
|
| 252 |
+
|
| 253 |
+
return False, None
|
| 254 |
+
|
| 255 |
+
except Exception as e:
|
| 256 |
+
print(f"Error getting value at path {path}: {e}")
|
| 257 |
+
return False, None
|
| 258 |
+
|
| 259 |
+
def _handle_property_array_path(self, data: dict, path: str) -> Tuple[bool, Any]:
    """Resolve a property-array filter path.

    Handles the JSONPath-like form
    ``<base>.properties[?(@.name=='<prop>')].<key>``: looks up the
    ``properties`` list under ``<base>``, finds the entry whose ``name``
    equals ``<prop>``, and returns its ``<key>`` value when meaningful
    (not None/""/[]). Returns (False, None) in every other case.
    """
    try:
        parsed = re.match(r'(.+)\.properties\[\?\(@\.name==\'(.+)\'\)\]\.(.+)', path)
        if parsed is None:
            return False, None

        base_path = parsed.group(1)
        wanted_name = parsed.group(2)
        value_key = parsed.group(3)

        # Resolve the properties list itself before filtering it.
        found, props = self._get_nested_value(data, base_path + '.properties')
        if not (found and isinstance(props, list)):
            return False, None

        # Scan for a matching entry carrying a meaningful value.
        for entry in props:
            if not isinstance(entry, dict) or entry.get('name') != wanted_name:
                continue
            if value_key in entry:
                candidate = entry[value_key]
                if candidate is not None and candidate != "" and candidate != []:
                    return True, candidate

        return False, None

    except Exception as e:
        print(f"Error handling property array path {path}: {e}")
        return False, None
+
|
| 291 |
+
def _split_path(self, path: str) -> List[str]:
|
| 292 |
+
"""Split path into parts, handling array notation"""
|
| 293 |
+
parts = []
|
| 294 |
+
current_part = ""
|
| 295 |
+
in_brackets = False
|
| 296 |
+
|
| 297 |
+
for char in path:
|
| 298 |
+
if char == '[':
|
| 299 |
+
in_brackets = True
|
| 300 |
+
current_part += char
|
| 301 |
+
elif char == ']':
|
| 302 |
+
in_brackets = False
|
| 303 |
+
current_part += char
|
| 304 |
+
elif char == '.' and not in_brackets:
|
| 305 |
+
if current_part:
|
| 306 |
+
parts.append(current_part)
|
| 307 |
+
current_part = ""
|
| 308 |
+
else:
|
| 309 |
+
current_part += char
|
| 310 |
+
|
| 311 |
+
if current_part:
|
| 312 |
+
parts.append(current_part)
|
| 313 |
+
|
| 314 |
+
return parts
|
| 315 |
+
|
| 316 |
+
def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
    """Report whether *field_path* resolves inside *aibom*.

    Thin wrapper around the nested-value resolver; returns the same
    ``(field_exists, field_value)`` pair.
    """
    exists, value = self._get_nested_value(aibom, field_path)
    return exists, value
+
|
| 323 |
+
def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
    """
    Analyze AIBOM completeness against the enhanced field registry
    Compatible with enhanced registry structure: registry['fields'][field_name]

    Returns:
        dict with 'category_scores', 'total_score', per-category
        'field_details', and a per-category 'summary'. Prints a verbose
        progress report as a side effect.
    """
    results = {
        'category_scores': {},
        'total_score': 0,
        'field_details': {},
        'summary': {}
    }

    # Get fields from enhanced registry structure
    fields = self.get_field_definitions()
    if not fields:
        print("β No fields found in registry")
        return results

    # Get scoring configuration
    scoring_weights = self.get_configurable_scoring_weights()
    category_weights = scoring_weights.get('category_weights', {})

    # Group fields by category
    categories = {}
    for field_name, field_config in fields.items():
        category = field_config.get('category', 'unknown')
        if category not in categories:
            categories[category] = []
        categories[category].append((field_name, field_config))

    print(f"π Analyzing {len(fields)} fields across {len(categories)} categories")

    total_weighted_score = 0

    for category_name, category_fields in categories.items():
        # Categories absent from the configured weights default to 20 points.
        category_weight = category_weights.get(category_name, 20)

        present_fields = 0
        total_fields = len(category_fields)
        field_details = {}

        print(f"\nπ Category: {category_name} (weight: {category_weight})")

        for field_name, field_config in category_fields:
            field_path = field_config.get('jsonpath', '')
            tier = field_config.get('tier', 'supplementary')
            weight = field_config.get('weight', 1)

            # A field without a jsonpath can never be detected; record it
            # as absent with an explanatory error instead of skipping it.
            if not field_path:
                print(f"β οΈ Field {field_name} has no jsonpath defined")
                field_details[field_name] = {
                    'present': False,
                    'value': None,
                    'path': field_path,
                    'tier': tier,
                    'weight': weight,
                    'error': 'No jsonpath defined'
                }
                continue

            is_present, value = self.detect_field_presence(aibom, field_path)

            field_details[field_name] = {
                'present': is_present,
                'value': value,
                'path': field_path,
                'tier': tier,
                'weight': weight
            }

            if is_present:
                present_fields += 1
                print(f"β FOUND: {field_name} = {value} (tier: {tier}, weight: {weight})")
            else:
                print(f"β MISSING: {field_name} at {field_path} (tier: {tier})")

        # Calculate category score: fraction of present fields scaled by the
        # category weight. NOTE(review): the per-field `weight` is recorded
        # in field_details but not used in this aggregate — confirm intended.
        category_percentage = (present_fields / total_fields) * 100 if total_fields > 0 else 0
        category_score = (category_percentage / 100) * category_weight

        results['category_scores'][category_name] = category_score
        results['field_details'][category_name] = field_details
        results['summary'][category_name] = {
            'present': present_fields,
            'total': total_fields,
            'percentage': category_percentage,
            'weight': category_weight
        }

        total_weighted_score += category_score

        print(f"π {category_name}: {present_fields}/{total_fields} ({category_percentage:.1f}%) Γ {category_weight} = {category_score:.1f} pts")

    results['total_score'] = total_weighted_score

    print(f"\nπ― TOTAL SCORE: {total_weighted_score:.1f}")

    return results
+
|
| 422 |
+
# =============================================================================
|
| 423 |
+
# UTILITY METHODS
|
| 424 |
+
# =============================================================================
|
| 425 |
+
|
| 426 |
+
def get_field_info(self, field_name: str) -> Optional[Dict[str, Any]]:
    """Look up the full registry entry for *field_name* (None if absent)."""
    return self.get_field_definitions().get(field_name)
+
|
| 431 |
+
def get_field_jsonpath(self, field_name: str) -> Optional[str]:
    """Return the JSONPath expression configured for *field_name*.

    Yields None when the field is unknown or carries no jsonpath.
    """
    info = self.get_field_info(field_name)
    if not info:
        return None
    return info.get("jsonpath")
+
|
| 436 |
+
def get_fields_by_category(self, category: str) -> List[str]:
    """List the names of all registry fields belonging to *category*."""
    matching = []
    for name, config in self.get_field_definitions().items():
        if config.get("category") == category:
            matching.append(name)
    return matching
+
|
| 444 |
+
def get_fields_by_tier(self, tier: str) -> List[str]:
    """List the names of all registry fields assigned to *tier*."""
    definitions = self.get_field_definitions()
    return [name for name, cfg in definitions.items() if cfg.get("tier") == tier]
+
|
| 452 |
+
def validate_registry_integrity(self) -> Dict[str, Any]:
    """Validate the integrity of the loaded registry.

    Checks every field for the required properties (tier, weight,
    category, jsonpath) and tallies category/tier distributions; missing
    scoring-config weights are reported as warnings, not errors.
    Never raises: any failure is captured in the returned dict.
    """
    validation_results = {
        "valid": True,
        "errors": [],
        "warnings": [],
        "field_count": 0,
        "category_distribution": {},
        "tier_distribution": {}
    }

    try:
        fields = self.get_field_definitions()
        validation_results["field_count"] = len(fields)

        # Check category and tier distribution
        categories = {}
        tiers = {}

        for field_name, field_config in fields.items():
            # Check required field properties
            required_props = ["tier", "weight", "category", "jsonpath"]
            missing_props = [prop for prop in required_props if prop not in field_config]

            if missing_props:
                validation_results["errors"].append(
                    f"Field '{field_name}' missing properties: {missing_props}"
                )
                validation_results["valid"] = False

            # Count categories and tiers (fields missing either are
            # bucketed under "unknown")
            category = field_config.get("category", "unknown")
            tier = field_config.get("tier", "unknown")

            categories[category] = categories.get(category, 0) + 1
            tiers[tier] = tiers.get(tier, 0) + 1

        validation_results["category_distribution"] = categories
        validation_results["tier_distribution"] = tiers

        # Check scoring configuration — absence only warns, does not
        # invalidate the registry.
        scoring_config = self.get_scoring_config()
        if not scoring_config.get("tier_weights"):
            validation_results["warnings"].append("Missing tier_weights in scoring_config")

        if not scoring_config.get("category_weights"):
            validation_results["warnings"].append("Missing category_weights in scoring_config")

    except Exception as e:
        # Any unexpected failure marks the registry invalid instead of raising.
        validation_results["valid"] = False
        validation_results["errors"].append(f"Registry validation error: {e}")

    return validation_results
+
|
| 506 |
+
|
| 507 |
+
# =============================================================================
|
| 508 |
+
# GLOBAL INSTANCE AND CONVENIENCE FUNCTIONS
|
| 509 |
+
# =============================================================================
|
| 510 |
+
|
| 511 |
+
# Global registry manager instance (initialized on first import)
_registry_manager = None

def get_field_registry_manager() -> FieldRegistryManager:
    """Get the global field registry manager instance (singleton pattern).

    Lazily constructs a FieldRegistryManager on first call and reuses it
    afterwards. NOTE(review): not thread-safe — concurrent first calls
    could each construct a manager; confirm single-threaded startup.
    """
    global _registry_manager
    if _registry_manager is None:
        _registry_manager = FieldRegistryManager()
    return _registry_manager
+
|
| 521 |
+
# Convenience functions for backward compatibility with existing code
|
| 522 |
+
|
| 523 |
+
def load_field_registry() -> Dict[str, Any]:
    """Return the complete field registry via the shared singleton manager."""
    return get_field_registry_manager().registry
+
|
| 528 |
+
def generate_field_classification() -> Dict[str, Any]:
    """Build FIELD_CLASSIFICATION through the shared singleton manager."""
    return get_field_registry_manager().generate_field_classification()
+
|
| 533 |
+
def generate_completeness_profiles() -> Dict[str, Any]:
    """Build COMPLETENESS_PROFILES through the shared singleton manager."""
    return get_field_registry_manager().generate_completeness_profiles()
+
|
| 538 |
+
def generate_validation_messages() -> Dict[str, Any]:
    """Build VALIDATION_MESSAGES through the shared singleton manager."""
    return get_field_registry_manager().generate_validation_messages()
+
|
| 543 |
+
def get_configurable_scoring_weights() -> Dict[str, Any]:
    """Fetch configurable scoring weights through the shared singleton manager."""
    return get_field_registry_manager().get_configurable_scoring_weights()
+
|
| 548 |
+
# For compatibility with old DynamicFieldDetector usage
|
| 549 |
+
class DynamicFieldDetector:
    """Backward-compatibility facade over FieldRegistryManager.

    Older code instantiated ``DynamicFieldDetector`` directly; this shim
    keeps that interface alive while delegating all work to the manager.
    """

    def __init__(self, registry_path: str):
        """Create a manager for *registry_path* and expose its registry."""
        manager = FieldRegistryManager(registry_path)
        self.manager = manager
        self.registry = manager.registry

    def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
        """Delegate field-presence detection to the underlying manager."""
        return self.manager.detect_field_presence(aibom, field_path)

    def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
        """Delegate completeness analysis to the underlying manager."""
        return self.manager.analyze_aibom_completeness(aibom)
+
|
| 565 |
+
# Validation function for testing
|
| 566 |
+
def validate_registry_setup() -> bool:
    """Validate that the registry is properly set up and accessible.

    Prints a short human-readable report and returns True on success,
    False otherwise. Intended as a quick smoke test, not production use.

    NOTE(review): the status-icon characters in the print strings appear
    mojibake-mangled in this copy of the file — verify against the
    original file encoding.
    """
    try:
        manager = get_field_registry_manager()
        validation_results = manager.validate_registry_integrity()

        if validation_results["valid"]:
            print(f"β Registry validation successful")
            print(f"   Fields loaded: {validation_results['field_count']}")
            print(f"   Categories: {list(validation_results['category_distribution'].keys())}")
            print(f"   Tiers: {list(validation_results['tier_distribution'].keys())}")
            return True
        else:
            print(f"β Registry validation failed")
            for error in validation_results["errors"]:
                print(f"   Error: {error}")
            return False

    except Exception as e:
        # Setup failures (e.g. missing registry file) are reported, not raised.
        print(f"β Registry setup validation failed: {e}")
        return False
+
|
| 588 |
+
def test_field_registry_manager():
    """
    This function is temporary (or optional later on).
    It serves the purpose of validating the field registry manager after refactoring
    such as replacing old files or methods within for field detection and score calculations
    and comes handy as a debugging tool.

    Returns True when every check runs without raising, False otherwise.
    NOTE(review): status-icon characters in the print strings appear
    mojibake-mangled in this copy — verify against the original encoding.
    """
    try:
        print("π§ͺ Testing Consolidated Field Registry Manager...")

        # Test manager initialization
        manager = get_field_registry_manager()
        print(f"β Manager initialized with registry: {manager.registry_path}")

        # Test configuration generation
        field_classification = manager.generate_field_classification()
        print(f"β Generated FIELD_CLASSIFICATION with {len(field_classification)} fields")

        completeness_profiles = manager.generate_completeness_profiles()
        print(f"β Generated COMPLETENESS_PROFILES with {len(completeness_profiles)} profiles")

        validation_messages = manager.generate_validation_messages()
        print(f"β Generated VALIDATION_MESSAGES with {len(validation_messages)} messages")

        scoring_weights = manager.get_configurable_scoring_weights()
        print(f"β Generated SCORING_WEIGHTS with {len(scoring_weights)} sections")

        # Test field detection capabilities on a few representative fields
        test_fields = ['bomFormat', 'primaryPurpose', 'energyConsumption']
        for field_name in test_fields:
            field_info = manager.get_field_info(field_name)
            if field_info:
                jsonpath = field_info.get('jsonpath', 'N/A')
                category = field_info.get('category', 'N/A')
                tier = field_info.get('tier', 'N/A')
                print(f"β Field '{field_name}': {jsonpath} (category: {category}, tier: {tier})")
            else:
                print(f"β Field '{field_name}' not found in registry")

        # Test registry validation (integrity issues are reported but do
        # not fail this smoke test)
        validation_results = manager.validate_registry_integrity()
        if validation_results["valid"]:
            print("β Registry integrity validation passed")
        else:
            print("β οΈ Registry integrity validation issues found")
            for error in validation_results["errors"]:
                print(f"   Error: {error}")

        print("π Consolidated field registry manager test completed successfully!")
        return True

    except Exception as e:
        print(f"β Field registry manager test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
+
|
| 645 |
+
if __name__ == "__main__":
    # Test the consolidated manager when run directly (smoke test only;
    # exit status is not set from the boolean result)
    test_field_registry_manager()
| 648 |
+
|
src/aibom-generator/generator.py
CHANGED
|
@@ -1,13 +1,30 @@
|
|
| 1 |
import json
|
| 2 |
import uuid
|
| 3 |
import datetime
|
|
|
|
| 4 |
from typing import Dict, Optional, Any, List
|
| 5 |
|
| 6 |
-
|
| 7 |
from huggingface_hub import HfApi, ModelCard
|
|
|
|
| 8 |
from urllib.parse import urlparse
|
| 9 |
from .utils import calculate_completeness_score
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class AIBOMGenerator:
|
| 13 |
def __init__(
|
|
@@ -16,7 +33,7 @@ class AIBOMGenerator:
|
|
| 16 |
inference_model_url: Optional[str] = None,
|
| 17 |
use_inference: bool = True,
|
| 18 |
cache_dir: Optional[str] = None,
|
| 19 |
-
use_best_practices: bool = True, #
|
| 20 |
):
|
| 21 |
self.hf_api = HfApi(token=hf_token)
|
| 22 |
self.inference_model_url = inference_model_url
|
|
@@ -24,13 +41,48 @@ class AIBOMGenerator:
|
|
| 24 |
self.cache_dir = cache_dir
|
| 25 |
self.enhancement_report = None # Store enhancement report as instance variable
|
| 26 |
self.use_best_practices = use_best_practices # Store best practices flag
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def generate_aibom(
|
| 29 |
self,
|
| 30 |
model_id: str,
|
| 31 |
output_file: Optional[str] = None,
|
| 32 |
include_inference: Optional[bool] = None,
|
| 33 |
-
use_best_practices: Optional[bool] = None, #
|
| 34 |
) -> Dict[str, Any]:
|
| 35 |
try:
|
| 36 |
model_id = self._normalise_model_id(model_id)
|
|
@@ -43,12 +95,59 @@ class AIBOMGenerator:
|
|
| 43 |
|
| 44 |
# Store original metadata before any AI enhancement
|
| 45 |
original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# Create initial AIBOM with original metadata
|
| 48 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# Calculate initial score with industry-neutral approach if enabled
|
| 51 |
-
original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices)
|
|
|
|
| 52 |
|
| 53 |
# Final metadata starts with original metadata
|
| 54 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
|
@@ -74,12 +173,19 @@ class AIBOMGenerator:
|
|
| 74 |
except Exception as e:
|
| 75 |
print(f"Error during AI enhancement: {e}")
|
| 76 |
# Continue with original metadata if enhancement fails
|
| 77 |
-
|
|
|
|
| 78 |
# Create final AIBOM with potentially enhanced metadata
|
| 79 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
| 80 |
|
| 81 |
-
# Calculate final score with
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
if output_file:
|
|
@@ -98,8 +204,8 @@ class AIBOMGenerator:
|
|
| 98 |
# Return only the AIBOM to maintain compatibility with existing code
|
| 99 |
return aibom
|
| 100 |
except Exception as e:
|
| 101 |
-
print(f"Error generating
|
| 102 |
-
# Return a minimal valid
|
| 103 |
return self._create_minimal_aibom(model_id)
|
| 104 |
|
| 105 |
def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
|
|
@@ -156,7 +262,7 @@ class AIBOMGenerator:
|
|
| 156 |
print(f"Error fetching model info for {model_id}: {e}")
|
| 157 |
return {}
|
| 158 |
|
| 159 |
-
|
| 160 |
@staticmethod
|
| 161 |
def _normalise_model_id(raw_id: str) -> str:
|
| 162 |
"""
|
|
@@ -171,7 +277,7 @@ class AIBOMGenerator:
|
|
| 171 |
return "/".join(parts[:2])
|
| 172 |
return path
|
| 173 |
return raw_id
|
| 174 |
-
|
| 175 |
|
| 176 |
def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
|
| 177 |
try:
|
|
@@ -185,6 +291,12 @@ class AIBOMGenerator:
|
|
| 185 |
model_id: str,
|
| 186 |
metadata: Dict[str, Any],
|
| 187 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
# Extract owner and model name from model_id
|
| 189 |
parts = model_id.split("/")
|
| 190 |
group = parts[0] if len(parts) > 1 else ""
|
|
@@ -192,6 +304,9 @@ class AIBOMGenerator:
|
|
| 192 |
|
| 193 |
# Get version from metadata or use default
|
| 194 |
version = metadata.get("commit", "1.0")
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
aibom = {
|
| 197 |
"bomFormat": "CycloneDX",
|
|
@@ -206,7 +321,10 @@ class AIBOMGenerator:
|
|
| 206 |
"dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
|
| 207 |
}
|
| 208 |
]
|
| 209 |
-
}
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
# ALWAYS add root-level external references
|
| 212 |
aibom["externalReferences"] = [{
|
|
@@ -220,6 +338,7 @@ class AIBOMGenerator:
|
|
| 220 |
"url": metadata["commit_url"]
|
| 221 |
} )
|
| 222 |
|
|
|
|
| 223 |
return aibom
|
| 224 |
|
| 225 |
def _extract_structured_metadata(
|
|
@@ -228,6 +347,48 @@ class AIBOMGenerator:
|
|
| 228 |
model_info: Dict[str, Any],
|
| 229 |
model_card: Optional[ModelCard],
|
| 230 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
metadata = {}
|
| 232 |
|
| 233 |
if model_info:
|
|
@@ -248,7 +409,7 @@ class AIBOMGenerator:
|
|
| 248 |
"downloads": getattr(model_info, "downloads", 0),
|
| 249 |
"last_modified": getattr(model_info, "lastModified", None),
|
| 250 |
"commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
|
| 251 |
-
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None) else None,
|
| 252 |
})
|
| 253 |
except Exception as e:
|
| 254 |
print(f"Error extracting model info metadata: {e}")
|
|
@@ -290,6 +451,7 @@ class AIBOMGenerator:
|
|
| 290 |
print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
|
| 291 |
|
| 292 |
return {k: v for k, v in metadata.items() if v is not None}
|
|
|
|
| 293 |
|
| 294 |
|
| 295 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
|
|
@@ -301,6 +463,9 @@ class AIBOMGenerator:
|
|
| 301 |
|
| 302 |
|
| 303 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
| 304 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
| 305 |
|
| 306 |
# Get version from metadata or use default
|
|
@@ -358,24 +523,43 @@ class AIBOMGenerator:
|
|
| 358 |
|
| 359 |
# ALWAYS add critical fields for scoring
|
| 360 |
critical_fields = {
|
| 361 |
-
"primaryPurpose": metadata.get("primaryPurpose",
|
| 362 |
-
"suppliedBy": metadata.get("suppliedBy",
|
| 363 |
-
"typeOfModel": metadata.get("
|
| 364 |
}
|
| 365 |
-
|
| 366 |
-
# Add critical fields first
|
| 367 |
for key, value in critical_fields.items():
|
| 368 |
-
|
| 369 |
-
properties.append({"name": key, "value": str(value)})
|
| 370 |
|
| 371 |
-
# Add
|
| 372 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
for key, value in metadata.items():
|
| 374 |
-
|
|
|
|
|
|
|
| 375 |
if isinstance(value, (list, dict)):
|
| 376 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
value = json.dumps(value)
|
|
|
|
| 378 |
properties.append({"name": key, "value": str(value)})
|
|
|
|
| 379 |
|
| 380 |
# Assemble metadata section
|
| 381 |
metadata_section = {
|
|
@@ -388,6 +572,9 @@ class AIBOMGenerator:
|
|
| 388 |
return metadata_section
|
| 389 |
|
| 390 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
| 391 |
# Extract owner and model name from model_id
|
| 392 |
parts = model_id.split("/")
|
| 393 |
group = parts[0] if len(parts) > 1 else ""
|
|
@@ -412,7 +599,7 @@ class AIBOMGenerator:
|
|
| 412 |
"purl": purl
|
| 413 |
}
|
| 414 |
|
| 415 |
-
#
|
| 416 |
if metadata and "license" in metadata and metadata["license"]:
|
| 417 |
component["licenses"] = [{
|
| 418 |
"license": {
|
|
@@ -420,14 +607,48 @@ class AIBOMGenerator:
|
|
| 420 |
"url": self._get_license_url(metadata["license"])
|
| 421 |
}
|
| 422 |
}]
|
|
|
|
| 423 |
else:
|
| 424 |
-
# Add default license structure for consistency
|
| 425 |
component["licenses"] = [{
|
| 426 |
"license": {
|
| 427 |
-
"id": "
|
| 428 |
"url": "https://spdx.org/licenses/"
|
| 429 |
}
|
| 430 |
}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
# Debug
|
| 432 |
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
| 433 |
if "license" in metadata:
|
|
@@ -435,6 +656,21 @@ class AIBOMGenerator:
|
|
| 435 |
|
| 436 |
# ALWAYS add description
|
| 437 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
# Add external references
|
| 440 |
external_refs = [{
|
|
@@ -470,26 +706,70 @@ class AIBOMGenerator:
|
|
| 470 |
|
| 471 |
return component
|
| 472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
| 474 |
model_card_section = {}
|
| 475 |
|
| 476 |
# Add quantitative analysis section
|
| 477 |
if "eval_results" in metadata:
|
| 478 |
model_card_section["quantitativeAnalysis"] = {
|
| 479 |
-
"performanceMetrics": metadata["eval_results"],
|
| 480 |
"graphics": {} # Empty graphics object as in the example
|
| 481 |
}
|
| 482 |
else:
|
| 483 |
model_card_section["quantitativeAnalysis"] = {"graphics": {}}
|
| 484 |
|
| 485 |
-
# Add properties section
|
| 486 |
properties = []
|
| 487 |
-
for key, value in metadata.items():
|
| 488 |
-
if key in ["author", "library_name", "license", "downloads", "likes", "tags", "created_at", "last_modified"]:
|
| 489 |
-
properties.append({"name": key, "value": str(value)})
|
| 490 |
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
# Create model parameters section
|
| 495 |
model_parameters = {}
|
|
@@ -538,6 +818,25 @@ class AIBOMGenerator:
|
|
| 538 |
|
| 539 |
# Add model parameters to model card section
|
| 540 |
model_card_section["modelParameters"] = model_parameters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
# Add considerations section
|
| 543 |
considerations = {}
|
|
@@ -578,4 +877,112 @@ class AIBOMGenerator:
|
|
| 578 |
logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
|
| 579 |
return None
|
| 580 |
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
| 581 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import uuid
|
| 3 |
import datetime
|
| 4 |
+
import json
|
| 5 |
from typing import Dict, Optional, Any, List
|
| 6 |
|
|
|
|
| 7 |
from huggingface_hub import HfApi, ModelCard
|
| 8 |
+
from huggingface_hub.repocard_data import EvalResult
|
| 9 |
from urllib.parse import urlparse
|
| 10 |
from .utils import calculate_completeness_score
|
| 11 |
|
| 12 |
+
# Import registry-aware enhanced extraction if available
|
| 13 |
+
try:
|
| 14 |
+
from .enhanced_extractor import EnhancedExtractor
|
| 15 |
+
from .field_registry_manager import get_field_registry_manager
|
| 16 |
+
ENHANCED_EXTRACTION_AVAILABLE = True
|
| 17 |
+
print("β
Registry-aware enhanced extraction module loaded successfully")
|
| 18 |
+
except ImportError:
|
| 19 |
+
try:
|
| 20 |
+
from enhanced_extractor import EnhancedExtractor
|
| 21 |
+
from field_registry_manager import get_field_registry_manager
|
| 22 |
+
ENHANCED_EXTRACTION_AVAILABLE = True
|
| 23 |
+
print("β
Registry-aware enhanced extraction module loaded successfully (direct import)")
|
| 24 |
+
except ImportError:
|
| 25 |
+
ENHANCED_EXTRACTION_AVAILABLE = False
|
| 26 |
+
print("β οΈ Registry-aware enhanced extraction not available, using basic extraction")
|
| 27 |
+
|
| 28 |
|
| 29 |
class AIBOMGenerator:
|
| 30 |
def __init__(
|
|
|
|
| 33 |
inference_model_url: Optional[str] = None,
|
| 34 |
use_inference: bool = True,
|
| 35 |
cache_dir: Optional[str] = None,
|
| 36 |
+
use_best_practices: bool = True, # parameter for industry-neutral scoring
|
| 37 |
):
|
| 38 |
self.hf_api = HfApi(token=hf_token)
|
| 39 |
self.inference_model_url = inference_model_url
|
|
|
|
| 41 |
self.cache_dir = cache_dir
|
| 42 |
self.enhancement_report = None # Store enhancement report as instance variable
|
| 43 |
self.use_best_practices = use_best_practices # Store best practices flag
|
| 44 |
+
self._setup_enhanced_logging()
|
| 45 |
+
|
| 46 |
+
self.extraction_results = {} # Store extraction results for scoring
|
| 47 |
+
|
| 48 |
+
# Initialize registry manager for enhanced extraction
|
| 49 |
+
self.registry_manager = None
|
| 50 |
+
if ENHANCED_EXTRACTION_AVAILABLE:
|
| 51 |
+
try:
|
| 52 |
+
self.registry_manager = get_field_registry_manager()
|
| 53 |
+
print("β
Registry manager initialized for generator")
|
| 54 |
+
except Exception as e:
|
| 55 |
+
print(f"β οΈ Could not initialize registry manager: {e}")
|
| 56 |
+
self.registry_manager = None
|
| 57 |
+
|
| 58 |
+
def get_extraction_results(self):
    """Return the enhanced extraction results from the most recent run.

    Falls back to an empty dict when no extraction has been performed yet
    (the ``extraction_results`` attribute may not exist on
    partially-initialised instances).
    """
    try:
        return self.extraction_results
    except AttributeError:
        return {}
|
| 62 |
+
def _setup_enhanced_logging(self):
    """Setup enhanced logging for extraction tracking.

    Reconfigures the root logger; ``force=True`` removes any handlers that
    were already attached, so this overrides prior logging configuration
    process-wide.
    """
    import logging

    # Configure logging to show in HF Spaces
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        force=True  # Override any existing configuration
    )

    # Ensure logger shows up
    logger = logging.getLogger('enhanced_extractor')
    logger.setLevel(logging.INFO)

    print("π§ Enhanced logging configured for AI SBOM generation")
+
|
| 79 |
+
|
| 80 |
def generate_aibom(
|
| 81 |
self,
|
| 82 |
model_id: str,
|
| 83 |
output_file: Optional[str] = None,
|
| 84 |
include_inference: Optional[bool] = None,
|
| 85 |
+
use_best_practices: Optional[bool] = None, # parameter for industry-neutral scoring
|
| 86 |
) -> Dict[str, Any]:
|
| 87 |
try:
|
| 88 |
model_id = self._normalise_model_id(model_id)
|
|
|
|
| 95 |
|
| 96 |
# Store original metadata before any AI enhancement
|
| 97 |
original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
|
| 98 |
+
print(f"π ENHANCED EXTRACTION DEBUG: Returned {len(original_metadata)} fields:")
|
| 99 |
+
for key, value in original_metadata.items():
|
| 100 |
+
print(f" {key}: {value}")
|
| 101 |
+
print(f"π EXTRACTION RESULTS: {len(self.extraction_results) if hasattr(self, 'extraction_results') and self.extraction_results else 0} extraction results available")
|
| 102 |
|
| 103 |
# Create initial AIBOM with original metadata
|
| 104 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
| 105 |
+
|
| 106 |
+
print(f"π AI SBOM CREATION DEBUG: Checking what made it into AIBOM:")
|
| 107 |
+
if 'components' in original_aibom and original_aibom['components']:
|
| 108 |
+
component = original_aibom['components'][0]
|
| 109 |
+
if 'properties' in component:
|
| 110 |
+
print(f" Found {len(component['properties'])} properties in AIBOM:")
|
| 111 |
+
for prop in component['properties']:
|
| 112 |
+
print(f" {prop.get('name')}: {prop.get('value')}")
|
| 113 |
+
else:
|
| 114 |
+
print(" No properties found in component")
|
| 115 |
+
else:
|
| 116 |
+
print(" No components found in AI SBOM")
|
| 117 |
+
print(f"π FIELD PRESERVATION VERIFICATION:")
|
| 118 |
+
print(f" Enhanced extraction returned: {len(original_metadata)} fields")
|
| 119 |
+
|
| 120 |
+
# Count fields in final AIBOM
|
| 121 |
+
aibom_field_count = 0
|
| 122 |
+
|
| 123 |
+
# Count component properties
|
| 124 |
+
if 'components' in original_aibom and original_aibom['components']:
|
| 125 |
+
component = original_aibom['components'][0]
|
| 126 |
+
if 'properties' in component:
|
| 127 |
+
aibom_field_count += len(component['properties'])
|
| 128 |
+
|
| 129 |
+
# Count model card properties
|
| 130 |
+
if 'modelCard' in component and 'properties' in component['modelCard']:
|
| 131 |
+
aibom_field_count += len(component['modelCard']['properties'])
|
| 132 |
+
|
| 133 |
+
# Count metadata properties
|
| 134 |
+
if 'metadata' in original_aibom and 'properties' in original_aibom['metadata']:
|
| 135 |
+
aibom_field_count += len(original_aibom['metadata']['properties'])
|
| 136 |
+
|
| 137 |
+
print(f" Final AIBOM contains: {aibom_field_count} fields")
|
| 138 |
+
print(f" Field preservation rate: {(aibom_field_count/len(original_metadata)*100):.1f}%")
|
| 139 |
+
|
| 140 |
+
if aibom_field_count >= len(original_metadata) * 0.9: # 90% or better
|
| 141 |
+
print("β
EXCELLENT: Field preservation successful!")
|
| 142 |
+
elif aibom_field_count >= len(original_metadata) * 0.7: # 70% or better
|
| 143 |
+
print("β οΈ GOOD: Most fields preserved, some optimization possible")
|
| 144 |
+
else:
|
| 145 |
+
print("β POOR: Significant field loss detected")
|
| 146 |
+
|
| 147 |
|
| 148 |
# Calculate initial score with industry-neutral approach if enabled
|
| 149 |
+
original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices, extraction_results=self.extraction_results)
|
| 150 |
+
|
| 151 |
|
| 152 |
# Final metadata starts with original metadata
|
| 153 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
|
|
|
| 173 |
except Exception as e:
|
| 174 |
print(f"Error during AI enhancement: {e}")
|
| 175 |
# Continue with original metadata if enhancement fails
|
| 176 |
+
print("π¨ FALLBACK: Using _create_minimal_aibom due to error!")
|
| 177 |
+
print(f"π¨ ERROR DETAILS: {str(e)}")
|
| 178 |
# Create final AIBOM with potentially enhanced metadata
|
| 179 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
| 180 |
|
| 181 |
+
# Calculate final score with enhanced extraction results
|
| 182 |
+
extraction_results = self.get_extraction_results()
|
| 183 |
+
final_score = calculate_completeness_score(
|
| 184 |
+
aibom,
|
| 185 |
+
validate=True,
|
| 186 |
+
use_best_practices=use_best_practices,
|
| 187 |
+
extraction_results=extraction_results # Pass enhanced results
|
| 188 |
+
)
|
| 189 |
|
| 190 |
|
| 191 |
if output_file:
|
|
|
|
| 204 |
# Return only the AIBOM to maintain compatibility with existing code
|
| 205 |
return aibom
|
| 206 |
except Exception as e:
|
| 207 |
+
print(f"Error generating AI SBOM: {e}")
|
| 208 |
+
# Return a minimal valid AI SBOM structure in case of error
|
| 209 |
return self._create_minimal_aibom(model_id)
|
| 210 |
|
| 211 |
def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
|
|
|
|
| 262 |
print(f"Error fetching model info for {model_id}: {e}")
|
| 263 |
return {}
|
| 264 |
|
| 265 |
+
|
| 266 |
@staticmethod
|
| 267 |
def _normalise_model_id(raw_id: str) -> str:
|
| 268 |
"""
|
|
|
|
| 277 |
return "/".join(parts[:2])
|
| 278 |
return path
|
| 279 |
return raw_id
|
| 280 |
+
|
| 281 |
|
| 282 |
def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
|
| 283 |
try:
|
|
|
|
| 291 |
model_id: str,
|
| 292 |
metadata: Dict[str, Any],
|
| 293 |
) -> Dict[str, Any]:
|
| 294 |
+
# π CRASH DEBUG: troubleshoot where the process is crashing and falling back to minimal AIBOM
|
| 295 |
+
print(f"π CRASH_DEBUG: _create_aibom_structure called")
|
| 296 |
+
print(f"π CRASH_DEBUG: model_id = {model_id}")
|
| 297 |
+
print(f"π CRASH_DEBUG: metadata type = {type(metadata)}")
|
| 298 |
+
print(f"π CRASH_DEBUG: metadata keys = {list(metadata.keys()) if isinstance(metadata, dict) else 'NOT A DICT'}")
|
| 299 |
+
|
| 300 |
# Extract owner and model name from model_id
|
| 301 |
parts = model_id.split("/")
|
| 302 |
group = parts[0] if len(parts) > 1 else ""
|
|
|
|
| 304 |
|
| 305 |
# Get version from metadata or use default
|
| 306 |
version = metadata.get("commit", "1.0")
|
| 307 |
+
|
| 308 |
+
# π CRASH DEBUG: Check metadata before creating sections
|
| 309 |
+
print(f"π CRASH_DEBUG: About to create metadata section")
|
| 310 |
|
| 311 |
aibom = {
|
| 312 |
"bomFormat": "CycloneDX",
|
|
|
|
| 321 |
"dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
|
| 322 |
}
|
| 323 |
]
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
# π CRASH DEBUG: Check if we got this far
|
| 327 |
+
print(f"π CRASH_DEBUG: Successfully created basic AIBOM structure")
|
| 328 |
|
| 329 |
# ALWAYS add root-level external references
|
| 330 |
aibom["externalReferences"] = [{
|
|
|
|
| 338 |
"url": metadata["commit_url"]
|
| 339 |
} )
|
| 340 |
|
| 341 |
+
print(f"π CRASH_DEBUG: _create_aibom_structure completed successfully")
|
| 342 |
return aibom
|
| 343 |
|
| 344 |
def _extract_structured_metadata(
|
|
|
|
| 347 |
model_info: Dict[str, Any],
|
| 348 |
model_card: Optional[ModelCard],
|
| 349 |
) -> Dict[str, Any]:
|
| 350 |
+
|
| 351 |
+
# Use registry-aware enhanced extraction if available
|
| 352 |
+
if ENHANCED_EXTRACTION_AVAILABLE:
|
| 353 |
+
try:
|
| 354 |
+
print(f"π Using registry-aware enhanced extraction for: {model_id}")
|
| 355 |
+
|
| 356 |
+
# Create registry-aware enhanced extractor instance
|
| 357 |
+
extractor = EnhancedExtractor(self.hf_api, self.registry_manager)
|
| 358 |
+
|
| 359 |
+
# Get both metadata and extraction results
|
| 360 |
+
metadata = extractor.extract_metadata(model_id, model_info, model_card)
|
| 361 |
+
|
| 362 |
+
# Store extraction results for scoring
|
| 363 |
+
self.extraction_results = extractor.extraction_results
|
| 364 |
+
|
| 365 |
+
# Log extraction summary
|
| 366 |
+
if extractor.registry_fields:
|
| 367 |
+
registry_field_count = len(extractor.registry_fields)
|
| 368 |
+
extracted_count = len([k for k, v in metadata.items() if v is not None])
|
| 369 |
+
extraction_results_count = len(extractor.extraction_results)
|
| 370 |
+
|
| 371 |
+
print(f"β
Registry-driven extraction completed:")
|
| 372 |
+
print(f" π Registry fields available: {registry_field_count}")
|
| 373 |
+
print(f" π Fields attempted: {extraction_results_count}")
|
| 374 |
+
print(f" β
Fields extracted: {extracted_count}")
|
| 375 |
+
|
| 376 |
+
# Log field coverage
|
| 377 |
+
if registry_field_count > 0:
|
| 378 |
+
coverage = (extracted_count / registry_field_count) * 100
|
| 379 |
+
print(f" π Registry field coverage: {coverage:.1f}%")
|
| 380 |
+
else:
|
| 381 |
+
extracted_count = len([k for k, v in metadata.items() if v is not None])
|
| 382 |
+
print(f"β
Legacy extraction completed: {extracted_count} fields extracted")
|
| 383 |
+
|
| 384 |
+
return metadata
|
| 385 |
+
|
| 386 |
+
except Exception as e:
|
| 387 |
+
print(f"β Registry-aware enhanced extraction failed: {e}")
|
| 388 |
+
print("π Falling back to original extraction method")
|
| 389 |
+
# Fall back to original extraction code here
|
| 390 |
+
|
| 391 |
+
# ORIGINAL EXTRACTION METHOD (as fallback)
|
| 392 |
metadata = {}
|
| 393 |
|
| 394 |
if model_info:
|
|
|
|
| 409 |
"downloads": getattr(model_info, "downloads", 0),
|
| 410 |
"last_modified": getattr(model_info, "lastModified", None),
|
| 411 |
"commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
|
| 412 |
+
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None ) else None,
|
| 413 |
})
|
| 414 |
except Exception as e:
|
| 415 |
print(f"Error extracting model info metadata: {e}")
|
|
|
|
| 451 |
print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
|
| 452 |
|
| 453 |
return {k: v for k, v in metadata.items() if v is not None}
|
| 454 |
+
|
| 455 |
|
| 456 |
|
| 457 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
|
|
|
|
| 463 |
|
| 464 |
|
| 465 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 466 |
+
print(f"π CRASH_DEBUG: _create_metadata_section called")
|
| 467 |
+
print(f"π CRASH_DEBUG: metadata type in metadata_section = {type(metadata)}")
|
| 468 |
+
|
| 469 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
| 470 |
|
| 471 |
# Get version from metadata or use default
|
|
|
|
| 523 |
|
| 524 |
# ALWAYS add critical fields for scoring
|
| 525 |
critical_fields = {
|
| 526 |
+
"primaryPurpose": metadata.get("primaryPurpose", "text-generation"),
|
| 527 |
+
"suppliedBy": metadata.get("suppliedBy", "unknown"),
|
| 528 |
+
"typeOfModel": metadata.get("typeOfModel", "Transformer")
|
| 529 |
}
|
|
|
|
|
|
|
| 530 |
for key, value in critical_fields.items():
|
| 531 |
+
properties.append({"name": key, "value": str(value)})
|
|
|
|
| 532 |
|
| 533 |
+
# Add enhanced extraction fields to properties
|
| 534 |
+
# Organize fields by category for better AIBOM structure
|
| 535 |
+
component_fields = ["name", "author", "description", "commit"] # These go in component section
|
| 536 |
+
critical_fields = ["primaryPurpose", "suppliedBy", "typeOfModel"] # Always include these
|
| 537 |
+
|
| 538 |
+
# Add all other enhanced extraction fields (preserve everything!)
|
| 539 |
+
enhanced_fields = ["model_type", "tokenizer_class", "architectures", "library_name",
|
| 540 |
+
"pipeline_tag", "tags", "datasets", "base_model", "language",
|
| 541 |
+
"downloads", "last_modified", "commit_url", "ai:type", "ai:task",
|
| 542 |
+
"ai:framework", "eval_results"]
|
| 543 |
+
|
| 544 |
+
print(f"π CRASH_DEBUG: About to call .items() on metadata")
|
| 545 |
+
print(f"π CRASH_DEBUG: metadata type before .items() = {type(metadata)}")
|
| 546 |
+
|
| 547 |
for key, value in metadata.items():
|
| 548 |
+
# Skip component fields (handled elsewhere) but include everything else
|
| 549 |
+
if key not in component_fields and value is not None:
|
| 550 |
+
# Handle different data types properly
|
| 551 |
if isinstance(value, (list, dict)):
|
| 552 |
+
if isinstance(value, list) and len(value) > 0:
|
| 553 |
+
# Convert list to comma-separated string for better display
|
| 554 |
+
if all(isinstance(item, str) for item in value):
|
| 555 |
+
value = ", ".join(value)
|
| 556 |
+
else:
|
| 557 |
+
value = json.dumps(value)
|
| 558 |
+
elif isinstance(value, dict):
|
| 559 |
value = json.dumps(value)
|
| 560 |
+
|
| 561 |
properties.append({"name": key, "value": str(value)})
|
| 562 |
+
print(f"β
METADATA: Added {key} = {value} to properties")
|
| 563 |
|
| 564 |
# Assemble metadata section
|
| 565 |
metadata_section = {
|
|
|
|
| 572 |
return metadata_section
|
| 573 |
|
| 574 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 575 |
+
print(f"π CRASH_DEBUG: _create_component_section called")
|
| 576 |
+
print(f"π CRASH_DEBUG: metadata type in component_section = {type(metadata)}")
|
| 577 |
+
|
| 578 |
# Extract owner and model name from model_id
|
| 579 |
parts = model_id.split("/")
|
| 580 |
group = parts[0] if len(parts) > 1 else ""
|
|
|
|
| 599 |
"purl": purl
|
| 600 |
}
|
| 601 |
|
| 602 |
+
# Handle license
|
| 603 |
if metadata and "license" in metadata and metadata["license"]:
|
| 604 |
component["licenses"] = [{
|
| 605 |
"license": {
|
|
|
|
| 607 |
"url": self._get_license_url(metadata["license"])
|
| 608 |
}
|
| 609 |
}]
|
| 610 |
+
print(f"β
COMPONENT: Added license = {metadata['license']}")
|
| 611 |
else:
|
|
|
|
| 612 |
component["licenses"] = [{
|
| 613 |
"license": {
|
| 614 |
+
"id": "NOASSERTION",
|
| 615 |
"url": "https://spdx.org/licenses/"
|
| 616 |
}
|
| 617 |
}]
|
| 618 |
+
print(f"β οΈ COMPONENT: No license found, using NOASSERTION")
|
| 619 |
+
|
| 620 |
+
# ALWAYS add description
|
| 621 |
+
component["description"] = metadata.get("description", f"AI model {model_id}")
|
| 622 |
+
|
| 623 |
+
# Add enhanced technical properties to component
|
| 624 |
+
technical_properties = []
|
| 625 |
+
|
| 626 |
+
# Add model type information
|
| 627 |
+
if "model_type" in metadata:
|
| 628 |
+
technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
|
| 629 |
+
print(f"β
COMPONENT: Added model_type = {metadata['model_type']}")
|
| 630 |
+
|
| 631 |
+
# Add tokenizer information
|
| 632 |
+
if "tokenizer_class" in metadata:
|
| 633 |
+
technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
|
| 634 |
+
print(f"β
COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
|
| 635 |
+
|
| 636 |
+
# Add architecture information
|
| 637 |
+
if "architectures" in metadata:
|
| 638 |
+
arch_value = metadata["architectures"]
|
| 639 |
+
if isinstance(arch_value, list):
|
| 640 |
+
arch_value = ", ".join(arch_value)
|
| 641 |
+
technical_properties.append({"name": "architectures", "value": str(arch_value)})
|
| 642 |
+
print(f"β
COMPONENT: Added architectures = {arch_value}")
|
| 643 |
+
|
| 644 |
+
# Add library information
|
| 645 |
+
if "library_name" in metadata:
|
| 646 |
+
technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
|
| 647 |
+
print(f"β
COMPONENT: Added library_name = {metadata['library_name']}")
|
| 648 |
+
|
| 649 |
+
# Add technical properties to component if any exist
|
| 650 |
+
if technical_properties:
|
| 651 |
+
component["properties"] = technical_properties
|
| 652 |
# Debug
|
| 653 |
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
| 654 |
if "license" in metadata:
|
|
|
|
| 656 |
|
| 657 |
# ALWAYS add description
|
| 658 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
| 659 |
+
if metadata.get("license"):
|
| 660 |
+
component["licenses"] = [{
|
| 661 |
+
"license": {
|
| 662 |
+
"id": metadata["license"],
|
| 663 |
+
"url": self._get_license_url(metadata["license"])
|
| 664 |
+
}
|
| 665 |
+
}]
|
| 666 |
+
else:
|
| 667 |
+
component["licenses"] = [{
|
| 668 |
+
"license": {
|
| 669 |
+
"id": "unknown",
|
| 670 |
+
"url": "https://spdx.org/licenses/"
|
| 671 |
+
}
|
| 672 |
+
}]
|
| 673 |
+
|
| 674 |
|
| 675 |
# Add external references
|
| 676 |
external_refs = [{
|
|
|
|
| 706 |
|
| 707 |
return component
|
| 708 |
|
| 709 |
+
def _eval_results_to_json(self, eval_results: List[EvalResult]) -> List[Dict[str, str]]:
    """Serialise evaluation results into performance-metric dictionaries.

    Each result contributes ``{"type": metric_type, "value": str(metric_value)}``;
    entries missing either attribute are silently skipped.
    """
    return [
        {"type": item.metric_type, "value": str(item.metric_value)}
        for item in eval_results
        if hasattr(item, "metric_type") and hasattr(item, "metric_value")
    ]
|
| 715 |
+
|
| 716 |
+
|
| 717 |
def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 718 |
+
print(f"π CRASH_DEBUG: _create_model_card_section called")
|
| 719 |
+
print(f"π CRASH_DEBUG: metadata type in model_card_section = {type(metadata)}")
|
| 720 |
+
|
| 721 |
model_card_section = {}
|
| 722 |
|
| 723 |
# Add quantitative analysis section
|
| 724 |
if "eval_results" in metadata:
|
| 725 |
model_card_section["quantitativeAnalysis"] = {
|
| 726 |
+
"performanceMetrics": self._eval_results_to_json(metadata["eval_results"]),
|
| 727 |
"graphics": {} # Empty graphics object as in the example
|
| 728 |
}
|
| 729 |
else:
|
| 730 |
model_card_section["quantitativeAnalysis"] = {"graphics": {}}
|
| 731 |
|
| 732 |
+
# Add properties section with enhanced extraction fields
|
| 733 |
properties = []
|
|
|
|
|
|
|
|
|
|
| 734 |
|
| 735 |
+
# Component-level fields that shouldn't be duplicated in model card
|
| 736 |
+
component_level_fields = ["name", "author", "license", "description", "commit"]
|
| 737 |
+
|
| 738 |
+
# DEBUG: troubleshooting AIBOM generation
|
| 739 |
+
print(f"π DEBUG: About to iterate metadata.items()")
|
| 740 |
+
print(f"π DEBUG: metadata type = {type(metadata)}")
|
| 741 |
+
if isinstance(metadata, dict):
|
| 742 |
+
print(f"π DEBUG: metadata keys = {list(metadata.keys())}")
|
| 743 |
+
else:
|
| 744 |
+
print(f"π DEBUG: metadata value = {metadata}")
|
| 745 |
+
print(f"π DEBUG: This is the problem - metadata should be a dict!")
|
| 746 |
+
|
| 747 |
+
# Add all enhanced extraction fields to model card properties
|
| 748 |
+
try:
|
| 749 |
+
for key, value in metadata.items():
|
| 750 |
+
if key not in component_level_fields and value is not None:
|
| 751 |
+
# Handle different data types properly
|
| 752 |
+
if isinstance(value, (list, dict)):
|
| 753 |
+
if isinstance(value, list) and len(value) > 0:
|
| 754 |
+
# Convert list to readable format
|
| 755 |
+
if all(isinstance(item, str) for item in value):
|
| 756 |
+
value = ", ".join(value)
|
| 757 |
+
else:
|
| 758 |
+
value = json.dumps(value)
|
| 759 |
+
elif isinstance(value, dict):
|
| 760 |
+
value = json.dumps(value)
|
| 761 |
+
|
| 762 |
+
properties.append({"name": key, "value": str(value)})
|
| 763 |
+
print(f"β
MODEL_CARD: Added {key} = {value}")
|
| 764 |
+
except AttributeError as e:
|
| 765 |
+
print(f"β FOUND THE ERROR: {e}")
|
| 766 |
+
print(f"β metadata type: {type(metadata)}")
|
| 767 |
+
print(f"β metadata value: {metadata}")
|
| 768 |
+
raise e
|
| 769 |
+
|
| 770 |
+
# Always include properties section (even if empty for consistency)
|
| 771 |
+
model_card_section["properties"] = properties
|
| 772 |
+
print(f"β
MODEL_CARD: Added {len(properties)} properties to model card")
|
| 773 |
|
| 774 |
# Create model parameters section
|
| 775 |
model_parameters = {}
|
|
|
|
| 818 |
|
| 819 |
# Add model parameters to model card section
|
| 820 |
model_card_section["modelParameters"] = model_parameters
|
| 821 |
+
# Add enhanced technical parameters
|
| 822 |
+
if "model_type" in metadata or "tokenizer_class" in metadata or "architectures" in metadata:
|
| 823 |
+
technical_details = {}
|
| 824 |
+
|
| 825 |
+
if "model_type" in metadata:
|
| 826 |
+
technical_details["modelType"] = metadata["model_type"]
|
| 827 |
+
|
| 828 |
+
if "tokenizer_class" in metadata:
|
| 829 |
+
technical_details["tokenizerClass"] = metadata["tokenizer_class"]
|
| 830 |
+
|
| 831 |
+
if "architectures" in metadata:
|
| 832 |
+
technical_details["architectures"] = metadata["architectures"]
|
| 833 |
+
|
| 834 |
+
# Add to model parameters
|
| 835 |
+
model_parameters.update(technical_details)
|
| 836 |
+
print(f"β
MODEL_CARD: Added technical details: {list(technical_details.keys())}")
|
| 837 |
+
|
| 838 |
+
# Update model parameters with enhanced details
|
| 839 |
+
model_card_section["modelParameters"] = model_parameters
|
| 840 |
|
| 841 |
# Add considerations section
|
| 842 |
considerations = {}
|
|
|
|
| 877 |
logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
|
| 878 |
return None
|
| 879 |
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
| 880 |
+
return None
|
| 881 |
+
|
| 882 |
+
def validate_registry_integration(self) -> Dict[str, Any]:
    """
    Validate that the registry integration is working correctly.

    Helps debug registry-related issues by reporting whether the registry
    manager is present, whether its field registry loaded, and how many
    fields it exposes.

    Returns:
        Dict with keys ``registry_manager_available``,
        ``enhanced_extraction_available``, ``registry_fields_count``,
        ``registry_fields_loaded``, ``validation_status`` and, on failure,
        ``error``.

    Fix: the status messages previously contained mojibake-corrupted emoji;
    restored to the intended characters.
    """
    validation_results = {
        'registry_manager_available': bool(self.registry_manager),
        'enhanced_extraction_available': ENHANCED_EXTRACTION_AVAILABLE,
        'registry_fields_count': 0,
        'registry_fields_loaded': False,
        'validation_status': 'unknown'
    }

    try:
        if self.registry_manager:
            registry = self.registry_manager.registry
            registry_fields = registry.get('fields', {})
            validation_results['registry_fields_count'] = len(registry_fields)
            validation_results['registry_fields_loaded'] = len(registry_fields) > 0

            if len(registry_fields) > 0:
                validation_results['validation_status'] = 'success'
                print(f"✅ Registry validation successful: {len(registry_fields)} fields loaded")

                # Log sample fields for a quick eyeball check
                sample_fields = list(registry_fields.keys())[:5]
                print(f"📋 Sample registry fields: {', '.join(sample_fields)}")
            else:
                validation_results['validation_status'] = 'no_fields'
                print("⚠️ Registry loaded but no fields found")
        else:
            validation_results['validation_status'] = 'no_registry_manager'
            print("❌ Registry manager not available")

    except Exception as e:
        # Surface the failure in the report rather than raising: this is a
        # diagnostic helper and must never crash the generator.
        validation_results['validation_status'] = 'error'
        validation_results['error'] = str(e)
        print(f"❌ Registry validation failed: {e}")

    return validation_results
|
| 922 |
+
|
| 923 |
+
def test_registry_integration():
    """
    Smoke-test the registry integration end to end.

    Instantiates the generator, validates registry wiring, then attempts a
    live extraction against a sample model. Intended for manual debugging of
    registry-related issues; it performs network calls to Hugging Face.

    Fix: the module-level invocation below was NOT commented out even though
    its comment said "Uncomment this line to run" — importing generator.py
    triggered network calls as an import side effect. The call is now
    commented out so import stays side-effect free. Mojibake-corrupted emoji
    in the output were also restored.
    """
    print("🧪 Testing Registry Integration...")
    print("=" * 50)

    try:
        # Test generator initialization
        generator = AIBOMGenerator()

        # Validate registry integration
        validation_results = generator.validate_registry_integration()

        print("📊 Validation Results:")
        for key, value in validation_results.items():
            print(f"  {key}: {value}")

        # Test with a sample model
        test_model = "deepseek-ai/DeepSeek-R1"
        print(f"\n🔍 Testing extraction with model: {test_model}")

        try:
            # Test model info retrieval (network)
            model_info = generator.hf_api.model_info(test_model)
            model_card = ModelCard.load(test_model)

            # Test extraction through the registry-aware path when available
            if ENHANCED_EXTRACTION_AVAILABLE and generator.registry_manager:
                extractor = EnhancedExtractor(generator.hf_api, generator.registry_manager)
                metadata = extractor.extract_metadata(test_model, model_info, model_card)

                print(f"✅ Test extraction successful: {len(metadata)} fields extracted")

                # Show sample extracted fields
                sample_fields = dict(list(metadata.items())[:5])
                print("📋 Sample extracted fields:")
                for key, value in sample_fields.items():
                    print(f"  {key}: {value}")

                # Summarise extraction confidence distribution
                extraction_results = extractor.get_extraction_results()
                confidence_counts = {}
                for result in extraction_results.values():
                    conf = result.confidence.value
                    confidence_counts[conf] = confidence_counts.get(conf, 0) + 1

                print("📊 Extraction confidence distribution:")
                for conf, count in confidence_counts.items():
                    print(f"  {conf}: {count} fields")

            else:
                print("⚠️ Registry-aware extraction not available for testing")

        except Exception as e:
            print(f"❌ Test extraction failed: {e}")

    except Exception as e:
        print(f"❌ Registry integration test failed: {e}")

    print("=" * 50)
    print("🧪 Registry Integration Test Complete")


# Uncomment this line to run the test automatically when generator.py is imported
# test_registry_integration()
|
src/aibom-generator/utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
"""
|
| 4 |
|
| 5 |
import json
|
|
@@ -9,6 +9,14 @@ import re
|
|
| 9 |
import uuid
|
| 10 |
from typing import Dict, List, Optional, Any, Union, Tuple
|
| 11 |
from enum import Enum
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
|
@@ -18,98 +26,123 @@ class ValidationSeverity(Enum):
|
|
| 18 |
WARNING = "warning"
|
| 19 |
INFO = "info"
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
"
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
"
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
"
|
| 68 |
-
"
|
| 69 |
-
|
| 70 |
-
"
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
}
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
"downloadLocation": {
|
| 89 |
-
"missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
|
| 90 |
-
"recommendation": "Add information about where the model can be downloaded"
|
| 91 |
-
},
|
| 92 |
-
"primaryPurpose": {
|
| 93 |
-
"missing": "Missing critical field: primaryPurpose - important for understanding model intent",
|
| 94 |
-
"recommendation": "Add information about the primary purpose of this model"
|
| 95 |
-
},
|
| 96 |
-
"suppliedBy": {
|
| 97 |
-
"missing": "Missing critical field: suppliedBy - needed for provenance tracking",
|
| 98 |
-
"recommendation": "Add information about who supplied this model"
|
| 99 |
-
},
|
| 100 |
-
"energyConsumption": {
|
| 101 |
-
"missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
|
| 102 |
-
"recommendation": "Consider documenting energy consumption metrics for better transparency"
|
| 103 |
-
},
|
| 104 |
-
"hyperparameter": {
|
| 105 |
-
"missing": "Missing important field: hyperparameter - valuable for reproducibility",
|
| 106 |
-
"recommendation": "Document key hyperparameters used in training"
|
| 107 |
-
},
|
| 108 |
-
"limitation": {
|
| 109 |
-
"missing": "Missing important field: limitation - important for responsible use",
|
| 110 |
-
"recommendation": "Document known limitations of the model to guide appropriate usage"
|
| 111 |
}
|
| 112 |
-
}
|
| 113 |
|
| 114 |
|
| 115 |
def setup_logging(level=logging.INFO):
|
|
@@ -207,77 +240,53 @@ def check_field_in_aibom(aibom: Dict[str, Any], field: str) -> bool:
|
|
| 207 |
Returns:
|
| 208 |
True if the field is present, False otherwise
|
| 209 |
"""
|
| 210 |
-
# Check in root level
|
| 211 |
if field in aibom:
|
| 212 |
return True
|
| 213 |
-
|
| 214 |
-
# Check in metadata
|
| 215 |
if "metadata" in aibom:
|
| 216 |
metadata = aibom["metadata"]
|
| 217 |
if field in metadata:
|
| 218 |
return True
|
| 219 |
-
|
| 220 |
-
# Check in metadata properties
|
| 221 |
if "properties" in metadata:
|
| 222 |
for prop in metadata["properties"]:
|
| 223 |
-
|
|
|
|
| 224 |
return True
|
| 225 |
-
|
| 226 |
-
# Check in components
|
| 227 |
if "components" in aibom and aibom["components"]:
|
| 228 |
-
component = aibom["components"][0]
|
| 229 |
-
|
| 230 |
if field in component:
|
| 231 |
return True
|
| 232 |
-
|
| 233 |
-
# Check in component properties
|
| 234 |
if "properties" in component:
|
| 235 |
for prop in component["properties"]:
|
| 236 |
-
|
|
|
|
| 237 |
return True
|
| 238 |
-
|
| 239 |
-
# Check in model card
|
| 240 |
if "modelCard" in component:
|
| 241 |
model_card = component["modelCard"]
|
| 242 |
-
|
| 243 |
if field in model_card:
|
| 244 |
return True
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
if "modelParameters" in model_card:
|
| 248 |
-
if field in model_card["modelParameters"]:
|
| 249 |
-
return True
|
| 250 |
-
|
| 251 |
-
# Check in model parameters properties
|
| 252 |
-
if "properties" in model_card["modelParameters"]:
|
| 253 |
-
for prop in model_card["modelParameters"]["properties"]:
|
| 254 |
-
if prop.get("name") == f"spdx:{field}" or prop.get("name") == field:
|
| 255 |
-
return True
|
| 256 |
-
|
| 257 |
-
# Check in considerations
|
| 258 |
if "considerations" in model_card:
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
if
|
| 268 |
-
return True
|
| 269 |
-
if field == "energyConsumption" and section == "environmentalConsiderations":
|
| 270 |
return True
|
| 271 |
-
|
| 272 |
-
|
| 273 |
if field == "downloadLocation" and "externalReferences" in aibom:
|
| 274 |
for ref in aibom["externalReferences"]:
|
| 275 |
-
if ref.get("type") == "distribution":
|
| 276 |
return True
|
| 277 |
-
|
| 278 |
return False
|
| 279 |
|
| 280 |
|
|
|
|
| 281 |
def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
|
| 282 |
"""
|
| 283 |
Determine which completeness profile the AIBOM satisfies.
|
|
@@ -835,8 +844,113 @@ def get_validation_summary(report: Dict[str, Any]) -> str:
|
|
| 835 |
|
| 836 |
return summary
|
| 837 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 838 |
|
| 839 |
-
|
|
|
|
| 840 |
"""
|
| 841 |
Calculate completeness score using industry best practices with proper normalization and penalties.
|
| 842 |
|
|
@@ -875,8 +989,8 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 875 |
# Count total fields in this category
|
| 876 |
fields_by_category[category]["total"] += 1
|
| 877 |
|
| 878 |
-
#
|
| 879 |
-
is_present =
|
| 880 |
|
| 881 |
if is_present:
|
| 882 |
fields_by_category[category]["present"] += 1
|
|
@@ -898,6 +1012,19 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 898 |
category_scores[category] = round(raw_score, 1)
|
| 899 |
else:
|
| 900 |
category_scores[category] = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 901 |
|
| 902 |
# Calculate subtotal (sum of rounded category scores)
|
| 903 |
subtotal_score = sum(category_scores.values())
|
|
@@ -1033,7 +1160,7 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1033 |
return result
|
| 1034 |
|
| 1035 |
|
| 1036 |
-
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True) -> Dict[str, Any]:
|
| 1037 |
"""
|
| 1038 |
Calculate completeness score for an AIBOM and optionally validate against AI requirements.
|
| 1039 |
Enhanced with industry best practices scoring.
|
|
@@ -1046,9 +1173,16 @@ def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, u
|
|
| 1046 |
Returns:
|
| 1047 |
Dictionary containing score and validation results
|
| 1048 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
# If using best practices scoring, use the enhanced industry-neutral approach
|
| 1050 |
if use_best_practices:
|
| 1051 |
-
result = calculate_industry_neutral_score(aibom)
|
| 1052 |
|
| 1053 |
# Add validation if requested
|
| 1054 |
if validate:
|
|
@@ -1525,4 +1659,64 @@ def format_score_summary(score_result: Dict[str, Any]) -> str:
|
|
| 1525 |
summary += f"\nCompleteness Profile: {profile['name']}\n"
|
| 1526 |
summary += f"Description: {profile['description']}\n"
|
| 1527 |
|
| 1528 |
-
return summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Mostly score calculation functions for the AI SBOM Generator.
|
| 3 |
"""
|
| 4 |
|
| 5 |
import json
|
|
|
|
| 9 |
import uuid
|
| 10 |
from typing import Dict, List, Optional, Any, Union, Tuple
|
| 11 |
from enum import Enum
|
| 12 |
+
from .field_registry_manager import (
|
| 13 |
+
get_field_registry_manager,
|
| 14 |
+
generate_field_classification,
|
| 15 |
+
generate_completeness_profiles,
|
| 16 |
+
generate_validation_messages,
|
| 17 |
+
get_configurable_scoring_weights,
|
| 18 |
+
DynamicFieldDetector # Compatibility wrapper
|
| 19 |
+
)
|
| 20 |
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
|
|
|
| 26 |
WARNING = "warning"
|
| 27 |
INFO = "info"
|
| 28 |
|
| 29 |
+
# Registry-driven field definitions
|
| 30 |
+
try:
|
| 31 |
+
REGISTRY_MANAGER = get_field_registry_manager()
|
| 32 |
+
FIELD_CLASSIFICATION = generate_field_classification()
|
| 33 |
+
COMPLETENESS_PROFILES = generate_completeness_profiles()
|
| 34 |
+
VALIDATION_MESSAGES = generate_validation_messages()
|
| 35 |
+
SCORING_WEIGHTS = get_configurable_scoring_weights()
|
| 36 |
+
|
| 37 |
+
print(f"β
Registry-driven configuration loaded: {len(FIELD_CLASSIFICATION)} fields")
|
| 38 |
+
REGISTRY_AVAILABLE = True
|
| 39 |
+
|
| 40 |
+
except Exception as e:
|
| 41 |
+
print(f"β Failed to load registry configuration: {e}")
|
| 42 |
+
print("π Falling back to hardcoded definitions...")
|
| 43 |
+
REGISTRY_AVAILABLE = False
|
| 44 |
+
|
| 45 |
+
# Hardcoded definitions as fallback
|
| 46 |
+
FIELD_CLASSIFICATION = {
|
| 47 |
+
# Critical fields (silently aligned with SPDX mandatory fields)
|
| 48 |
+
"bomFormat": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 49 |
+
"specVersion": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 50 |
+
"serialNumber": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 51 |
+
"version": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 52 |
+
"name": {"tier": "critical", "weight": 4, "category": "component_basic"},
|
| 53 |
+
"downloadLocation": {"tier": "critical", "weight": 4, "category": "external_references"},
|
| 54 |
+
"primaryPurpose": {"tier": "critical", "weight": 3, "category": "metadata"},
|
| 55 |
+
"suppliedBy": {"tier": "critical", "weight": 4, "category": "metadata"},
|
| 56 |
+
|
| 57 |
+
# Important fields (aligned with key SPDX optional fields)
|
| 58 |
+
"type": {"tier": "important", "weight": 2, "category": "component_basic"},
|
| 59 |
+
"purl": {"tier": "important", "weight": 4, "category": "component_basic"},
|
| 60 |
+
"description": {"tier": "important", "weight": 4, "category": "component_basic"},
|
| 61 |
+
"licenses": {"tier": "important", "weight": 4, "category": "component_basic"},
|
| 62 |
+
"energyConsumption": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 63 |
+
"hyperparameter": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 64 |
+
"limitation": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 65 |
+
"safetyRiskAssessment": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 66 |
+
"typeOfModel": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 67 |
+
|
| 68 |
+
# Supplementary fields (aligned with remaining SPDX optional fields)
|
| 69 |
+
"modelExplainability": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 70 |
+
"standardCompliance": {"tier": "supplementary", "weight": 2, "category": "metadata"},
|
| 71 |
+
"domain": {"tier": "supplementary", "weight": 2, "category": "metadata"},
|
| 72 |
+
"energyQuantity": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 73 |
+
"energyUnit": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 74 |
+
"informationAboutTraining": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 75 |
+
"informationAboutApplication": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 76 |
+
"metric": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 77 |
+
"metricDecisionThreshold": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 78 |
+
"modelDataPreprocessing": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 79 |
+
"autonomyType": {"tier": "supplementary", "weight": 1, "category": "metadata"},
|
| 80 |
+
"useSensitivePersonalInformation": {"tier": "supplementary", "weight": 2, "category": "component_model_card"}
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
# Completeness profiles (silently aligned with SPDX requirements)
|
| 84 |
+
COMPLETENESS_PROFILES = {
|
| 85 |
+
"basic": {
|
| 86 |
+
"description": "Minimal fields required for identification",
|
| 87 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
|
| 88 |
+
"minimum_score": 40
|
| 89 |
+
},
|
| 90 |
+
"standard": {
|
| 91 |
+
"description": "Comprehensive fields for proper documentation",
|
| 92 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 93 |
+
"downloadLocation", "primaryPurpose", "suppliedBy"],
|
| 94 |
+
"minimum_score": 70
|
| 95 |
+
},
|
| 96 |
+
"advanced": {
|
| 97 |
+
"description": "Extensive documentation for maximum transparency",
|
| 98 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 99 |
+
"downloadLocation", "primaryPurpose", "suppliedBy",
|
| 100 |
+
"type", "purl", "description", "licenses", "hyperparameter", "limitation",
|
| 101 |
+
"energyConsumption", "safetyRiskAssessment", "typeOfModel"],
|
| 102 |
+
"minimum_score": 85
|
| 103 |
+
}
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
# Validation messages framed as best practices
|
| 107 |
+
VALIDATION_MESSAGES = {
|
| 108 |
+
"name": {
|
| 109 |
+
"missing": "Missing critical field: name - essential for model identification",
|
| 110 |
+
"recommendation": "Add a descriptive name for the model"
|
| 111 |
+
},
|
| 112 |
+
"downloadLocation": {
|
| 113 |
+
"missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
|
| 114 |
+
"recommendation": "Add information about where the model can be downloaded"
|
| 115 |
+
},
|
| 116 |
+
"primaryPurpose": {
|
| 117 |
+
"missing": "Missing critical field: primaryPurpose - important for understanding model intent",
|
| 118 |
+
"recommendation": "Add information about the primary purpose of this model"
|
| 119 |
+
},
|
| 120 |
+
"suppliedBy": {
|
| 121 |
+
"missing": "Missing critical field: suppliedBy - needed for provenance tracking",
|
| 122 |
+
"recommendation": "Add information about who supplied this model"
|
| 123 |
+
},
|
| 124 |
+
"energyConsumption": {
|
| 125 |
+
"missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
|
| 126 |
+
"recommendation": "Consider documenting energy consumption metrics for better transparency"
|
| 127 |
+
},
|
| 128 |
+
"hyperparameter": {
|
| 129 |
+
"missing": "Missing important field: hyperparameter - valuable for reproducibility",
|
| 130 |
+
"recommendation": "Document key hyperparameters used in training"
|
| 131 |
+
},
|
| 132 |
+
"limitation": {
|
| 133 |
+
"missing": "Missing important field: limitation - important for responsible use",
|
| 134 |
+
"recommendation": "Document known limitations of the model to guide appropriate usage"
|
| 135 |
+
}
|
| 136 |
}
|
| 137 |
+
|
| 138 |
+
SCORING_WEIGHTS = {
|
| 139 |
+
"tier_weights": {"critical": 3, "important": 2, "supplementary": 1},
|
| 140 |
+
"category_weights": {
|
| 141 |
+
"required_fields": 20, "metadata": 20, "component_basic": 20,
|
| 142 |
+
"component_model_card": 30, "external_references": 10
|
| 143 |
+
},
|
| 144 |
+
"algorithm_config": {"type": "weighted_sum", "max_score": 100}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
}
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
def setup_logging(level=logging.INFO):
|
|
|
|
| 240 |
Returns:
|
| 241 |
True if the field is present, False otherwise
|
| 242 |
"""
|
|
|
|
| 243 |
if field in aibom:
|
| 244 |
return True
|
|
|
|
|
|
|
| 245 |
if "metadata" in aibom:
|
| 246 |
metadata = aibom["metadata"]
|
| 247 |
if field in metadata:
|
| 248 |
return True
|
|
|
|
|
|
|
| 249 |
if "properties" in metadata:
|
| 250 |
for prop in metadata["properties"]:
|
| 251 |
+
prop_name = prop.get("name", "")
|
| 252 |
+
if prop_name in {field, f"spdx:{field}"}:
|
| 253 |
return True
|
|
|
|
|
|
|
| 254 |
if "components" in aibom and aibom["components"]:
|
| 255 |
+
component = aibom["components"][0]
|
|
|
|
| 256 |
if field in component:
|
| 257 |
return True
|
|
|
|
|
|
|
| 258 |
if "properties" in component:
|
| 259 |
for prop in component["properties"]:
|
| 260 |
+
prop_name = prop.get("name", "")
|
| 261 |
+
if prop_name in {field, f"spdx:{field}"}:
|
| 262 |
return True
|
|
|
|
|
|
|
| 263 |
if "modelCard" in component:
|
| 264 |
model_card = component["modelCard"]
|
|
|
|
| 265 |
if field in model_card:
|
| 266 |
return True
|
| 267 |
+
if "modelParameters" in model_card and field in model_card["modelParameters"]:
|
| 268 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
if "considerations" in model_card:
|
| 270 |
+
considerations = model_card["considerations"]
|
| 271 |
+
field_mappings = {
|
| 272 |
+
"limitation": ["technicalLimitations", "limitations"],
|
| 273 |
+
"safetyRiskAssessment": ["ethicalConsiderations", "safetyRiskAssessment"],
|
| 274 |
+
"energyConsumption": ["environmentalConsiderations", "energyConsumption"]
|
| 275 |
+
}
|
| 276 |
+
if field in field_mappings:
|
| 277 |
+
for section in field_mappings[field]:
|
| 278 |
+
if section in considerations and considerations[section]:
|
|
|
|
|
|
|
| 279 |
return True
|
| 280 |
+
if field in considerations:
|
| 281 |
+
return True
|
| 282 |
if field == "downloadLocation" and "externalReferences" in aibom:
|
| 283 |
for ref in aibom["externalReferences"]:
|
| 284 |
+
if ref.get("type") == "distribution" and ref.get("url"):
|
| 285 |
return True
|
|
|
|
| 286 |
return False
|
| 287 |
|
| 288 |
|
| 289 |
+
|
| 290 |
def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
|
| 291 |
"""
|
| 292 |
Determine which completeness profile the AIBOM satisfies.
|
|
|
|
| 844 |
|
| 845 |
return summary
|
| 846 |
|
| 847 |
+
def check_field_with_enhanced_results(aibom: Dict[str, Any], field: str, extraction_results: Optional[Dict[str, Any]] = None) -> bool:
|
| 848 |
+
"""
|
| 849 |
+
Enhanced field detection using consolidated field registry manager.
|
| 850 |
+
|
| 851 |
+
Args:
|
| 852 |
+
aibom: The AIBOM to check
|
| 853 |
+
field: The field name to check (must match field registry)
|
| 854 |
+
extraction_results: Enhanced extraction results with confidence levels
|
| 855 |
+
|
| 856 |
+
Returns:
|
| 857 |
+
True if the field is present and should count toward score, False otherwise
|
| 858 |
+
"""
|
| 859 |
+
try:
|
| 860 |
+
# Initialize dynamic field detector (cached)
|
| 861 |
+
if not hasattr(check_field_with_enhanced_results, '_detector'):
|
| 862 |
+
try:
|
| 863 |
+
if REGISTRY_AVAILABLE:
|
| 864 |
+
# Use the consolidated registry manager
|
| 865 |
+
registry_manager = get_field_registry_manager()
|
| 866 |
+
check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
|
| 867 |
+
print(f"β
Dynamic field detector initialized with registry manager")
|
| 868 |
+
else:
|
| 869 |
+
# Create registry manager from path
|
| 870 |
+
from field_registry_manager import FieldRegistryManager
|
| 871 |
+
registry_path = os.path.join(current_dir, "field_registry.json")
|
| 872 |
+
registry_manager = FieldRegistryManager(registry_path)
|
| 873 |
+
check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
|
| 874 |
+
print(f"β
Dynamic field detector initialized with fallback registry manager")
|
| 875 |
+
|
| 876 |
+
except Exception as e:
|
| 877 |
+
print(f"β Failed to initialize dynamic field detector: {e}")
|
| 878 |
+
# Final fallback
|
| 879 |
+
import os
|
| 880 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 881 |
+
registry_path = os.path.join(current_dir, "field_registry.json")
|
| 882 |
+
try:
|
| 883 |
+
check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_path)
|
| 884 |
+
print(f"π Dynamic field detector initialized with emergency fallback")
|
| 885 |
+
except Exception as final_error:
|
| 886 |
+
print(f"β Complete failure to initialize dynamic field detector: {final_error}")
|
| 887 |
+
check_field_with_enhanced_results._detector = None
|
| 888 |
+
|
| 889 |
+
detector = check_field_with_enhanced_results._detector
|
| 890 |
+
|
| 891 |
+
if detector is None:
|
| 892 |
+
print(f"β οΈ No detector available, using fallback for {field}")
|
| 893 |
+
return check_field_in_aibom(aibom, field)
|
| 894 |
+
|
| 895 |
+
# First, try dynamic detection from AIBOM structure using ENHANCED REGISTRY FORMAT
|
| 896 |
+
field_found_in_registry = False
|
| 897 |
+
|
| 898 |
+
# Use the enhanced registry structure (registry['fields'][field_name])
|
| 899 |
+
fields = detector.registry.get('fields', {})
|
| 900 |
+
if field in fields:
|
| 901 |
+
field_found_in_registry = True
|
| 902 |
+
field_config = fields[field]
|
| 903 |
+
field_path = field_config.get('jsonpath', '')
|
| 904 |
+
|
| 905 |
+
if field_path:
|
| 906 |
+
# Use dynamic detection
|
| 907 |
+
is_present, value = detector.detect_field_presence(aibom, field_path)
|
| 908 |
+
|
| 909 |
+
if is_present:
|
| 910 |
+
print(f"β
DYNAMIC: Found {field} = {value}")
|
| 911 |
+
return True
|
| 912 |
+
else:
|
| 913 |
+
print(f"β DYNAMIC: Missing {field} at {field_path}")
|
| 914 |
+
else:
|
| 915 |
+
print(f"β οΈ Field '{field}' has no jsonpath defined in registry")
|
| 916 |
+
|
| 917 |
+
# If field not in registry, log warning but continue
|
| 918 |
+
if not field_found_in_registry:
|
| 919 |
+
print(f"β οΈ WARNING: Field '{field}' not found in field registry")
|
| 920 |
+
|
| 921 |
+
# Second, check extraction results (existing logic)
|
| 922 |
+
if extraction_results and field in extraction_results:
|
| 923 |
+
extraction_result = extraction_results[field]
|
| 924 |
+
|
| 925 |
+
# Check if this field has actual extracted data (not just placeholder)
|
| 926 |
+
if hasattr(extraction_result, 'confidence'):
|
| 927 |
+
# Don't count fields with 'none' confidence (placeholders like NOASSERTION)
|
| 928 |
+
if extraction_result.confidence.value == 'none':
|
| 929 |
+
print(f"β EXTRACTION: {field} has 'none' confidence")
|
| 930 |
+
return False
|
| 931 |
+
# Count fields with medium or high confidence
|
| 932 |
+
is_confident = extraction_result.confidence.value in ['medium', 'high']
|
| 933 |
+
print(f"{'β
' if is_confident else 'β'} EXTRACTION: {field} confidence = {extraction_result.confidence.value}")
|
| 934 |
+
return is_confident
|
| 935 |
+
elif hasattr(extraction_result, 'value'):
|
| 936 |
+
# For simple extraction results, check if value is meaningful
|
| 937 |
+
value = extraction_result.value
|
| 938 |
+
if value in ['NOASSERTION', 'NOT_FOUND', None, '']:
|
| 939 |
+
print(f"β EXTRACTION: {field} has placeholder value: {value}")
|
| 940 |
+
return False
|
| 941 |
+
print(f"β
EXTRACTION: {field} = {value}")
|
| 942 |
+
return True
|
| 943 |
+
|
| 944 |
+
# Third, fallback to original AIBOM detection
|
| 945 |
+
print(f"π FALLBACK: Using original detection for {field}")
|
| 946 |
+
return check_field_in_aibom(aibom, field)
|
| 947 |
+
|
| 948 |
+
except Exception as e:
|
| 949 |
+
print(f"β Error in enhanced field detection for {field}: {e}")
|
| 950 |
+
return check_field_in_aibom(aibom, field)
|
| 951 |
|
| 952 |
+
|
| 953 |
+
def calculate_industry_neutral_score(aibom: Dict[str, Any], extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 954 |
"""
|
| 955 |
Calculate completeness score using industry best practices with proper normalization and penalties.
|
| 956 |
|
|
|
|
| 989 |
# Count total fields in this category
|
| 990 |
fields_by_category[category]["total"] += 1
|
| 991 |
|
| 992 |
+
# Enhanced field detection using extraction results
|
| 993 |
+
is_present = check_field_with_enhanced_results(aibom, field, extraction_results)
|
| 994 |
|
| 995 |
if is_present:
|
| 996 |
fields_by_category[category]["present"] += 1
|
|
|
|
| 1012 |
category_scores[category] = round(raw_score, 1)
|
| 1013 |
else:
|
| 1014 |
category_scores[category] = 0.0
|
| 1015 |
+
|
| 1016 |
+
# Log field extraction summary
|
| 1017 |
+
total_fields = sum(counts["total"] for counts in fields_by_category.values())
|
| 1018 |
+
total_present = sum(counts["present"] for counts in fields_by_category.values())
|
| 1019 |
+
|
| 1020 |
+
print(f"π SCORING SUMMARY:")
|
| 1021 |
+
print(f" Total fields evaluated: {total_fields}")
|
| 1022 |
+
print(f" Fields successfully extracted: {total_present}")
|
| 1023 |
+
print(f" Extraction success rate: {round((total_present/total_fields)*100, 1)}%")
|
| 1024 |
+
print(f" Category breakdown:")
|
| 1025 |
+
for category, counts in fields_by_category.items():
|
| 1026 |
+
percentage = round((counts["present"]/counts["total"])*100, 1) if counts["total"] > 0 else 0
|
| 1027 |
+
print(f" {category}: {counts['present']}/{counts['total']} ({percentage}%)")
|
| 1028 |
|
| 1029 |
# Calculate subtotal (sum of rounded category scores)
|
| 1030 |
subtotal_score = sum(category_scores.values())
|
|
|
|
| 1160 |
return result
|
| 1161 |
|
| 1162 |
|
| 1163 |
+
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True, extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 1164 |
"""
|
| 1165 |
Calculate completeness score for an AIBOM and optionally validate against AI requirements.
|
| 1166 |
Enhanced with industry best practices scoring.
|
|
|
|
| 1173 |
Returns:
|
| 1174 |
Dictionary containing score and validation results
|
| 1175 |
"""
|
| 1176 |
+
print(f"π DEBUG: use_best_practices={use_best_practices}")
|
| 1177 |
+
print(f"π DEBUG: extraction_results is None: {extraction_results is None}")
|
| 1178 |
+
print(f"π DEBUG: extraction_results keys: {list(extraction_results.keys()) if extraction_results else 'None'}")
|
| 1179 |
+
|
| 1180 |
+
if use_best_practices:
|
| 1181 |
+
print("π DEBUG: Calling calculate_industry_neutral_score")
|
| 1182 |
+
result = calculate_industry_neutral_score(aibom, extraction_results)
|
| 1183 |
# If using best practices scoring, use the enhanced industry-neutral approach
|
| 1184 |
if use_best_practices:
|
| 1185 |
+
result = calculate_industry_neutral_score(aibom, extraction_results)
|
| 1186 |
|
| 1187 |
# Add validation if requested
|
| 1188 |
if validate:
|
|
|
|
| 1659 |
summary += f"\nCompleteness Profile: {profile['name']}\n"
|
| 1660 |
summary += f"Description: {profile['description']}\n"
|
| 1661 |
|
| 1662 |
+
return summary
|
| 1663 |
+
|
| 1664 |
+
def test_consolidated_integration():
|
| 1665 |
+
"""Test that consolidated field registry manager integration is working"""
|
| 1666 |
+
try:
|
| 1667 |
+
print("\nπ§ͺ Testing Consolidated Integration...")
|
| 1668 |
+
|
| 1669 |
+
# Test registry availability
|
| 1670 |
+
if REGISTRY_AVAILABLE:
|
| 1671 |
+
print("β
Consolidated registry manager available")
|
| 1672 |
+
|
| 1673 |
+
# Test registry manager
|
| 1674 |
+
manager = get_field_registry_manager()
|
| 1675 |
+
print(f"β
Registry manager initialized: {manager.registry_path}")
|
| 1676 |
+
|
| 1677 |
+
# Test field classification generation
|
| 1678 |
+
field_count = len(FIELD_CLASSIFICATION)
|
| 1679 |
+
print(f"β
FIELD_CLASSIFICATION loaded: {field_count} fields")
|
| 1680 |
+
|
| 1681 |
+
# Test completeness profiles
|
| 1682 |
+
profile_count = len(COMPLETENESS_PROFILES)
|
| 1683 |
+
print(f"β
COMPLETENESS_PROFILES loaded: {profile_count} profiles")
|
| 1684 |
+
|
| 1685 |
+
# Test validation messages
|
| 1686 |
+
message_count = len(VALIDATION_MESSAGES)
|
| 1687 |
+
print(f"β
VALIDATION_MESSAGES loaded: {message_count} messages")
|
| 1688 |
+
|
| 1689 |
+
# Test scoring weights
|
| 1690 |
+
tier_weights = SCORING_WEIGHTS.get("tier_weights", {})
|
| 1691 |
+
category_weights = SCORING_WEIGHTS.get("category_weights", {})
|
| 1692 |
+
print(f"β
SCORING_WEIGHTS loaded: {len(tier_weights)} tiers, {len(category_weights)} categories")
|
| 1693 |
+
|
| 1694 |
+
else:
|
| 1695 |
+
print("β οΈ Consolidated registry manager not available, using hardcoded definitions")
|
| 1696 |
+
|
| 1697 |
+
# Test dynamic field detector (DynamicFieldDetector)
|
| 1698 |
+
if hasattr(check_field_with_enhanced_results, '_detector') and check_field_with_enhanced_results._detector:
|
| 1699 |
+
print(f"β
Dynamic field detector ready")
|
| 1700 |
+
else:
|
| 1701 |
+
print(f"β οΈ Dynamic field detector not initialized")
|
| 1702 |
+
|
| 1703 |
+
# Test field lookup
|
| 1704 |
+
test_fields = ["bomFormat", "primaryPurpose", "energyConsumption"]
|
| 1705 |
+
for field in test_fields:
|
| 1706 |
+
if field in FIELD_CLASSIFICATION:
|
| 1707 |
+
field_info = FIELD_CLASSIFICATION[field]
|
| 1708 |
+
print(f"β
Field '{field}': tier={field_info['tier']}, category={field_info['category']}")
|
| 1709 |
+
else:
|
| 1710 |
+
print(f"β Field '{field}' not found in FIELD_CLASSIFICATION")
|
| 1711 |
+
|
| 1712 |
+
print("π Consolidated integration test completed!")
|
| 1713 |
+
return True
|
| 1714 |
+
|
| 1715 |
+
except Exception as e:
|
| 1716 |
+
print(f"β Consolidated integration test failed: {e}")
|
| 1717 |
+
import traceback
|
| 1718 |
+
traceback.print_exc()
|
| 1719 |
+
return False
|
| 1720 |
+
|
| 1721 |
+
# Uncomment this line to run the test automatically when utils.py is imported
|
| 1722 |
+
test_consolidated_integration()
|