Upload 5 files
Browse files- src/aibom-generator/enhanced_extractor.py +876 -0
- src/aibom-generator/field_registry.json +737 -0
- src/aibom-generator/field_registry_manager.py +648 -0
- src/aibom-generator/generator.py +442 -35
- src/aibom-generator/utils.py +335 -141
src/aibom-generator/enhanced_extractor.py
ADDED
|
@@ -0,0 +1,876 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AI SBOM Generator
|
| 4 |
+
|
| 5 |
+
This module provides a fully configurable enhanced data extraction system that
|
| 6 |
+
automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
|
| 7 |
+
It includes comprehensive logging, fallback mechanisms, and confidence tracking.
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Automatically discovers all fields from the registry (field_registry.json)
|
| 11 |
+
- Attempts extraction for every registry field
|
| 12 |
+
- Provides detailed logging for each field attempt
|
| 13 |
+
- Graceful error handling for individual field failures
|
| 14 |
+
- Maintains backward compatibility with existing code
|
| 15 |
+
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
import logging
|
| 20 |
+
import re
|
| 21 |
+
import requests
|
| 22 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 23 |
+
from enum import Enum
|
| 24 |
+
from dataclasses import dataclass, field
|
| 25 |
+
from datetime import datetime
|
| 26 |
+
from urllib.parse import urlparse, urljoin
|
| 27 |
+
import time
|
| 28 |
+
|
| 29 |
+
# Import existing dependencies
|
| 30 |
+
from huggingface_hub import HfApi, ModelCard, hf_hub_download
|
| 31 |
+
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
|
| 32 |
+
|
| 33 |
+
# Import field registry manager (field_registry_manager.py)
|
| 34 |
+
try:
|
| 35 |
+
from .field_registry_manager import get_field_registry_manager
|
| 36 |
+
REGISTRY_AVAILABLE = True
|
| 37 |
+
except ImportError:
|
| 38 |
+
try:
|
| 39 |
+
from field_registry_manager import get_field_registry_manager
|
| 40 |
+
REGISTRY_AVAILABLE = True
|
| 41 |
+
except ImportError:
|
| 42 |
+
REGISTRY_AVAILABLE = False
|
| 43 |
+
print("β οΈ Field registry manager not available, falling back to legacy extraction")
|
| 44 |
+
|
| 45 |
+
# Configure logging for this module
|
| 46 |
+
logger = logging.getLogger(__name__)
|
| 47 |
+
|
| 48 |
+
class DataSource(Enum):
    """Enumeration of data sources for provenance tracking"""
    HF_API = "huggingface_api"                    # returned directly by the Hugging Face API
    MODEL_CARD = "model_card_yaml"                # YAML frontmatter of the model card
    README_TEXT = "readme_text"                   # free text of the README, via regex matching
    CONFIG_FILE = "config_file"                   # config.json / tokenizer_config.json
    REPOSITORY_FILES = "repository_files"         # other files in the model repository
    EXTERNAL_REFERENCE = "external_reference"     # generated URLs (repo, file tree, datasets)
    INTELLIGENT_DEFAULT = "intelligent_default"   # inferred from the model id / conventions
    PLACEHOLDER = "placeholder"                   # fallback value; no real data was found
    REGISTRY_DRIVEN = "registry_driven"           # produced by the registry-driven pipeline
|
| 59 |
+
|
| 60 |
+
class ConfidenceLevel(Enum):
    """Confidence levels for extracted data"""
    HIGH = "high"      # Direct API data, official sources
    MEDIUM = "medium"  # Inferred from reliable patterns
    LOW = "low"        # Weak inference or pattern matching
    NONE = "none"      # Placeholder values
|
| 66 |
+
|
| 67 |
+
@dataclass
class ExtractionResult:
    """Container for extraction results with full provenance"""
    value: Any                   # the extracted value (None when extraction failed)
    source: DataSource           # where the value came from
    confidence: ConfidenceLevel  # how trustworthy the value is
    extraction_method: str       # name of the strategy that produced the value
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; switching to
    # datetime.now(timezone.utc) would also change the ISO string (adds "+00:00") —
    # confirm downstream consumers before changing.
    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    # Strategies attempted before the final one (intended for fallback/failure cases).
    fallback_chain: List[str] = field(default_factory=list)

    def __str__(self):
        return f"{self.value} (source: {self.source.value}, confidence: {self.confidence.value})"
|
| 79 |
+
|
| 80 |
+
class EnhancedExtractor:
|
| 81 |
+
"""
|
| 82 |
+
Registry-integrated enhanced extractor that automatically picks up new fields
|
| 83 |
+
from the JSON registry (field_registry.json) without requiring code changes.
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
    def __init__(self, hf_api: Optional[HfApi] = None, field_registry_manager=None):
        """
        Initialize the enhanced extractor with registry integration
        (field_registry.json via field_registry_manager.py).

        Args:
            hf_api: Optional HuggingFace API instance (will create if not provided)
            field_registry_manager: Optional registry manager instance; when omitted,
                one is obtained from get_field_registry_manager() if available
        """
        self.hf_api = hf_api or HfApi()
        # Per-field ExtractionResult provenance, reset for every extract_metadata() run.
        self.extraction_results = {}

        # Initialize registry manager (field_registry_manager.py).
        # Failures here are non-fatal: the extractor degrades to legacy mode.
        self.registry_manager = field_registry_manager
        if not self.registry_manager and REGISTRY_AVAILABLE:
            try:
                self.registry_manager = get_field_registry_manager()
                logger.info("✅ Registry manager initialized successfully")
            except Exception as e:
                logger.warning(f"⚠️ Could not initialize registry manager: {e}")
                self.registry_manager = None

        # Load registry fields; an empty dict later triggers the legacy path.
        self.registry_fields = {}
        if self.registry_manager:
            try:
                registry = self.registry_manager.registry
                self.registry_fields = registry.get('fields', {})
                logger.info(f"✅ Loaded {len(self.registry_fields)} fields from registry")
            except Exception as e:
                logger.error(f"❌ Error loading registry fields: {e}")
                self.registry_fields = {}

        # Configure logging (adds a stream handler if none is configured yet).
        self._setup_logging()

        # Compile regex patterns for text extraction (populates self.patterns).
        self._compile_patterns()

        logger.info(f"Enhanced extractor initialized (registry-driven: {bool(self.registry_fields)})")
|
| 125 |
+
|
| 126 |
+
def _setup_logging(self):
|
| 127 |
+
"""Setup logging configuration for detailed extraction tracking"""
|
| 128 |
+
# Ensure a logger that will show in HF Spaces
|
| 129 |
+
if not logger.handlers:
|
| 130 |
+
handler = logging.StreamHandler()
|
| 131 |
+
formatter = logging.Formatter(
|
| 132 |
+
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 133 |
+
)
|
| 134 |
+
handler.setFormatter(formatter)
|
| 135 |
+
logger.addHandler(handler)
|
| 136 |
+
logger.setLevel(logging.INFO)
|
| 137 |
+
|
| 138 |
+
def _compile_patterns(self):
|
| 139 |
+
"""Compile regex patterns for text extraction"""
|
| 140 |
+
self.patterns = {
|
| 141 |
+
'license': [
|
| 142 |
+
r'license[:\s]+([a-zA-Z0-9\-\.]+)',
|
| 143 |
+
r'licensed under[:\s]+([a-zA-Z0-9\-\.]+)',
|
| 144 |
+
r'released under[:\s]+([a-zA-Z0-9\-\.]+)',
|
| 145 |
+
],
|
| 146 |
+
'datasets': [
|
| 147 |
+
r'trained on[:\s]+([a-zA-Z0-9\-\_\/]+)',
|
| 148 |
+
r'dataset[:\s]+([a-zA-Z0-9\-\_\/]+)',
|
| 149 |
+
r'using[:\s]+([a-zA-Z0-9\-\_\/]+)\s+dataset',
|
| 150 |
+
],
|
| 151 |
+
'metrics': [
|
| 152 |
+
r'([a-zA-Z]+)[:\s]+([0-9\.]+)',
|
| 153 |
+
r'achieves[:\s]+([0-9\.]+)[:\s]+([a-zA-Z]+)',
|
| 154 |
+
],
|
| 155 |
+
'model_type': [
|
| 156 |
+
r'model type[:\s]+([a-zA-Z0-9\-]+)',
|
| 157 |
+
r'architecture[:\s]+([a-zA-Z0-9\-]+)',
|
| 158 |
+
],
|
| 159 |
+
'energy': [
|
| 160 |
+
r'energy[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
| 161 |
+
r'power[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
| 162 |
+
r'consumption[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
| 163 |
+
],
|
| 164 |
+
'limitations': [
|
| 165 |
+
r'limitation[s]?[:\s]+([^\.]+)',
|
| 166 |
+
r'known issue[s]?[:\s]+([^\.]+)',
|
| 167 |
+
r'constraint[s]?[:\s]+([^\.]+)',
|
| 168 |
+
],
|
| 169 |
+
'safety': [
|
| 170 |
+
r'safety[:\s]+([^\.]+)',
|
| 171 |
+
r'risk[s]?[:\s]+([^\.]+)',
|
| 172 |
+
r'bias[:\s]+([^\.]+)',
|
| 173 |
+
]
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
# Compile all patterns
|
| 177 |
+
for category, pattern_list in self.patterns.items():
|
| 178 |
+
self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
|
| 179 |
+
|
| 180 |
+
    def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Main extraction method with full registry integration.

        This method automatically discovers all fields from the registry and attempts
        to extract them without requiring code changes when new fields are added.
        Falls back to the legacy layered extraction when no registry is loaded.

        Args:
            model_id: Hugging Face model identifier
            model_info: Model information from HF API
            model_card: Model card object from HF

        Returns:
            Dictionary of extracted metadata (keys whose value is None are dropped)
        """
        logger.info(f"🔍 Starting registry-driven extraction for model: {model_id}")

        # Reset per-model provenance tracking before a new run.
        self.extraction_results = {}
        metadata = {}

        if self.registry_fields:
            # Registry-driven extraction
            logger.info(f"📋 Registry-driven mode: Attempting extraction for {len(self.registry_fields)} fields")
            metadata = self._registry_driven_extraction(model_id, model_info, model_card)
        else:
            # Fallback to legacy extraction
            logger.warning("⚠️ Registry not available, falling back to legacy extraction")
            metadata = self._legacy_extraction(model_id, model_info, model_card)

        # Log extraction summary
        self._log_extraction_summary(model_id, metadata)

        # Return metadata in the same format as original method
        # (None values indicate failed extractions and are filtered out).
        return {k: v for k, v in metadata.items() if v is not None}
|
| 215 |
+
|
| 216 |
+
    def _registry_driven_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Registry-driven extraction that automatically processes all registry fields.

        Downloads the README and config files once up front (shared context),
        then runs the per-field strategy chain for every field declared in the
        registry. Individual field failures are logged and skipped so one bad
        field never aborts the whole run.
        """
        metadata = {}

        # Prepare extraction context — fetched once, reused for every field.
        extraction_context = {
            'model_id': model_id,
            'model_info': model_info,
            'model_card': model_card,
            'readme_content': self._get_readme_content(model_card, model_id),
            'config_data': self._download_and_parse_config(model_id, "config.json"),
            'tokenizer_config': self._download_and_parse_config(model_id, "tokenizer_config.json")
        }

        # Process each field from the registry
        successful_extractions = 0
        failed_extractions = 0

        for field_name, field_config in self.registry_fields.items():
            try:
                logger.info(f"🔧 Attempting extraction for field: {field_name}")

                # Extract field using registry configuration
                extracted_value = self._extract_registry_field(field_name, field_config, extraction_context)

                if extracted_value is not None:
                    metadata[field_name] = extracted_value
                    successful_extractions += 1
                    logger.info(f"✅ Successfully extracted {field_name}: {extracted_value}")
                else:
                    failed_extractions += 1
                    logger.info(f"❌ Failed to extract {field_name}")

            except Exception as e:
                failed_extractions += 1
                logger.error(f"❌ Error extracting {field_name}: {e}")
                # Continue with other fields - individual failures don't stop the process
                continue

        logger.info(f"📊 Registry extraction complete: {successful_extractions} successful, {failed_extractions} failed")

        # Add external references (repo / file tree / commit / dataset URLs).
        metadata.update(self._generate_external_references(model_id, metadata))

        return metadata
|
| 263 |
+
|
| 264 |
+
def _extract_registry_field(self, field_name: str, field_config: Dict[str, Any], context: Dict[str, Any]) -> Any:
|
| 265 |
+
"""
|
| 266 |
+
Extract a single field based on its registry configuration.
|
| 267 |
+
|
| 268 |
+
This method uses multiple extraction strategies in order of preference:
|
| 269 |
+
1. Direct API extraction
|
| 270 |
+
2. Model card YAML extraction
|
| 271 |
+
3. Text pattern matching
|
| 272 |
+
4. Intelligent inference
|
| 273 |
+
5. Fallback values
|
| 274 |
+
"""
|
| 275 |
+
extraction_methods = []
|
| 276 |
+
|
| 277 |
+
# Strategy 1: Direct API extraction
|
| 278 |
+
api_value = self._try_api_extraction(field_name, context)
|
| 279 |
+
if api_value is not None:
|
| 280 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 281 |
+
value=api_value,
|
| 282 |
+
source=DataSource.HF_API,
|
| 283 |
+
confidence=ConfidenceLevel.HIGH,
|
| 284 |
+
extraction_method="api_direct"
|
| 285 |
+
)
|
| 286 |
+
extraction_methods.append("api_direct")
|
| 287 |
+
return api_value
|
| 288 |
+
|
| 289 |
+
# Strategy 2: Model card YAML extraction
|
| 290 |
+
yaml_value = self._try_model_card_extraction(field_name, context)
|
| 291 |
+
if yaml_value is not None:
|
| 292 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 293 |
+
value=yaml_value,
|
| 294 |
+
source=DataSource.MODEL_CARD,
|
| 295 |
+
confidence=ConfidenceLevel.HIGH,
|
| 296 |
+
extraction_method="model_card_yaml"
|
| 297 |
+
)
|
| 298 |
+
extraction_methods.append("model_card_yaml")
|
| 299 |
+
return yaml_value
|
| 300 |
+
|
| 301 |
+
# Strategy 3: Configuration file extraction
|
| 302 |
+
config_value = self._try_config_extraction(field_name, context)
|
| 303 |
+
if config_value is not None:
|
| 304 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 305 |
+
value=config_value,
|
| 306 |
+
source=DataSource.CONFIG_FILE,
|
| 307 |
+
confidence=ConfidenceLevel.HIGH,
|
| 308 |
+
extraction_method="config_file"
|
| 309 |
+
)
|
| 310 |
+
extraction_methods.append("config_file")
|
| 311 |
+
return config_value
|
| 312 |
+
|
| 313 |
+
# Strategy 4: Text pattern extraction
|
| 314 |
+
text_value = self._try_text_pattern_extraction(field_name, context)
|
| 315 |
+
if text_value is not None:
|
| 316 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 317 |
+
value=text_value,
|
| 318 |
+
source=DataSource.README_TEXT,
|
| 319 |
+
confidence=ConfidenceLevel.MEDIUM,
|
| 320 |
+
extraction_method="text_pattern"
|
| 321 |
+
)
|
| 322 |
+
extraction_methods.append("text_pattern")
|
| 323 |
+
return text_value
|
| 324 |
+
|
| 325 |
+
# Strategy 5: Intelligent inference
|
| 326 |
+
inferred_value = self._try_intelligent_inference(field_name, context)
|
| 327 |
+
if inferred_value is not None:
|
| 328 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 329 |
+
value=inferred_value,
|
| 330 |
+
source=DataSource.INTELLIGENT_DEFAULT,
|
| 331 |
+
confidence=ConfidenceLevel.MEDIUM,
|
| 332 |
+
extraction_method="intelligent_inference"
|
| 333 |
+
)
|
| 334 |
+
extraction_methods.append("intelligent_inference")
|
| 335 |
+
return inferred_value
|
| 336 |
+
|
| 337 |
+
# Strategy 6: Fallback value (if configured)
|
| 338 |
+
fallback_value = self._try_fallback_value(field_name, field_config)
|
| 339 |
+
if fallback_value is not None:
|
| 340 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 341 |
+
value=fallback_value,
|
| 342 |
+
source=DataSource.PLACEHOLDER,
|
| 343 |
+
confidence=ConfidenceLevel.NONE,
|
| 344 |
+
extraction_method="fallback_placeholder",
|
| 345 |
+
fallback_chain=extraction_methods
|
| 346 |
+
)
|
| 347 |
+
return fallback_value
|
| 348 |
+
|
| 349 |
+
# No extraction successful
|
| 350 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 351 |
+
value=None,
|
| 352 |
+
source=DataSource.PLACEHOLDER,
|
| 353 |
+
confidence=ConfidenceLevel.NONE,
|
| 354 |
+
extraction_method="extraction_failed",
|
| 355 |
+
fallback_chain=extraction_methods
|
| 356 |
+
)
|
| 357 |
+
return None
|
| 358 |
+
|
| 359 |
+
def _try_api_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 360 |
+
"""Try to extract field from HuggingFace API data"""
|
| 361 |
+
model_info = context.get('model_info')
|
| 362 |
+
if not model_info:
|
| 363 |
+
return None
|
| 364 |
+
|
| 365 |
+
# Field mapping for API extraction
|
| 366 |
+
api_mappings = {
|
| 367 |
+
'author': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
| 368 |
+
'name': lambda info: getattr(info, 'modelId', context['model_id']).split('/')[-1],
|
| 369 |
+
'tags': lambda info: getattr(info, 'tags', []),
|
| 370 |
+
'pipeline_tag': lambda info: getattr(info, 'pipeline_tag', None),
|
| 371 |
+
'downloads': lambda info: getattr(info, 'downloads', 0),
|
| 372 |
+
'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
|
| 373 |
+
'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
| 374 |
+
'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
|
| 375 |
+
'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main"
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
if field_name in api_mappings:
|
| 379 |
+
try:
|
| 380 |
+
return api_mappings[field_name](model_info)
|
| 381 |
+
except Exception as e:
|
| 382 |
+
logger.debug(f"API extraction failed for {field_name}: {e}")
|
| 383 |
+
return None
|
| 384 |
+
|
| 385 |
+
return None
|
| 386 |
+
|
| 387 |
+
def _try_model_card_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 388 |
+
"""Try to extract field from model card YAML frontmatter"""
|
| 389 |
+
model_card = context.get('model_card')
|
| 390 |
+
if not model_card or not hasattr(model_card, 'data') or not model_card.data:
|
| 391 |
+
return None
|
| 392 |
+
|
| 393 |
+
try:
|
| 394 |
+
card_data = model_card.data.to_dict() if hasattr(model_card.data, 'to_dict') else {}
|
| 395 |
+
|
| 396 |
+
# Field mapping for model card extraction
|
| 397 |
+
card_mappings = {
|
| 398 |
+
'license': 'license',
|
| 399 |
+
'language': 'language',
|
| 400 |
+
'library_name': 'library_name',
|
| 401 |
+
'base_model': 'base_model',
|
| 402 |
+
'datasets': 'datasets',
|
| 403 |
+
'description': ['model_summary', 'description'],
|
| 404 |
+
'typeOfModel': 'model_type',
|
| 405 |
+
'licenses': 'license' # Alternative mapping
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
if field_name in card_mappings:
|
| 409 |
+
mapping = card_mappings[field_name]
|
| 410 |
+
if isinstance(mapping, list):
|
| 411 |
+
# Try multiple keys
|
| 412 |
+
for key in mapping:
|
| 413 |
+
value = card_data.get(key)
|
| 414 |
+
if value:
|
| 415 |
+
return value
|
| 416 |
+
else:
|
| 417 |
+
# Single key
|
| 418 |
+
return card_data.get(mapping)
|
| 419 |
+
|
| 420 |
+
# Direct field name lookup
|
| 421 |
+
return card_data.get(field_name)
|
| 422 |
+
|
| 423 |
+
except Exception as e:
|
| 424 |
+
logger.debug(f"Model card extraction failed for {field_name}: {e}")
|
| 425 |
+
return None
|
| 426 |
+
|
| 427 |
+
def _try_config_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 428 |
+
"""Try to extract field from configuration files"""
|
| 429 |
+
config_data = context.get('config_data')
|
| 430 |
+
tokenizer_config = context.get('tokenizer_config')
|
| 431 |
+
|
| 432 |
+
# Config file mappings
|
| 433 |
+
config_mappings = {
|
| 434 |
+
'model_type': ('config_data', 'model_type'),
|
| 435 |
+
'architectures': ('config_data', 'architectures'),
|
| 436 |
+
'vocab_size': ('config_data', 'vocab_size'),
|
| 437 |
+
'tokenizer_class': ('tokenizer_config', 'tokenizer_class'),
|
| 438 |
+
'typeOfModel': ('config_data', 'model_type')
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
if field_name in config_mappings:
|
| 442 |
+
config_type, config_key = config_mappings[field_name]
|
| 443 |
+
config_source = context.get(config_type)
|
| 444 |
+
if config_source:
|
| 445 |
+
return config_source.get(config_key)
|
| 446 |
+
|
| 447 |
+
return None
|
| 448 |
+
|
| 449 |
+
def _try_text_pattern_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 450 |
+
"""Try to extract field using text pattern matching"""
|
| 451 |
+
readme_content = context.get('readme_content')
|
| 452 |
+
if not readme_content:
|
| 453 |
+
return None
|
| 454 |
+
|
| 455 |
+
# Pattern mappings for different fields
|
| 456 |
+
pattern_mappings = {
|
| 457 |
+
'license': 'license',
|
| 458 |
+
'datasets': 'datasets',
|
| 459 |
+
'energyConsumption': 'energy',
|
| 460 |
+
'limitation': 'limitations',
|
| 461 |
+
'safetyRiskAssessment': 'safety',
|
| 462 |
+
'model_type': 'model_type'
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
if field_name in pattern_mappings:
|
| 466 |
+
pattern_key = pattern_mappings[field_name]
|
| 467 |
+
if pattern_key in self.patterns:
|
| 468 |
+
matches = self._find_pattern_matches(readme_content, self.patterns[pattern_key])
|
| 469 |
+
if matches:
|
| 470 |
+
return matches[0] if len(matches) == 1 else matches
|
| 471 |
+
|
| 472 |
+
return None
|
| 473 |
+
|
| 474 |
+
def _try_intelligent_inference(self, field_name: str, context: Dict[str, Any]) -> Any:
|
| 475 |
+
"""Try to infer field value from other available data"""
|
| 476 |
+
model_id = context['model_id']
|
| 477 |
+
|
| 478 |
+
# Intelligent inference rules
|
| 479 |
+
inference_rules = {
|
| 480 |
+
'author': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
|
| 481 |
+
'suppliedBy': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
|
| 482 |
+
'name': lambda: model_id.split('/')[-1],
|
| 483 |
+
'primaryPurpose': lambda: 'text-generation', # Default for most HF models
|
| 484 |
+
'typeOfModel': lambda: 'transformer', # Default for most HF models
|
| 485 |
+
'downloadLocation': lambda: f"https://huggingface.co/{model_id}/tree/main",
|
| 486 |
+
'bomFormat': lambda: 'CycloneDX',
|
| 487 |
+
'specVersion': lambda: '1.6',
|
| 488 |
+
'serialNumber': lambda: f"urn:uuid:{model_id.replace('/', '-')}",
|
| 489 |
+
'version': lambda: '1.0.0'
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
if field_name in inference_rules:
|
| 493 |
+
try:
|
| 494 |
+
return inference_rules[field_name]()
|
| 495 |
+
except Exception as e:
|
| 496 |
+
logger.debug(f"Intelligent inference failed for {field_name}: {e}")
|
| 497 |
+
return None
|
| 498 |
+
|
| 499 |
+
return None
|
| 500 |
+
|
| 501 |
+
def _try_fallback_value(self, field_name: str, field_config: Dict[str, Any]) -> Any:
|
| 502 |
+
"""Try to get fallback value from field configuration"""
|
| 503 |
+
# Check if field config has fallback value
|
| 504 |
+
if isinstance(field_config, dict):
|
| 505 |
+
fallback = field_config.get('fallback_value')
|
| 506 |
+
if fallback:
|
| 507 |
+
return fallback
|
| 508 |
+
|
| 509 |
+
# Standard fallback values for common fields
|
| 510 |
+
standard_fallbacks = {
|
| 511 |
+
'license': 'NOASSERTION',
|
| 512 |
+
'description': 'No description available',
|
| 513 |
+
'version': '1.0.0',
|
| 514 |
+
'bomFormat': 'CycloneDX',
|
| 515 |
+
'specVersion': '1.6'
|
| 516 |
+
}
|
| 517 |
+
|
| 518 |
+
return standard_fallbacks.get(field_name)
|
| 519 |
+
|
| 520 |
+
    def _legacy_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Fallback to legacy extraction when registry is not available.
        This maintains backward compatibility.

        Runs the five historical extraction layers in order; later layers
        overwrite earlier ones on key collisions (dict.update semantics),
        and layers 4 and 5 see the metadata accumulated so far.
        """
        logger.info("🔄 Executing legacy extraction mode")
        metadata = {}

        # Execute legacy extraction layers
        metadata.update(self._layer1_structured_api(model_id, model_info, model_card))
        metadata.update(self._layer2_repository_files(model_id))
        metadata.update(self._layer3_stp_extraction(model_card, model_id))
        metadata.update(self._layer4_external_references(model_id, metadata))
        metadata.update(self._layer5_intelligent_defaults(model_id, metadata))

        return metadata
|
| 536 |
+
|
| 537 |
+
def _generate_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 538 |
+
"""Generate external references for the model"""
|
| 539 |
+
external_refs = []
|
| 540 |
+
|
| 541 |
+
# Model repository
|
| 542 |
+
repo_url = f"https://huggingface.co/{model_id}"
|
| 543 |
+
external_refs.append({
|
| 544 |
+
"type": "website",
|
| 545 |
+
"url": repo_url,
|
| 546 |
+
"comment": "Model repository"
|
| 547 |
+
})
|
| 548 |
+
|
| 549 |
+
# Model files
|
| 550 |
+
files_url = f"https://huggingface.co/{model_id}/tree/main"
|
| 551 |
+
external_refs.append({
|
| 552 |
+
"type": "distribution",
|
| 553 |
+
"url": files_url,
|
| 554 |
+
"comment": "Model files"
|
| 555 |
+
})
|
| 556 |
+
|
| 557 |
+
# Commit URL if available
|
| 558 |
+
if 'commit' in metadata:
|
| 559 |
+
commit_url = f"https://huggingface.co/{model_id}/commit/{metadata['commit']}"
|
| 560 |
+
external_refs.append({
|
| 561 |
+
"type": "vcs",
|
| 562 |
+
"url": commit_url,
|
| 563 |
+
"comment": "Specific commit"
|
| 564 |
+
})
|
| 565 |
+
|
| 566 |
+
# Dataset references
|
| 567 |
+
if 'datasets' in metadata:
|
| 568 |
+
datasets = metadata['datasets']
|
| 569 |
+
if isinstance(datasets, list):
|
| 570 |
+
for dataset in datasets:
|
| 571 |
+
if isinstance(dataset, str):
|
| 572 |
+
dataset_url = f"https://huggingface.co/datasets/{dataset}"
|
| 573 |
+
external_refs.append({
|
| 574 |
+
"type": "distribution",
|
| 575 |
+
"url": dataset_url,
|
| 576 |
+
"comment": f"Training dataset: {dataset}"
|
| 577 |
+
})
|
| 578 |
+
|
| 579 |
+
result = {'external_references': external_refs}
|
| 580 |
+
|
| 581 |
+
self.extraction_results['external_references'] = ExtractionResult(
|
| 582 |
+
value=external_refs,
|
| 583 |
+
source=DataSource.EXTERNAL_REFERENCE,
|
| 584 |
+
confidence=ConfidenceLevel.HIGH,
|
| 585 |
+
extraction_method="url_generation"
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
+
return result
|
| 589 |
+
|
| 590 |
+
# Legacy methods for backward compatibility
|
| 591 |
+
def _layer1_structured_api(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
    """Legacy Layer 1: Enhanced structured data extraction from HF API and model card."""
    logger.info("π Executing Legacy Layer 1: Enhanced Structured API Extraction")
    metadata: Dict[str, Any] = {}

    # First pass: structured fields exposed by the HF API model_info object.
    if model_info:
        try:
            # Author falls back to the namespace portion of the model id.
            owner = getattr(model_info, "author", None)
            if not owner or not owner.strip():
                segments = model_id.split("/")
                owner = segments[0] if len(segments) > 1 else "unknown"

            metadata['author'] = owner
            metadata['name'] = getattr(model_info, "modelId", model_id).split("/")[-1]
            metadata['tags'] = getattr(model_info, "tags", [])
            metadata['pipeline_tag'] = getattr(model_info, "pipeline_tag", None)
            metadata['downloads'] = getattr(model_info, "downloads", 0)

            # Abbreviate the commit SHA to the conventional 7 characters.
            sha = getattr(model_info, "sha", None)
            if sha:
                metadata['commit'] = sha[:7]

        except Exception as e:
            logger.error(f"β Legacy Layer 1: Error extracting from model_info: {e}")

    # Second pass: structured YAML front-matter of the model card, if present.
    if model_card and hasattr(model_card, "data") and model_card.data:
        try:
            card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}

            metadata['license'] = card_data.get("license")
            metadata['language'] = card_data.get("language")
            metadata['library_name'] = card_data.get("library_name")
            metadata['base_model'] = card_data.get("base_model")
            metadata['datasets'] = card_data.get("datasets")
            metadata['description'] = card_data.get("model_summary") or card_data.get("description")

        except Exception as e:
            logger.error(f"β Legacy Layer 1: Error extracting from model card: {e}")

    # Standard AI metadata defaults expected by downstream SBOM assembly.
    metadata["primaryPurpose"] = metadata.get("pipeline_tag", "text-generation")
    metadata["suppliedBy"] = metadata.get("author", "unknown")
    metadata["typeOfModel"] = "transformer"

    return metadata
|
| 640 |
+
|
| 641 |
+
def _layer2_repository_files(self, model_id: str) -> Dict[str, Any]:
    """Legacy Layer 2: Repository file analysis"""
    logger.info("π§ Executing Legacy Layer 2: Repository File Analysis")
    metadata: Dict[str, Any] = {}

    try:
        # Architecture details live in config.json.
        model_config = self._download_and_parse_config(model_id, "config.json")
        if model_config:
            metadata['model_type'] = model_config.get("model_type")
            metadata['architectures'] = model_config.get("architectures", [])
            metadata['vocab_size'] = model_config.get("vocab_size")

        # The tokenizer class comes from tokenizer_config.json.
        tok_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
        if tok_config:
            metadata['tokenizer_class'] = tok_config.get("tokenizer_class")

    except Exception as e:
        logger.warning(f"β οΈ Legacy Layer 2: Could not analyze repository files: {e}")

    return metadata
|
| 661 |
+
|
| 662 |
+
def _layer3_stp_extraction(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
    """Legacy Layer 3: Smart Text Parsing"""
    logger.info("π Executing Legacy Layer 3: Smart Text Parsing")
    metadata: Dict[str, Any] = {}

    try:
        # Parse the README free text for license/dataset/metric mentions.
        readme = self._get_readme_content(model_card, model_id)
        if readme:
            metadata.update(self._extract_from_text(readme))
    except Exception as e:
        logger.warning(f"β οΈ Legacy Layer 3: Error in Smart Text Parsing: {e}")

    return metadata
|
| 676 |
+
|
| 677 |
+
def _layer4_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Legacy Layer 4: External reference generation"""
    # Thin delegate kept for backward compatibility with the layered API.
    logger.info("π Executing Legacy Layer 4: External Reference Generation")
    return self._generate_external_references(model_id, metadata)
|
| 681 |
+
|
| 682 |
+
def _layer5_intelligent_defaults(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Legacy Layer 5: Intelligent default generation"""
    logger.info("π§ Executing Legacy Layer 5: Intelligent Default Generation")

    # Derive the author from the model-id namespace when absent or empty.
    if not metadata.get('author'):
        segments = model_id.split("/")
        metadata['author'] = segments[0] if len(segments) > 1 else "unknown"

    # SPDX convention for "no license asserted".
    if not metadata.get('license'):
        metadata['license'] = "NOASSERTION"

    return metadata
|
| 694 |
+
|
| 695 |
+
# Utility methods
|
| 696 |
+
def _download_and_parse_config(self, model_id: str, filename: str) -> Optional[Dict[str, Any]]:
    """Download and parse a JSON configuration file from the model repository.

    Args:
        model_id: Hugging Face repository identifier.
        filename: Name of the JSON file to fetch (e.g. "config.json").

    Returns:
        The parsed JSON dict, or None when the file is absent or unparseable.
    """
    try:
        file_path = hf_hub_download(repo_id=model_id, filename=filename)
        with open(file_path, 'r') as f:
            return json.load(f)
    except (RepositoryNotFoundError, EntryNotFoundError, json.JSONDecodeError) as e:
        # Expected failure modes (missing repo/file, malformed JSON): log quietly.
        # Fix: include the actual filename in the message (was the literal
        # "(unknown)"), so failures are diagnosable per file.
        logger.debug(f"Could not download/parse {filename}: {e}")
        return None
    except Exception as e:
        logger.warning(f"Unexpected error downloading {filename}: {e}")
        return None
|
| 708 |
+
|
| 709 |
+
def _get_readme_content(self, model_card: Optional[ModelCard], model_id: str) -> Optional[str]:
    """Get README content from the model card or by downloading README.md.

    Args:
        model_card: Already-loaded model card, if any.
        model_id: Hugging Face repository identifier used for the fallback download.

    Returns:
        README text, or None when neither source yields content.
    """
    try:
        # Prefer the already-loaded model card text. Fix: only return it when
        # it is actually non-empty — previously a card with a None/empty
        # .content short-circuited without ever attempting the download.
        if model_card and getattr(model_card, 'content', None):
            return model_card.content

        readme_path = hf_hub_download(repo_id=model_id, filename="README.md")
        with open(readme_path, 'r', encoding='utf-8') as f:
            return f.read()

    except Exception as e:
        logger.debug(f"Could not get README content: {e}")
        return None
|
| 722 |
+
|
| 723 |
+
def _extract_from_text(self, text: str) -> Dict[str, Any]:
    """Extract structured information (license, datasets, metrics) from free text."""
    metadata: Dict[str, Any] = {}

    # License: keep only the first candidate found.
    license_hits = self._find_pattern_matches(text, self.patterns['license'])
    if license_hits:
        metadata['license_from_text'] = license_hits[0]

    # Datasets: keep every distinct mention.
    dataset_hits = self._find_pattern_matches(text, self.patterns['datasets'])
    if dataset_hits:
        metadata['datasets_from_text'] = dataset_hits

    # Performance metrics (accuracy, f1, bleu, rouge).
    metrics = self._extract_metrics(text)
    if metrics:
        metadata['performance_metrics'] = metrics

    return metadata
|
| 743 |
+
|
| 744 |
+
def _find_pattern_matches(self, text: str, patterns: List[re.Pattern]) -> List[str]:
|
| 745 |
+
"""Find matches for a list of regex patterns in text"""
|
| 746 |
+
matches = []
|
| 747 |
+
for pattern in patterns:
|
| 748 |
+
found = pattern.findall(text)
|
| 749 |
+
matches.extend(found)
|
| 750 |
+
return list(set(matches)) # Remove duplicates
|
| 751 |
+
|
| 752 |
+
def _extract_metrics(self, text: str) -> Dict[str, float]:
|
| 753 |
+
"""Extract performance metrics from text"""
|
| 754 |
+
metrics = {}
|
| 755 |
+
|
| 756 |
+
metric_patterns = [
|
| 757 |
+
r'accuracy[:\s]+([0-9\.]+)',
|
| 758 |
+
r'f1[:\s]+([0-9\.]+)',
|
| 759 |
+
r'bleu[:\s]+([0-9\.]+)',
|
| 760 |
+
r'rouge[:\s]+([0-9\.]+)',
|
| 761 |
+
]
|
| 762 |
+
|
| 763 |
+
for pattern_str in metric_patterns:
|
| 764 |
+
pattern = re.compile(pattern_str, re.IGNORECASE)
|
| 765 |
+
matches = pattern.findall(text)
|
| 766 |
+
if matches:
|
| 767 |
+
metric_name = pattern_str.split('[')[0]
|
| 768 |
+
try:
|
| 769 |
+
metrics[metric_name] = float(matches[0])
|
| 770 |
+
except ValueError:
|
| 771 |
+
continue
|
| 772 |
+
|
| 773 |
+
return metrics
|
| 774 |
+
|
| 775 |
+
def _log_extraction_summary(self, model_id: str, metadata: Dict[str, Any]):
    """Log a human-readable summary of the extraction run (coverage, confidence, sources)."""
    separator = "=" * 60
    logger.info(separator)
    logger.info(f"π REGISTRY-DRIVEN EXTRACTION SUMMARY FOR: {model_id}")
    logger.info(separator)

    if self.registry_fields:
        logger.info(f"π Registry fields available: {len(self.registry_fields)}")
        logger.info(f"π Total fields extracted: {len(self.extraction_results)}")

        # Tally extracted fields by confidence level and by data source.
        confidence_counts: Dict[str, int] = {}
        source_counts: Dict[str, int] = {}
        for result in self.extraction_results.values():
            level = result.confidence.value
            origin = result.source.value
            confidence_counts[level] = confidence_counts.get(level, 0) + 1
            source_counts[origin] = source_counts.get(origin, 0) + 1

        logger.info("π Confidence distribution:")
        for level, tally in confidence_counts.items():
            logger.info(f" {level}: {tally} fields")

        logger.info("π Source distribution:")
        for origin, tally in source_counts.items():
            logger.info(f" {origin}: {tally} fields")

        # Percentage of registry fields that were actually extracted.
        extracted = set(self.extraction_results.keys())
        expected = set(self.registry_fields.keys())
        coverage = len(extracted & expected) / len(expected) * 100
        logger.info(f"π Registry field coverage: {coverage:.1f}%")

        # Call out any registry fields we could not extract.
        remaining = expected - extracted
        if remaining:
            logger.info(f"β Missing registry fields: {', '.join(sorted(remaining))}")
    else:
        logger.info(f"π Legacy extraction mode: {len(metadata)} fields extracted")

    logger.info(separator)
|
| 817 |
+
|
| 818 |
+
def get_extraction_results(self) -> Dict[str, ExtractionResult]:
    """Return a shallow copy of the per-field extraction results with provenance."""
    # Copy so callers cannot mutate the extractor's internal state.
    return dict(self.extraction_results)
|
| 821 |
+
|
| 822 |
+
|
| 823 |
+
# Convenience function for drop-in replacement
|
| 824 |
+
def extract_enhanced_metadata(model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard], hf_api: Optional[HfApi] = None) -> Dict[str, Any]:
    """
    Drop-in replacement function for _extract_structured_metadata with registry integration.

    This function automatically picks up new fields from the registry without code changes.

    Args:
        model_id: Hugging Face model identifier
        model_info: Model information from HF API
        model_card: Model card object from HF
        hf_api: Optional HuggingFace API instance

    Returns:
        Dictionary of extracted metadata
    """
    # One-shot convenience wrapper around the class-based API.
    return EnhancedExtractor(hf_api).extract_metadata(model_id, model_info, model_card)
|
| 841 |
+
|
| 842 |
+
|
| 843 |
+
if __name__ == "__main__":
    # Manual smoke test for the registry-integrated enhanced extractor.
    import sys

    # Allow overriding the model on the command line; default to a known model.
    test_model_id = sys.argv[1] if len(sys.argv) > 1 else "deepseek-ai/DeepSeek-R1"

    print(f"Testing registry-integrated enhanced extractor with model: {test_model_id}")

    # Initialize HF API
    hf_api = HfApi()

    try:
        # Fetch the model's API record and card, then run the extractor.
        model_info = hf_api.model_info(test_model_id)
        model_card = ModelCard.load(test_model_id)

        extractor = EnhancedExtractor(hf_api)
        metadata = extractor.extract_metadata(test_model_id, model_info, model_card)

        print(f"\nExtracted {len(metadata)} metadata fields:")
        for key, value in metadata.items():
            print(f" {key}: {value}")

        print(f"\nExtraction results with provenance:")
        for field, result in extractor.get_extraction_results().items():
            print(f" {field}: {result}")

    except Exception as e:
        print(f"Error testing extractor: {e}")
|
| 876 |
+
|
src/aibom-generator/field_registry.json
ADDED
|
@@ -0,0 +1,737 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"registry_metadata": {
|
| 3 |
+
"description": "Field registry for configurable AI SBOM generation and scoring"
|
| 4 |
+
},
|
| 5 |
+
"scoring_config": {
|
| 6 |
+
"tier_weights": {
|
| 7 |
+
"critical": 3,
|
| 8 |
+
"important": 2,
|
| 9 |
+
"supplementary": 1
|
| 10 |
+
},
|
| 11 |
+
"category_weights": {
|
| 12 |
+
"required_fields": 20,
|
| 13 |
+
"metadata": 20,
|
| 14 |
+
"component_basic": 20,
|
| 15 |
+
"component_model_card": 30,
|
| 16 |
+
"external_references": 10
|
| 17 |
+
},
|
| 18 |
+
"scoring_profiles": {
|
| 19 |
+
"basic": {
|
| 20 |
+
"description": "Minimal fields required for identification",
|
| 21 |
+
"required_categories": ["required_fields", "component_basic"],
|
| 22 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
|
| 23 |
+
"minimum_score": 40,
|
| 24 |
+
"weight_multiplier": 1.0
|
| 25 |
+
},
|
| 26 |
+
"standard": {
|
| 27 |
+
"description": "Comprehensive fields for proper documentation",
|
| 28 |
+
"required_categories": ["required_fields", "metadata", "component_basic"],
|
| 29 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy"],
|
| 30 |
+
"minimum_score": 70,
|
| 31 |
+
"weight_multiplier": 1.0
|
| 32 |
+
},
|
| 33 |
+
"advanced": {
|
| 34 |
+
"description": "Extensive documentation for maximum transparency",
|
| 35 |
+
"required_categories": ["required_fields", "metadata", "component_basic", "component_model_card", "external_references"],
|
| 36 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy", "type", "purl", "description", "licenses", "hyperparameter", "limitation", "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
|
| 37 |
+
"minimum_score": 85,
|
| 38 |
+
"weight_multiplier": 1.0
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"algorithm_config": {
|
| 42 |
+
"type": "weighted_sum",
|
| 43 |
+
"max_score": 100,
|
| 44 |
+
"normalization": "category_based",
|
| 45 |
+
"penalty_for_missing_critical": 0.5,
|
| 46 |
+
"bonus_for_complete_categories": 0.1
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"aibom_config": {
|
| 50 |
+
"structure_template": "cyclonedx_1.6",
|
| 51 |
+
"generator_info": {
|
| 52 |
+
"name": "aetheris-aibom-generator",
|
| 53 |
+
"version": "1.0",
|
| 54 |
+
"manufacturer": "Aetheris AI"
|
| 55 |
+
},
|
| 56 |
+
"generation_rules": {
|
| 57 |
+
"include_metadata_properties": true,
|
| 58 |
+
"include_model_card": true,
|
| 59 |
+
"include_external_references": true,
|
| 60 |
+
"include_dependencies": true
|
| 61 |
+
},
|
| 62 |
+
"validation_rules": {
|
| 63 |
+
"require_critical_fields": true,
|
| 64 |
+
"validate_jsonpath_expressions": true,
|
| 65 |
+
"enforce_cyclonedx_schema": true
|
| 66 |
+
}
|
| 67 |
+
},
|
| 68 |
+
"fields": {
|
| 69 |
+
"bomFormat": {
|
| 70 |
+
"tier": "critical",
|
| 71 |
+
"weight": 4.0,
|
| 72 |
+
"category": "required_fields",
|
| 73 |
+
"description": "Format identifier for the SBOM",
|
| 74 |
+
"jsonpath": "$.bomFormat",
|
| 75 |
+
"aibom_generation": {
|
| 76 |
+
"location": "$.bomFormat",
|
| 77 |
+
"rule": "always_include",
|
| 78 |
+
"source_fields": ["bomFormat"],
|
| 79 |
+
"validation": "required",
|
| 80 |
+
"data_type": "string"
|
| 81 |
+
},
|
| 82 |
+
"scoring": {
|
| 83 |
+
"points": 4.0,
|
| 84 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 85 |
+
"category_contribution": 0.2
|
| 86 |
+
},
|
| 87 |
+
"validation_message": {
|
| 88 |
+
"missing": "Missing critical field: bomFormat - essential for SBOM identification",
|
| 89 |
+
"recommendation": "Ensure bomFormat is set to 'CycloneDX'"
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"specVersion": {
|
| 93 |
+
"tier": "critical",
|
| 94 |
+
"weight": 4.0,
|
| 95 |
+
"category": "required_fields",
|
| 96 |
+
"description": "CycloneDX specification version",
|
| 97 |
+
"jsonpath": "$.specVersion",
|
| 98 |
+
"aibom_generation": {
|
| 99 |
+
"location": "$.specVersion",
|
| 100 |
+
"rule": "always_include",
|
| 101 |
+
"source_fields": ["specVersion"],
|
| 102 |
+
"validation": "required",
|
| 103 |
+
"data_type": "string"
|
| 104 |
+
},
|
| 105 |
+
"scoring": {
|
| 106 |
+
"points": 4.0,
|
| 107 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 108 |
+
"category_contribution": 0.2
|
| 109 |
+
},
|
| 110 |
+
"validation_message": {
|
| 111 |
+
"missing": "Missing critical field: specVersion - required for CycloneDX compliance",
|
| 112 |
+
"recommendation": "Set specVersion to '1.6' for CycloneDX 1.6 compliance"
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"serialNumber": {
|
| 116 |
+
"tier": "critical",
|
| 117 |
+
"weight": 4.0,
|
| 118 |
+
"category": "required_fields",
|
| 119 |
+
"description": "Unique identifier for this SBOM instance",
|
| 120 |
+
"jsonpath": "$.serialNumber",
|
| 121 |
+
"aibom_generation": {
|
| 122 |
+
"location": "$.serialNumber",
|
| 123 |
+
"rule": "always_include",
|
| 124 |
+
"source_fields": ["serialNumber"],
|
| 125 |
+
"validation": "required",
|
| 126 |
+
"data_type": "string"
|
| 127 |
+
},
|
| 128 |
+
"scoring": {
|
| 129 |
+
"points": 4.0,
|
| 130 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 131 |
+
"category_contribution": 0.2
|
| 132 |
+
},
|
| 133 |
+
"validation_message": {
|
| 134 |
+
"missing": "Missing critical field: serialNumber - unique identifier required",
|
| 135 |
+
"recommendation": "Generate a UUID for the SBOM instance"
|
| 136 |
+
}
|
| 137 |
+
},
|
| 138 |
+
"version": {
|
| 139 |
+
"tier": "critical",
|
| 140 |
+
"weight": 4.0,
|
| 141 |
+
"category": "required_fields",
|
| 142 |
+
"description": "Version of this SBOM document",
|
| 143 |
+
"jsonpath": "$.version",
|
| 144 |
+
"aibom_generation": {
|
| 145 |
+
"location": "$.version",
|
| 146 |
+
"rule": "always_include",
|
| 147 |
+
"source_fields": ["version"],
|
| 148 |
+
"validation": "required",
|
| 149 |
+
"data_type": "integer"
|
| 150 |
+
},
|
| 151 |
+
"scoring": {
|
| 152 |
+
"points": 4.0,
|
| 153 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 154 |
+
"category_contribution": 0.2
|
| 155 |
+
},
|
| 156 |
+
"validation_message": {
|
| 157 |
+
"missing": "Missing critical field: version - document version required",
|
| 158 |
+
"recommendation": "Set version to 1 for initial SBOM generation"
|
| 159 |
+
}
|
| 160 |
+
},
|
| 161 |
+
"primaryPurpose": {
|
| 162 |
+
"tier": "critical",
|
| 163 |
+
"weight": 4.0,
|
| 164 |
+
"category": "metadata",
|
| 165 |
+
"description": "Primary purpose or task of the AI model",
|
| 166 |
+
"jsonpath": "$.metadata.properties[?(@.name=='primaryPurpose')].value",
|
| 167 |
+
"aibom_generation": {
|
| 168 |
+
"location": "$.metadata.properties",
|
| 169 |
+
"rule": "include_if_available",
|
| 170 |
+
"source_fields": ["primaryPurpose", "pipeline_tag", "ai:task"],
|
| 171 |
+
"validation": "recommended",
|
| 172 |
+
"data_type": "string"
|
| 173 |
+
},
|
| 174 |
+
"scoring": {
|
| 175 |
+
"points": 4.0,
|
| 176 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 177 |
+
"category_contribution": 0.2
|
| 178 |
+
},
|
| 179 |
+
"validation_message": {
|
| 180 |
+
"missing": "Missing critical field: primaryPurpose - essential for understanding model intent",
|
| 181 |
+
"recommendation": "Add the primary task or purpose of the AI model"
|
| 182 |
+
}
|
| 183 |
+
},
|
| 184 |
+
"suppliedBy": {
|
| 185 |
+
"tier": "critical",
|
| 186 |
+
"weight": 4.0,
|
| 187 |
+
"category": "metadata",
|
| 188 |
+
"description": "Organization or individual that supplied the model",
|
| 189 |
+
"jsonpath": "$.metadata.properties[?(@.name=='suppliedBy')].value",
|
| 190 |
+
"aibom_generation": {
|
| 191 |
+
"location": "$.metadata.properties",
|
| 192 |
+
"rule": "include_if_available",
|
| 193 |
+
"source_fields": ["suppliedBy", "author", "publisher"],
|
| 194 |
+
"validation": "recommended",
|
| 195 |
+
"data_type": "string"
|
| 196 |
+
},
|
| 197 |
+
"scoring": {
|
| 198 |
+
"points": 4.0,
|
| 199 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 200 |
+
"category_contribution": 0.2
|
| 201 |
+
},
|
| 202 |
+
"validation_message": {
|
| 203 |
+
"missing": "Missing critical field: suppliedBy - supplier identification required",
|
| 204 |
+
"recommendation": "Add the organization or individual who provided the model"
|
| 205 |
+
}
|
| 206 |
+
},
|
| 207 |
+
"standardCompliance": {
|
| 208 |
+
"tier": "supplementary",
|
| 209 |
+
"weight": 1.0,
|
| 210 |
+
"category": "metadata",
|
| 211 |
+
"description": "Standards or regulations the model complies with",
|
| 212 |
+
"jsonpath": "$.metadata.properties[?(@.name=='standardCompliance')].value",
|
| 213 |
+
"aibom_generation": {
|
| 214 |
+
"location": "$.metadata.properties",
|
| 215 |
+
"rule": "include_if_available",
|
| 216 |
+
"source_fields": ["standardCompliance", "compliance"],
|
| 217 |
+
"validation": "optional",
|
| 218 |
+
"data_type": "string"
|
| 219 |
+
},
|
| 220 |
+
"scoring": {
|
| 221 |
+
"points": 1.0,
|
| 222 |
+
"required_for_profiles": ["advanced"],
|
| 223 |
+
"category_contribution": 0.05
|
| 224 |
+
},
|
| 225 |
+
"validation_message": {
|
| 226 |
+
"missing": "Missing supplementary field: standardCompliance - compliance information helpful",
|
| 227 |
+
"recommendation": "Add any relevant standards or regulations the model complies with"
|
| 228 |
+
}
|
| 229 |
+
},
|
| 230 |
+
"domain": {
|
| 231 |
+
"tier": "supplementary",
|
| 232 |
+
"weight": 1.0,
|
| 233 |
+
"category": "metadata",
|
| 234 |
+
"description": "Domain or field of application",
|
| 235 |
+
"jsonpath": "$.metadata.properties[?(@.name=='domain')].value",
|
| 236 |
+
"aibom_generation": {
|
| 237 |
+
"location": "$.metadata.properties",
|
| 238 |
+
"rule": "include_if_available",
|
| 239 |
+
"source_fields": ["domain", "field", "application_area"],
|
| 240 |
+
"validation": "optional",
|
| 241 |
+
"data_type": "string"
|
| 242 |
+
},
|
| 243 |
+
"scoring": {
|
| 244 |
+
"points": 1.0,
|
| 245 |
+
"required_for_profiles": ["advanced"],
|
| 246 |
+
"category_contribution": 0.05
|
| 247 |
+
},
|
| 248 |
+
"validation_message": {
|
| 249 |
+
"missing": "Missing supplementary field: domain - application domain helpful for context",
|
| 250 |
+
"recommendation": "Add the domain or field where this model is typically applied"
|
| 251 |
+
}
|
| 252 |
+
},
|
| 253 |
+
"autonomyType": {
|
| 254 |
+
"tier": "supplementary",
|
| 255 |
+
"weight": 1.0,
|
| 256 |
+
"category": "metadata",
|
| 257 |
+
"description": "Level of autonomy or human involvement required",
|
| 258 |
+
"jsonpath": "$.metadata.properties[?(@.name=='autonomyType')].value",
|
| 259 |
+
"aibom_generation": {
|
| 260 |
+
"location": "$.metadata.properties",
|
| 261 |
+
"rule": "include_if_available",
|
| 262 |
+
"source_fields": ["autonomyType", "autonomy_level"],
|
| 263 |
+
"validation": "optional",
|
| 264 |
+
"data_type": "string"
|
| 265 |
+
},
|
| 266 |
+
"scoring": {
|
| 267 |
+
"points": 1.0,
|
| 268 |
+
"required_for_profiles": ["advanced"],
|
| 269 |
+
"category_contribution": 0.05
|
| 270 |
+
},
|
| 271 |
+
"validation_message": {
|
| 272 |
+
"missing": "Missing supplementary field: autonomyType - autonomy level information helpful",
|
| 273 |
+
"recommendation": "Add information about the level of human oversight required"
|
| 274 |
+
}
|
| 275 |
+
},
|
| 276 |
+
"name": {
|
| 277 |
+
"tier": "critical",
|
| 278 |
+
"weight": 4.0,
|
| 279 |
+
"category": "component_basic",
|
| 280 |
+
"description": "Name of the AI model component",
|
| 281 |
+
"jsonpath": "$.components[0].name",
|
| 282 |
+
"aibom_generation": {
|
| 283 |
+
"location": "$.components[0].name",
|
| 284 |
+
"rule": "always_include",
|
| 285 |
+
"source_fields": ["name", "model_name"],
|
| 286 |
+
"validation": "required",
|
| 287 |
+
"data_type": "string"
|
| 288 |
+
},
|
| 289 |
+
"scoring": {
|
| 290 |
+
"points": 4.0,
|
| 291 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
| 292 |
+
"category_contribution": 0.2
|
| 293 |
+
},
|
| 294 |
+
"validation_message": {
|
| 295 |
+
"missing": "Missing critical field: name - essential for model identification",
|
| 296 |
+
"recommendation": "Add a descriptive name for the model"
|
| 297 |
+
}
|
| 298 |
+
},
|
| 299 |
+
"type": {
|
| 300 |
+
"tier": "important",
|
| 301 |
+
"weight": 3.0,
|
| 302 |
+
"category": "component_basic",
|
| 303 |
+
"description": "Type of component (machine-learning-model)",
|
| 304 |
+
"jsonpath": "$.components[0].type",
|
| 305 |
+
"aibom_generation": {
|
| 306 |
+
"location": "$.components[0].type",
|
| 307 |
+
"rule": "always_include",
|
| 308 |
+
"source_fields": ["type"],
|
| 309 |
+
"validation": "required",
|
| 310 |
+
"data_type": "string"
|
| 311 |
+
},
|
| 312 |
+
"scoring": {
|
| 313 |
+
"points": 3.0,
|
| 314 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 315 |
+
"category_contribution": 0.15
|
| 316 |
+
},
|
| 317 |
+
"validation_message": {
|
| 318 |
+
"missing": "Missing important field: type - component type classification needed",
|
| 319 |
+
"recommendation": "Set type to 'machine-learning-model' for AI models"
|
| 320 |
+
}
|
| 321 |
+
},
|
| 322 |
+
"purl": {
|
| 323 |
+
"tier": "important",
|
| 324 |
+
"weight": 3.0,
|
| 325 |
+
"category": "component_basic",
|
| 326 |
+
"description": "Package URL identifier",
|
| 327 |
+
"jsonpath": "$.components[0].purl",
|
| 328 |
+
"aibom_generation": {
|
| 329 |
+
"location": "$.components[0].purl",
|
| 330 |
+
"rule": "include_if_available",
|
| 331 |
+
"source_fields": ["purl", "package_url"],
|
| 332 |
+
"validation": "recommended",
|
| 333 |
+
"data_type": "string"
|
| 334 |
+
},
|
| 335 |
+
"scoring": {
|
| 336 |
+
"points": 3.0,
|
| 337 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 338 |
+
"category_contribution": 0.15
|
| 339 |
+
},
|
| 340 |
+
"validation_message": {
|
| 341 |
+
"missing": "Missing important field: purl - package URL for identification",
|
| 342 |
+
"recommendation": "Add a Package URL (PURL) for the model"
|
| 343 |
+
}
|
| 344 |
+
},
|
| 345 |
+
"description": {
|
| 346 |
+
"tier": "important",
|
| 347 |
+
"weight": 3.0,
|
| 348 |
+
"category": "component_basic",
|
| 349 |
+
"description": "Description of the AI model",
|
| 350 |
+
"jsonpath": "$.components[0].description",
|
| 351 |
+
"aibom_generation": {
|
| 352 |
+
"location": "$.components[0].description",
|
| 353 |
+
"rule": "include_if_available",
|
| 354 |
+
"source_fields": ["description", "summary"],
|
| 355 |
+
"validation": "recommended",
|
| 356 |
+
"data_type": "string"
|
| 357 |
+
},
|
| 358 |
+
"scoring": {
|
| 359 |
+
"points": 3.0,
|
| 360 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 361 |
+
"category_contribution": 0.15
|
| 362 |
+
},
|
| 363 |
+
"validation_message": {
|
| 364 |
+
"missing": "Missing important field: description - model description helpful for understanding",
|
| 365 |
+
"recommendation": "Add a clear description of what the model does"
|
| 366 |
+
}
|
| 367 |
+
},
|
| 368 |
+
"licenses": {
|
| 369 |
+
"tier": "important",
|
| 370 |
+
"weight": 3.0,
|
| 371 |
+
"category": "component_basic",
|
| 372 |
+
"description": "License information for the model",
|
| 373 |
+
"jsonpath": "$.components[0].licenses",
|
| 374 |
+
"aibom_generation": {
|
| 375 |
+
"location": "$.components[0].licenses",
|
| 376 |
+
"rule": "include_if_available",
|
| 377 |
+
"source_fields": ["licenses", "license"],
|
| 378 |
+
"validation": "recommended",
|
| 379 |
+
"data_type": "array"
|
| 380 |
+
},
|
| 381 |
+
"scoring": {
|
| 382 |
+
"points": 3.0,
|
| 383 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 384 |
+
"category_contribution": 0.15
|
| 385 |
+
},
|
| 386 |
+
"validation_message": {
|
| 387 |
+
"missing": "Missing important field: licenses - license information important for compliance",
|
| 388 |
+
"recommendation": "Add license information for the model"
|
| 389 |
+
}
|
| 390 |
+
},
|
| 391 |
+
"energyConsumption": {
|
| 392 |
+
"tier": "important",
|
| 393 |
+
"weight": 2.0,
|
| 394 |
+
"category": "component_model_card",
|
| 395 |
+
"description": "Energy consumption information",
|
| 396 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyConsumption')].value",
|
| 397 |
+
"aibom_generation": {
|
| 398 |
+
"location": "$.metadata.properties",
|
| 399 |
+
"rule": "include_if_available",
|
| 400 |
+
"source_fields": ["energyConsumption", "energy_usage"],
|
| 401 |
+
"validation": "optional",
|
| 402 |
+
"data_type": "string"
|
| 403 |
+
},
|
| 404 |
+
"scoring": {
|
| 405 |
+
"points": 2.0,
|
| 406 |
+
"required_for_profiles": ["advanced"],
|
| 407 |
+
"category_contribution": 0.067
|
| 408 |
+
},
|
| 409 |
+
"validation_message": {
|
| 410 |
+
"missing": "Missing important field: energyConsumption - energy usage information helpful for sustainability",
|
| 411 |
+
"recommendation": "Add information about the model's energy consumption"
|
| 412 |
+
}
|
| 413 |
+
},
|
| 414 |
+
"hyperparameter": {
|
| 415 |
+
"tier": "important",
|
| 416 |
+
"weight": 2.0,
|
| 417 |
+
"category": "component_model_card",
|
| 418 |
+
"description": "Key hyperparameters used in training",
|
| 419 |
+
"jsonpath": "$.metadata.properties[?(@.name=='hyperparameter')].value",
|
| 420 |
+
"aibom_generation": {
|
| 421 |
+
"location": "$.metadata.properties",
|
| 422 |
+
"rule": "include_if_available",
|
| 423 |
+
"source_fields": ["hyperparameter", "hyperparameters", "training_params"],
|
| 424 |
+
"validation": "optional",
|
| 425 |
+
"data_type": "string"
|
| 426 |
+
},
|
| 427 |
+
"scoring": {
|
| 428 |
+
"points": 2.0,
|
| 429 |
+
"required_for_profiles": ["advanced"],
|
| 430 |
+
"category_contribution": 0.067
|
| 431 |
+
},
|
| 432 |
+
"validation_message": {
|
| 433 |
+
"missing": "Missing important field: hyperparameter - training configuration helpful for reproducibility",
|
| 434 |
+
"recommendation": "Add key hyperparameters used during model training"
|
| 435 |
+
}
|
| 436 |
+
},
|
| 437 |
+
"limitation": {
|
| 438 |
+
"tier": "important",
|
| 439 |
+
"weight": 2.0,
|
| 440 |
+
"category": "component_model_card",
|
| 441 |
+
"description": "Known limitations of the model",
|
| 442 |
+
"jsonpath": "$.metadata.properties[?(@.name=='limitation')].value",
|
| 443 |
+
"aibom_generation": {
|
| 444 |
+
"location": "$.metadata.properties",
|
| 445 |
+
"rule": "include_if_available",
|
| 446 |
+
"source_fields": ["limitation", "limitations", "known_issues"],
|
| 447 |
+
"validation": "optional",
|
| 448 |
+
"data_type": "string"
|
| 449 |
+
},
|
| 450 |
+
"scoring": {
|
| 451 |
+
"points": 2.0,
|
| 452 |
+
"required_for_profiles": ["advanced"],
|
| 453 |
+
"category_contribution": 0.067
|
| 454 |
+
},
|
| 455 |
+
"validation_message": {
|
| 456 |
+
"missing": "Missing important field: limitation - known limitations important for responsible use",
|
| 457 |
+
"recommendation": "Add information about known limitations or constraints"
|
| 458 |
+
}
|
| 459 |
+
},
|
| 460 |
+
"safetyRiskAssessment": {
|
| 461 |
+
"tier": "important",
|
| 462 |
+
"weight": 2.0,
|
| 463 |
+
"category": "component_model_card",
|
| 464 |
+
"description": "Safety and risk assessment information",
|
| 465 |
+
"jsonpath": "$.metadata.properties[?(@.name=='safetyRiskAssessment')].value",
|
| 466 |
+
"aibom_generation": {
|
| 467 |
+
"location": "$.metadata.properties",
|
| 468 |
+
"rule": "include_if_available",
|
| 469 |
+
"source_fields": ["safetyRiskAssessment", "safety_assessment", "risk_analysis"],
|
| 470 |
+
"validation": "optional",
|
| 471 |
+
"data_type": "string"
|
| 472 |
+
},
|
| 473 |
+
"scoring": {
|
| 474 |
+
"points": 2.0,
|
| 475 |
+
"required_for_profiles": ["advanced"],
|
| 476 |
+
"category_contribution": 0.067
|
| 477 |
+
},
|
| 478 |
+
"validation_message": {
|
| 479 |
+
"missing": "Missing important field: safetyRiskAssessment - safety assessment important for responsible deployment",
|
| 480 |
+
"recommendation": "Add safety and risk assessment information"
|
| 481 |
+
}
|
| 482 |
+
},
|
| 483 |
+
"typeOfModel": {
|
| 484 |
+
"tier": "important",
|
| 485 |
+
"weight": 2.0,
|
| 486 |
+
"category": "component_model_card",
|
| 487 |
+
"description": "Type or architecture of the model",
|
| 488 |
+
"jsonpath": "$.metadata.properties[?(@.name=='typeOfModel')].value",
|
| 489 |
+
"aibom_generation": {
|
| 490 |
+
"location": "$.metadata.properties",
|
| 491 |
+
"rule": "include_if_available",
|
| 492 |
+
"source_fields": ["typeOfModel", "model_type", "architecture"],
|
| 493 |
+
"validation": "recommended",
|
| 494 |
+
"data_type": "string"
|
| 495 |
+
},
|
| 496 |
+
"scoring": {
|
| 497 |
+
"points": 2.0,
|
| 498 |
+
"required_for_profiles": ["advanced"],
|
| 499 |
+
"category_contribution": 0.067
|
| 500 |
+
},
|
| 501 |
+
"validation_message": {
|
| 502 |
+
"missing": "Missing important field: typeOfModel - model architecture information helpful",
|
| 503 |
+
"recommendation": "Add the type or architecture of the model (e.g., Transformer, CNN)"
|
| 504 |
+
}
|
| 505 |
+
},
|
| 506 |
+
"modelExplainability": {
|
| 507 |
+
"tier": "supplementary",
|
| 508 |
+
"weight": 1.0,
|
| 509 |
+
"category": "component_model_card",
|
| 510 |
+
"description": "Information about model explainability",
|
| 511 |
+
"jsonpath": "$.metadata.properties[?(@.name=='modelExplainability')].value",
|
| 512 |
+
"aibom_generation": {
|
| 513 |
+
"location": "$.metadata.properties",
|
| 514 |
+
"rule": "include_if_available",
|
| 515 |
+
"source_fields": ["modelExplainability", "explainability", "interpretability"],
|
| 516 |
+
"validation": "optional",
|
| 517 |
+
"data_type": "string"
|
| 518 |
+
},
|
| 519 |
+
"scoring": {
|
| 520 |
+
"points": 1.0,
|
| 521 |
+
"required_for_profiles": ["advanced"],
|
| 522 |
+
"category_contribution": 0.033
|
| 523 |
+
},
|
| 524 |
+
"validation_message": {
|
| 525 |
+
"missing": "Missing supplementary field: modelExplainability - explainability information helpful for transparency",
|
| 526 |
+
"recommendation": "Add information about model explainability or interpretability features"
|
| 527 |
+
}
|
| 528 |
+
},
|
| 529 |
+
"energyQuantity": {
|
| 530 |
+
"tier": "supplementary",
|
| 531 |
+
"weight": 1.0,
|
| 532 |
+
"category": "component_model_card",
|
| 533 |
+
"description": "Quantitative energy consumption data",
|
| 534 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyQuantity')].value",
|
| 535 |
+
"aibom_generation": {
|
| 536 |
+
"location": "$.metadata.properties",
|
| 537 |
+
"rule": "include_if_available",
|
| 538 |
+
"source_fields": ["energyQuantity", "energy_amount"],
|
| 539 |
+
"validation": "optional",
|
| 540 |
+
"data_type": "number"
|
| 541 |
+
},
|
| 542 |
+
"scoring": {
|
| 543 |
+
"points": 1.0,
|
| 544 |
+
"required_for_profiles": ["advanced"],
|
| 545 |
+
"category_contribution": 0.033
|
| 546 |
+
},
|
| 547 |
+
"validation_message": {
|
| 548 |
+
"missing": "Missing supplementary field: energyQuantity - quantitative energy data helpful for sustainability metrics",
|
| 549 |
+
"recommendation": "Add specific energy consumption quantities"
|
| 550 |
+
}
|
| 551 |
+
},
|
| 552 |
+
"energyUnit": {
|
| 553 |
+
"tier": "supplementary",
|
| 554 |
+
"weight": 1.0,
|
| 555 |
+
"category": "component_model_card",
|
| 556 |
+
"description": "Unit of measurement for energy consumption",
|
| 557 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyUnit')].value",
|
| 558 |
+
"aibom_generation": {
|
| 559 |
+
"location": "$.metadata.properties",
|
| 560 |
+
"rule": "include_if_available",
|
| 561 |
+
"source_fields": ["energyUnit", "energy_unit"],
|
| 562 |
+
"validation": "optional",
|
| 563 |
+
"data_type": "string"
|
| 564 |
+
},
|
| 565 |
+
"scoring": {
|
| 566 |
+
"points": 1.0,
|
| 567 |
+
"required_for_profiles": ["advanced"],
|
| 568 |
+
"category_contribution": 0.033
|
| 569 |
+
},
|
| 570 |
+
"validation_message": {
|
| 571 |
+
"missing": "Missing supplementary field: energyUnit - energy measurement unit helpful for standardization",
|
| 572 |
+
"recommendation": "Add the unit of measurement for energy consumption (e.g., kWh, Joules)"
|
| 573 |
+
}
|
| 574 |
+
},
|
| 575 |
+
"informationAboutTraining": {
|
| 576 |
+
"tier": "supplementary",
|
| 577 |
+
"weight": 1.0,
|
| 578 |
+
"category": "component_model_card",
|
| 579 |
+
"description": "Information about the training process",
|
| 580 |
+
"jsonpath": "$.metadata.properties[?(@.name=='informationAboutTraining')].value",
|
| 581 |
+
"aibom_generation": {
|
| 582 |
+
"location": "$.metadata.properties",
|
| 583 |
+
"rule": "include_if_available",
|
| 584 |
+
"source_fields": ["informationAboutTraining", "training_info", "training_details"],
|
| 585 |
+
"validation": "optional",
|
| 586 |
+
"data_type": "string"
|
| 587 |
+
},
|
| 588 |
+
"scoring": {
|
| 589 |
+
"points": 1.0,
|
| 590 |
+
"required_for_profiles": ["advanced"],
|
| 591 |
+
"category_contribution": 0.033
|
| 592 |
+
},
|
| 593 |
+
"validation_message": {
|
| 594 |
+
"missing": "Missing supplementary field: informationAboutTraining - training details helpful for understanding model development",
|
| 595 |
+
"recommendation": "Add information about the training process and methodology"
|
| 596 |
+
}
|
| 597 |
+
},
|
| 598 |
+
"informationAboutApplication": {
|
| 599 |
+
"tier": "supplementary",
|
| 600 |
+
"weight": 1.0,
|
| 601 |
+
"category": "component_model_card",
|
| 602 |
+
"description": "Information about intended applications",
|
| 603 |
+
"jsonpath": "$.metadata.properties[?(@.name=='informationAboutApplication')].value",
|
| 604 |
+
"aibom_generation": {
|
| 605 |
+
"location": "$.metadata.properties",
|
| 606 |
+
"rule": "include_if_available",
|
| 607 |
+
"source_fields": ["informationAboutApplication", "application_info", "intended_use"],
|
| 608 |
+
"validation": "optional",
|
| 609 |
+
"data_type": "string"
|
| 610 |
+
},
|
| 611 |
+
"scoring": {
|
| 612 |
+
"points": 1.0,
|
| 613 |
+
"required_for_profiles": ["advanced"],
|
| 614 |
+
"category_contribution": 0.033
|
| 615 |
+
},
|
| 616 |
+
"validation_message": {
|
| 617 |
+
"missing": "Missing supplementary field: informationAboutApplication - application guidance helpful for proper usage",
|
| 618 |
+
"recommendation": "Add information about intended applications and use cases"
|
| 619 |
+
}
|
| 620 |
+
},
|
| 621 |
+
"metric": {
|
| 622 |
+
"tier": "supplementary",
|
| 623 |
+
"weight": 1.0,
|
| 624 |
+
"category": "component_model_card",
|
| 625 |
+
"description": "Performance metrics and evaluation results",
|
| 626 |
+
"jsonpath": "$.metadata.properties[?(@.name=='metric')].value",
|
| 627 |
+
"aibom_generation": {
|
| 628 |
+
"location": "$.metadata.properties",
|
| 629 |
+
"rule": "include_if_available",
|
| 630 |
+
"source_fields": ["metric", "metrics", "performance"],
|
| 631 |
+
"validation": "optional",
|
| 632 |
+
"data_type": "string"
|
| 633 |
+
},
|
| 634 |
+
"scoring": {
|
| 635 |
+
"points": 1.0,
|
| 636 |
+
"required_for_profiles": ["advanced"],
|
| 637 |
+
"category_contribution": 0.033
|
| 638 |
+
},
|
| 639 |
+
"validation_message": {
|
| 640 |
+
"missing": "Missing supplementary field: metric - performance metrics helpful for evaluation",
|
| 641 |
+
"recommendation": "Add performance metrics and evaluation results"
|
| 642 |
+
}
|
| 643 |
+
},
|
| 644 |
+
"metricDecisionThreshold": {
|
| 645 |
+
"tier": "supplementary",
|
| 646 |
+
"weight": 1.0,
|
| 647 |
+
"category": "component_model_card",
|
| 648 |
+
"description": "Decision thresholds for metrics",
|
| 649 |
+
"jsonpath": "$.metadata.properties[?(@.name=='metricDecisionThreshold')].value",
|
| 650 |
+
"aibom_generation": {
|
| 651 |
+
"location": "$.metadata.properties",
|
| 652 |
+
"rule": "include_if_available",
|
| 653 |
+
"source_fields": ["metricDecisionThreshold", "decision_threshold", "threshold"],
|
| 654 |
+
"validation": "optional",
|
| 655 |
+
"data_type": "number"
|
| 656 |
+
},
|
| 657 |
+
"scoring": {
|
| 658 |
+
"points": 1.0,
|
| 659 |
+
"required_for_profiles": ["advanced"],
|
| 660 |
+
"category_contribution": 0.033
|
| 661 |
+
},
|
| 662 |
+
"validation_message": {
|
| 663 |
+
"missing": "Missing supplementary field: metricDecisionThreshold - decision thresholds helpful for operational guidance",
|
| 664 |
+
"recommendation": "Add decision thresholds for performance metrics"
|
| 665 |
+
}
|
| 666 |
+
},
|
| 667 |
+
"modelDataPreprocessing": {
|
| 668 |
+
"tier": "supplementary",
|
| 669 |
+
"weight": 1.0,
|
| 670 |
+
"category": "component_model_card",
|
| 671 |
+
"description": "Data preprocessing information",
|
| 672 |
+
"jsonpath": "$.metadata.properties[?(@.name=='modelDataPreprocessing')].value",
|
| 673 |
+
"aibom_generation": {
|
| 674 |
+
"location": "$.metadata.properties",
|
| 675 |
+
"rule": "include_if_available",
|
| 676 |
+
"source_fields": ["modelDataPreprocessing", "data_preprocessing", "preprocessing"],
|
| 677 |
+
"validation": "optional",
|
| 678 |
+
"data_type": "string"
|
| 679 |
+
},
|
| 680 |
+
"scoring": {
|
| 681 |
+
"points": 1.0,
|
| 682 |
+
"required_for_profiles": ["advanced"],
|
| 683 |
+
"category_contribution": 0.033
|
| 684 |
+
},
|
| 685 |
+
"validation_message": {
|
| 686 |
+
"missing": "Missing supplementary field: modelDataPreprocessing - preprocessing details helpful for reproducibility",
|
| 687 |
+
"recommendation": "Add information about data preprocessing steps"
|
| 688 |
+
}
|
| 689 |
+
},
|
| 690 |
+
"useSensitivePersonalInformation": {
|
| 691 |
+
"tier": "supplementary",
|
| 692 |
+
"weight": 1.0,
|
| 693 |
+
"category": "component_model_card",
|
| 694 |
+
"description": "Information about use of sensitive personal data",
|
| 695 |
+
"jsonpath": "$.metadata.properties[?(@.name=='useSensitivePersonalInformation')].value",
|
| 696 |
+
"aibom_generation": {
|
| 697 |
+
"location": "$.metadata.properties",
|
| 698 |
+
"rule": "include_if_available",
|
| 699 |
+
"source_fields": ["useSensitivePersonalInformation", "sensitive_data", "personal_data"],
|
| 700 |
+
"validation": "optional",
|
| 701 |
+
"data_type": "boolean"
|
| 702 |
+
},
|
| 703 |
+
"scoring": {
|
| 704 |
+
"points": 1.0,
|
| 705 |
+
"required_for_profiles": ["advanced"],
|
| 706 |
+
"category_contribution": 0.033
|
| 707 |
+
},
|
| 708 |
+
"validation_message": {
|
| 709 |
+
"missing": "Missing supplementary field: useSensitivePersonalInformation - privacy information important for compliance",
|
| 710 |
+
"recommendation": "Add information about use of sensitive or personal data"
|
| 711 |
+
}
|
| 712 |
+
},
|
| 713 |
+
"downloadLocation": {
|
| 714 |
+
"tier": "critical",
|
| 715 |
+
"weight": 4.0,
|
| 716 |
+
"category": "external_references",
|
| 717 |
+
"description": "Location where the model can be downloaded",
|
| 718 |
+
"jsonpath": "$.externalReferences[0].url",
|
| 719 |
+
"aibom_generation": {
|
| 720 |
+
"location": "$.externalReferences",
|
| 721 |
+
"rule": "include_if_available",
|
| 722 |
+
"source_fields": ["downloadLocation", "download_url", "repository_url"],
|
| 723 |
+
"validation": "recommended",
|
| 724 |
+
"data_type": "string"
|
| 725 |
+
},
|
| 726 |
+
"scoring": {
|
| 727 |
+
"points": 4.0,
|
| 728 |
+
"required_for_profiles": ["standard", "advanced"],
|
| 729 |
+
"category_contribution": 1.0
|
| 730 |
+
},
|
| 731 |
+
"validation_message": {
|
| 732 |
+
"missing": "Missing critical field: downloadLocation - download location essential for model access",
|
| 733 |
+
"recommendation": "Add the URL where the model can be downloaded or accessed"
|
| 734 |
+
}
|
| 735 |
+
}
|
| 736 |
+
}
|
| 737 |
+
}
|
src/aibom-generator/field_registry_manager.py
ADDED
|
@@ -0,0 +1,648 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Field Registry Manager for AI SBOM Generator
|
| 3 |
+
Combines registry loading, configuration generation, and field detection functionality
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 10 |
+
from functools import lru_cache
|
| 11 |
+
|
| 12 |
+
class FieldRegistryManager:
|
| 13 |
+
"""
|
| 14 |
+
Field registry manager that handles:
|
| 15 |
+
1. Registry loading and validation
|
| 16 |
+
2. Configuration generation for utils.py compatibility
|
| 17 |
+
3. Field detection and JSONPath parsing
|
| 18 |
+
4. AIBOM completeness analysis
|
| 19 |
+
5. Scoring calculations
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
def __init__(self, registry_path: Optional[str] = None):
|
| 23 |
+
"""
|
| 24 |
+
Initialize the field registry manager
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
registry_path: Path to the field registry JSON file. If None, auto-detects.
|
| 28 |
+
"""
|
| 29 |
+
if registry_path is None:
|
| 30 |
+
# Auto-detect registry path relative to this file
|
| 31 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 32 |
+
registry_path = os.path.join(current_dir, "field_registry.json")
|
| 33 |
+
|
| 34 |
+
self.registry_path = registry_path
|
| 35 |
+
self.registry = self._load_registry()
|
| 36 |
+
|
| 37 |
+
# Cache for performance
|
| 38 |
+
self._field_classification = None
|
| 39 |
+
self._completeness_profiles = None
|
| 40 |
+
self._validation_messages = None
|
| 41 |
+
self._scoring_weights = None
|
| 42 |
+
|
| 43 |
+
def _load_registry(self) -> Dict[str, Any]:
|
| 44 |
+
"""Load the complete field registry from JSON file"""
|
| 45 |
+
try:
|
| 46 |
+
with open(self.registry_path, 'r', encoding='utf-8') as f:
|
| 47 |
+
registry = json.load(f)
|
| 48 |
+
|
| 49 |
+
# Validate basic structure
|
| 50 |
+
required_sections = ["fields"]
|
| 51 |
+
missing_sections = [section for section in required_sections if section not in registry]
|
| 52 |
+
|
| 53 |
+
if missing_sections:
|
| 54 |
+
raise ValueError(f"Registry missing required sections: {missing_sections}")
|
| 55 |
+
|
| 56 |
+
# Validate fields structure
|
| 57 |
+
fields = registry.get('fields', {})
|
| 58 |
+
if not fields:
|
| 59 |
+
raise ValueError("Registry 'fields' section is empty")
|
| 60 |
+
|
| 61 |
+
print(f"β
Field registry loaded: {len(fields)} fields from {self.registry_path}")
|
| 62 |
+
return registry
|
| 63 |
+
|
| 64 |
+
except FileNotFoundError:
|
| 65 |
+
raise FileNotFoundError(f"Field registry not found at: {self.registry_path}")
|
| 66 |
+
except json.JSONDecodeError as e:
|
| 67 |
+
raise ValueError(f"Invalid JSON in field registry: {e}")
|
| 68 |
+
except Exception as e:
|
| 69 |
+
raise Exception(f"Failed to load field registry: {e}")
|
| 70 |
+
|
| 71 |
+
# =============================================================================
|
| 72 |
+
# CONFIGURATION GENERATION
|
| 73 |
+
# =============================================================================
|
| 74 |
+
|
| 75 |
+
@lru_cache(maxsize=1)
|
| 76 |
+
def get_scoring_config(self) -> Dict[str, Any]:
|
| 77 |
+
"""Get scoring configuration from registry"""
|
| 78 |
+
return self.registry.get('scoring_config', {})
|
| 79 |
+
|
| 80 |
+
@lru_cache(maxsize=1)
|
| 81 |
+
def get_aibom_config(self) -> Dict[str, Any]:
|
| 82 |
+
"""Get AIBOM generation configuration from registry"""
|
| 83 |
+
return self.registry.get('aibom_config', {})
|
| 84 |
+
|
| 85 |
+
@lru_cache(maxsize=1)
|
| 86 |
+
def get_field_definitions(self) -> Dict[str, Any]:
|
| 87 |
+
"""Get all field definitions from registry"""
|
| 88 |
+
return self.registry.get('fields', {})
|
| 89 |
+
|
| 90 |
+
def generate_field_classification(self) -> Dict[str, Any]:
|
| 91 |
+
"""
|
| 92 |
+
Generate FIELD_CLASSIFICATION dictionary from registry
|
| 93 |
+
"""
|
| 94 |
+
if self._field_classification is not None:
|
| 95 |
+
return self._field_classification
|
| 96 |
+
|
| 97 |
+
fields = self.get_field_definitions()
|
| 98 |
+
classification = {}
|
| 99 |
+
|
| 100 |
+
for field_name, field_config in fields.items():
|
| 101 |
+
classification[field_name] = {
|
| 102 |
+
"tier": field_config.get("tier", "supplementary"),
|
| 103 |
+
"weight": field_config.get("weight", 1),
|
| 104 |
+
"category": field_config.get("category", "unknown")
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
self._field_classification = classification
|
| 108 |
+
return classification
|
| 109 |
+
|
| 110 |
+
def generate_completeness_profiles(self) -> Dict[str, Any]:
|
| 111 |
+
"""
|
| 112 |
+
Generate COMPLETENESS_PROFILES dictionary from registry
|
| 113 |
+
"""
|
| 114 |
+
if self._completeness_profiles is not None:
|
| 115 |
+
return self._completeness_profiles
|
| 116 |
+
|
| 117 |
+
scoring_config = self.get_scoring_config()
|
| 118 |
+
profiles = scoring_config.get("scoring_profiles", {})
|
| 119 |
+
|
| 120 |
+
# Convert to utils.py format
|
| 121 |
+
completeness_profiles = {}
|
| 122 |
+
for profile_name, profile_config in profiles.items():
|
| 123 |
+
completeness_profiles[profile_name] = {
|
| 124 |
+
"description": profile_config.get("description", f"{profile_name.title()} completeness profile"),
|
| 125 |
+
"required_fields": profile_config.get("required_fields", []),
|
| 126 |
+
"minimum_score": profile_config.get("minimum_score", 50)
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
# Fallback profiles if none defined in registry
|
| 130 |
+
if not completeness_profiles:
|
| 131 |
+
completeness_profiles = {
|
| 132 |
+
"basic": {
|
| 133 |
+
"description": "Minimal fields required for identification",
|
| 134 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
|
| 135 |
+
"minimum_score": 40
|
| 136 |
+
},
|
| 137 |
+
"standard": {
|
| 138 |
+
"description": "Comprehensive fields for proper documentation",
|
| 139 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 140 |
+
"downloadLocation", "primaryPurpose", "suppliedBy"],
|
| 141 |
+
"minimum_score": 70
|
| 142 |
+
},
|
| 143 |
+
"advanced": {
|
| 144 |
+
"description": "Extensive documentation for maximum transparency",
|
| 145 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 146 |
+
"downloadLocation", "primaryPurpose", "suppliedBy",
|
| 147 |
+
"type", "purl", "description", "licenses", "hyperparameter", "limitation",
|
| 148 |
+
"energyConsumption", "safetyRiskAssessment", "typeOfModel"],
|
| 149 |
+
"minimum_score": 85
|
| 150 |
+
}
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
self._completeness_profiles = completeness_profiles
|
| 154 |
+
return completeness_profiles
|
| 155 |
+
|
| 156 |
+
def generate_validation_messages(self) -> Dict[str, Any]:
|
| 157 |
+
"""
|
| 158 |
+
Generate VALIDATION_MESSAGES dictionary from registry
|
| 159 |
+
"""
|
| 160 |
+
if self._validation_messages is not None:
|
| 161 |
+
return self._validation_messages
|
| 162 |
+
|
| 163 |
+
fields = self.get_field_definitions()
|
| 164 |
+
validation_messages = {}
|
| 165 |
+
|
| 166 |
+
for field_name, field_config in fields.items():
|
| 167 |
+
validation_msg = field_config.get("validation_message", {})
|
| 168 |
+
if validation_msg:
|
| 169 |
+
validation_messages[field_name] = {
|
| 170 |
+
"missing": validation_msg.get("missing", f"Missing field: {field_name}"),
|
| 171 |
+
"recommendation": validation_msg.get("recommendation", f"Consider adding {field_name} field")
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
self._validation_messages = validation_messages
|
| 175 |
+
return validation_messages
|
| 176 |
+
|
| 177 |
+
def get_configurable_scoring_weights(self) -> Dict[str, Any]:
|
| 178 |
+
"""Get configurable scoring weights from registry"""
|
| 179 |
+
if self._scoring_weights is not None:
|
| 180 |
+
return self._scoring_weights
|
| 181 |
+
|
| 182 |
+
scoring_config = self.get_scoring_config()
|
| 183 |
+
|
| 184 |
+
weights = {
|
| 185 |
+
"tier_weights": scoring_config.get("tier_weights", {
|
| 186 |
+
"critical": 3,
|
| 187 |
+
"important": 2,
|
| 188 |
+
"supplementary": 1
|
| 189 |
+
}),
|
| 190 |
+
"category_weights": scoring_config.get("category_weights", {
|
| 191 |
+
"required_fields": 20,
|
| 192 |
+
"metadata": 20,
|
| 193 |
+
"component_basic": 20,
|
| 194 |
+
"component_model_card": 30,
|
| 195 |
+
"external_references": 10
|
| 196 |
+
}),
|
| 197 |
+
"algorithm_config": scoring_config.get("algorithm_config", {
|
| 198 |
+
"type": "weighted_sum",
|
| 199 |
+
"max_score": 100,
|
| 200 |
+
"normalization": "category_based"
|
| 201 |
+
})
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
self._scoring_weights = weights
|
| 205 |
+
return weights
|
| 206 |
+
|
| 207 |
+
# =============================================================================
|
| 208 |
+
# FIELD DETECTION
|
| 209 |
+
# =============================================================================
|
| 210 |
+
|
| 211 |
+
def _get_nested_value(self, data: dict, path: str) -> Tuple[bool, Any]:
|
| 212 |
+
"""
|
| 213 |
+
Get value from nested dictionary using dot notation and array filters
|
| 214 |
+
Supports paths like: $.components[0].name, $.metadata.properties[?(@.name=='primaryPurpose')].value
|
| 215 |
+
"""
|
| 216 |
+
try:
|
| 217 |
+
# Remove leading $. if present
|
| 218 |
+
if path.startswith('$.'):
|
| 219 |
+
path = path[2:]
|
| 220 |
+
|
| 221 |
+
# Handle special JSONPath-like syntax for property arrays
|
| 222 |
+
if '[?(@.name==' in path:
|
| 223 |
+
return self._handle_property_array_path(data, path)
|
| 224 |
+
|
| 225 |
+
# Split path and traverse
|
| 226 |
+
parts = self._split_path(path)
|
| 227 |
+
current = data
|
| 228 |
+
|
| 229 |
+
for part in parts:
|
| 230 |
+
if '[' in part and ']' in part:
|
| 231 |
+
# Handle array access like components[0]
|
| 232 |
+
key, index_str = part.split('[')
|
| 233 |
+
index = int(index_str.rstrip(']'))
|
| 234 |
+
|
| 235 |
+
if key and key in current:
|
| 236 |
+
current = current[key]
|
| 237 |
+
|
| 238 |
+
if isinstance(current, list) and 0 <= index < len(current):
|
| 239 |
+
current = current[index]
|
| 240 |
+
else:
|
| 241 |
+
return False, None
|
| 242 |
+
else:
|
| 243 |
+
# Regular key access
|
| 244 |
+
if isinstance(current, dict) and part in current:
|
| 245 |
+
current = current[part]
|
| 246 |
+
else:
|
| 247 |
+
return False, None
|
| 248 |
+
|
| 249 |
+
# Check if value is meaningful
|
| 250 |
+
if current is not None and current != "" and current != []:
|
| 251 |
+
return True, current
|
| 252 |
+
|
| 253 |
+
return False, None
|
| 254 |
+
|
| 255 |
+
except Exception as e:
|
| 256 |
+
print(f"Error getting value at path {path}: {e}")
|
| 257 |
+
return False, None
|
| 258 |
+
|
| 259 |
+
def _handle_property_array_path(self, data: dict, path: str) -> Tuple[bool, Any]:
    """Resolve a property-array filter path.

    Handles the JSONPath-like form
    ``<base>.properties[?(@.name=='<prop>')].<key>``: looks up the
    ``properties`` list under ``<base>``, finds the entry whose ``name``
    equals ``<prop>``, and returns its ``<key>`` value when meaningful
    (not None/""/[]). Returns (False, None) in every other case.
    """
    try:
        parsed = re.match(r'(.+)\.properties\[\?\(@\.name==\'(.+)\'\)\]\.(.+)', path)
        if parsed is None:
            return False, None

        base_path = parsed.group(1)
        wanted_name = parsed.group(2)
        value_key = parsed.group(3)

        # Resolve the properties list itself before filtering it.
        found, props = self._get_nested_value(data, base_path + '.properties')
        if not (found and isinstance(props, list)):
            return False, None

        # Scan for a matching entry carrying a meaningful value.
        for entry in props:
            if not isinstance(entry, dict) or entry.get('name') != wanted_name:
                continue
            if value_key in entry:
                candidate = entry[value_key]
                if candidate is not None and candidate != "" and candidate != []:
                    return True, candidate

        return False, None

    except Exception as e:
        print(f"Error handling property array path {path}: {e}")
        return False, None
+
|
| 291 |
+
def _split_path(self, path: str) -> List[str]:
|
| 292 |
+
"""Split path into parts, handling array notation"""
|
| 293 |
+
parts = []
|
| 294 |
+
current_part = ""
|
| 295 |
+
in_brackets = False
|
| 296 |
+
|
| 297 |
+
for char in path:
|
| 298 |
+
if char == '[':
|
| 299 |
+
in_brackets = True
|
| 300 |
+
current_part += char
|
| 301 |
+
elif char == ']':
|
| 302 |
+
in_brackets = False
|
| 303 |
+
current_part += char
|
| 304 |
+
elif char == '.' and not in_brackets:
|
| 305 |
+
if current_part:
|
| 306 |
+
parts.append(current_part)
|
| 307 |
+
current_part = ""
|
| 308 |
+
else:
|
| 309 |
+
current_part += char
|
| 310 |
+
|
| 311 |
+
if current_part:
|
| 312 |
+
parts.append(current_part)
|
| 313 |
+
|
| 314 |
+
return parts
|
| 315 |
+
|
| 316 |
+
def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
    """Report whether *field_path* resolves inside *aibom*.

    Thin wrapper around the nested-value resolver; returns the same
    ``(field_exists, field_value)`` pair.
    """
    exists, value = self._get_nested_value(aibom, field_path)
    return exists, value
+
|
| 323 |
+
def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
    """
    Analyze AIBOM completeness against the enhanced field registry
    Compatible with enhanced registry structure: registry['fields'][field_name]

    Returns:
        dict with 'category_scores', 'total_score', per-category
        'field_details', and a per-category 'summary'. Prints a verbose
        progress report as a side effect.
    """
    results = {
        'category_scores': {},
        'total_score': 0,
        'field_details': {},
        'summary': {}
    }

    # Get fields from enhanced registry structure
    fields = self.get_field_definitions()
    if not fields:
        print("β No fields found in registry")
        return results

    # Get scoring configuration
    scoring_weights = self.get_configurable_scoring_weights()
    category_weights = scoring_weights.get('category_weights', {})

    # Group fields by category
    categories = {}
    for field_name, field_config in fields.items():
        category = field_config.get('category', 'unknown')
        if category not in categories:
            categories[category] = []
        categories[category].append((field_name, field_config))

    print(f"π Analyzing {len(fields)} fields across {len(categories)} categories")

    total_weighted_score = 0

    for category_name, category_fields in categories.items():
        # Categories absent from the configured weights default to 20 points.
        category_weight = category_weights.get(category_name, 20)

        present_fields = 0
        total_fields = len(category_fields)
        field_details = {}

        print(f"\nπ Category: {category_name} (weight: {category_weight})")

        for field_name, field_config in category_fields:
            field_path = field_config.get('jsonpath', '')
            tier = field_config.get('tier', 'supplementary')
            weight = field_config.get('weight', 1)

            # A field without a jsonpath can never be detected; record it
            # as absent with an explanatory error instead of skipping it.
            if not field_path:
                print(f"β οΈ Field {field_name} has no jsonpath defined")
                field_details[field_name] = {
                    'present': False,
                    'value': None,
                    'path': field_path,
                    'tier': tier,
                    'weight': weight,
                    'error': 'No jsonpath defined'
                }
                continue

            is_present, value = self.detect_field_presence(aibom, field_path)

            field_details[field_name] = {
                'present': is_present,
                'value': value,
                'path': field_path,
                'tier': tier,
                'weight': weight
            }

            if is_present:
                present_fields += 1
                print(f"β FOUND: {field_name} = {value} (tier: {tier}, weight: {weight})")
            else:
                print(f"β MISSING: {field_name} at {field_path} (tier: {tier})")

        # Calculate category score: fraction of present fields scaled by the
        # category weight. NOTE(review): the per-field `weight` is recorded
        # in field_details but not used in this aggregate — confirm intended.
        category_percentage = (present_fields / total_fields) * 100 if total_fields > 0 else 0
        category_score = (category_percentage / 100) * category_weight

        results['category_scores'][category_name] = category_score
        results['field_details'][category_name] = field_details
        results['summary'][category_name] = {
            'present': present_fields,
            'total': total_fields,
            'percentage': category_percentage,
            'weight': category_weight
        }

        total_weighted_score += category_score

        print(f"π {category_name}: {present_fields}/{total_fields} ({category_percentage:.1f}%) Γ {category_weight} = {category_score:.1f} pts")

    results['total_score'] = total_weighted_score

    print(f"\nπ― TOTAL SCORE: {total_weighted_score:.1f}")

    return results
+
|
| 422 |
+
# =============================================================================
|
| 423 |
+
# UTILITY METHODS
|
| 424 |
+
# =============================================================================
|
| 425 |
+
|
| 426 |
+
def get_field_info(self, field_name: str) -> Optional[Dict[str, Any]]:
    """Look up the full registry entry for *field_name* (None if absent)."""
    return self.get_field_definitions().get(field_name)
+
|
| 431 |
+
def get_field_jsonpath(self, field_name: str) -> Optional[str]:
    """Return the JSONPath expression configured for *field_name*.

    Yields None when the field is unknown or carries no jsonpath.
    """
    info = self.get_field_info(field_name)
    if not info:
        return None
    return info.get("jsonpath")
+
|
| 436 |
+
def get_fields_by_category(self, category: str) -> List[str]:
    """List the names of all registry fields belonging to *category*."""
    matching = []
    for name, config in self.get_field_definitions().items():
        if config.get("category") == category:
            matching.append(name)
    return matching
+
|
| 444 |
+
def get_fields_by_tier(self, tier: str) -> List[str]:
    """List the names of all registry fields assigned to *tier*."""
    definitions = self.get_field_definitions()
    return [name for name, cfg in definitions.items() if cfg.get("tier") == tier]
+
|
| 452 |
+
def validate_registry_integrity(self) -> Dict[str, Any]:
    """Validate the integrity of the loaded registry.

    Checks every field for the required properties (tier, weight,
    category, jsonpath) and tallies category/tier distributions; missing
    scoring-config weights are reported as warnings, not errors.
    Never raises: any failure is captured in the returned dict.
    """
    validation_results = {
        "valid": True,
        "errors": [],
        "warnings": [],
        "field_count": 0,
        "category_distribution": {},
        "tier_distribution": {}
    }

    try:
        fields = self.get_field_definitions()
        validation_results["field_count"] = len(fields)

        # Check category and tier distribution
        categories = {}
        tiers = {}

        for field_name, field_config in fields.items():
            # Check required field properties
            required_props = ["tier", "weight", "category", "jsonpath"]
            missing_props = [prop for prop in required_props if prop not in field_config]

            if missing_props:
                validation_results["errors"].append(
                    f"Field '{field_name}' missing properties: {missing_props}"
                )
                validation_results["valid"] = False

            # Count categories and tiers (fields missing either are
            # bucketed under "unknown")
            category = field_config.get("category", "unknown")
            tier = field_config.get("tier", "unknown")

            categories[category] = categories.get(category, 0) + 1
            tiers[tier] = tiers.get(tier, 0) + 1

        validation_results["category_distribution"] = categories
        validation_results["tier_distribution"] = tiers

        # Check scoring configuration — absence only warns, does not
        # invalidate the registry.
        scoring_config = self.get_scoring_config()
        if not scoring_config.get("tier_weights"):
            validation_results["warnings"].append("Missing tier_weights in scoring_config")

        if not scoring_config.get("category_weights"):
            validation_results["warnings"].append("Missing category_weights in scoring_config")

    except Exception as e:
        # Any unexpected failure marks the registry invalid instead of raising.
        validation_results["valid"] = False
        validation_results["errors"].append(f"Registry validation error: {e}")

    return validation_results
+
|
| 506 |
+
|
| 507 |
+
# =============================================================================
|
| 508 |
+
# GLOBAL INSTANCE AND CONVENIENCE FUNCTIONS
|
| 509 |
+
# =============================================================================
|
| 510 |
+
|
| 511 |
+
# Global registry manager instance (initialized on first import)
_registry_manager = None

def get_field_registry_manager() -> FieldRegistryManager:
    """Get the global field registry manager instance (singleton pattern).

    Lazily constructs a FieldRegistryManager on first call and reuses it
    afterwards. NOTE(review): not thread-safe — concurrent first calls
    could each construct a manager; confirm single-threaded startup.
    """
    global _registry_manager
    if _registry_manager is None:
        _registry_manager = FieldRegistryManager()
    return _registry_manager
+
|
| 521 |
+
# Convenience functions for backward compatibility with existing code
|
| 522 |
+
|
| 523 |
+
def load_field_registry() -> Dict[str, Any]:
    """Return the complete field registry via the shared singleton manager."""
    return get_field_registry_manager().registry
+
|
| 528 |
+
def generate_field_classification() -> Dict[str, Any]:
    """Build FIELD_CLASSIFICATION through the shared singleton manager."""
    return get_field_registry_manager().generate_field_classification()
+
|
| 533 |
+
def generate_completeness_profiles() -> Dict[str, Any]:
    """Build COMPLETENESS_PROFILES through the shared singleton manager."""
    return get_field_registry_manager().generate_completeness_profiles()
+
|
| 538 |
+
def generate_validation_messages() -> Dict[str, Any]:
    """Build VALIDATION_MESSAGES through the shared singleton manager."""
    return get_field_registry_manager().generate_validation_messages()
+
|
| 543 |
+
def get_configurable_scoring_weights() -> Dict[str, Any]:
    """Fetch configurable scoring weights through the shared singleton manager."""
    return get_field_registry_manager().get_configurable_scoring_weights()
+
|
| 548 |
+
# For compatibility with old DynamicFieldDetector usage
|
| 549 |
+
class DynamicFieldDetector:
    """Backward-compatibility facade over FieldRegistryManager.

    Older code instantiated ``DynamicFieldDetector`` directly; this shim
    keeps that interface alive while delegating all work to the manager.
    """

    def __init__(self, registry_path: str):
        """Create a manager for *registry_path* and expose its registry."""
        manager = FieldRegistryManager(registry_path)
        self.manager = manager
        self.registry = manager.registry

    def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
        """Delegate field-presence detection to the underlying manager."""
        return self.manager.detect_field_presence(aibom, field_path)

    def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
        """Delegate completeness analysis to the underlying manager."""
        return self.manager.analyze_aibom_completeness(aibom)
+
|
| 565 |
+
# Validation function for testing
|
| 566 |
+
def validate_registry_setup() -> bool:
    """Validate that the registry is properly set up and accessible.

    Prints a short human-readable report and returns True on success,
    False otherwise. Intended as a quick smoke test, not production use.

    NOTE(review): the status-icon characters in the print strings appear
    mojibake-mangled in this copy of the file — verify against the
    original file encoding.
    """
    try:
        manager = get_field_registry_manager()
        validation_results = manager.validate_registry_integrity()

        if validation_results["valid"]:
            print(f"β Registry validation successful")
            print(f"   Fields loaded: {validation_results['field_count']}")
            print(f"   Categories: {list(validation_results['category_distribution'].keys())}")
            print(f"   Tiers: {list(validation_results['tier_distribution'].keys())}")
            return True
        else:
            print(f"β Registry validation failed")
            for error in validation_results["errors"]:
                print(f"   Error: {error}")
            return False

    except Exception as e:
        # Setup failures (e.g. missing registry file) are reported, not raised.
        print(f"β Registry setup validation failed: {e}")
        return False
+
|
| 588 |
+
def test_field_registry_manager():
    """
    This function is temporary (or optional later on).
    It serves the purpose of validating the field registry manager after refactoring
    such as replacing old files or methods within for field detection and score calculations
    and comes handy as a debugging tool.

    Returns True when every check runs without raising, False otherwise.
    NOTE(review): status-icon characters in the print strings appear
    mojibake-mangled in this copy — verify against the original encoding.
    """
    try:
        print("π§ͺ Testing Consolidated Field Registry Manager...")

        # Test manager initialization
        manager = get_field_registry_manager()
        print(f"β Manager initialized with registry: {manager.registry_path}")

        # Test configuration generation
        field_classification = manager.generate_field_classification()
        print(f"β Generated FIELD_CLASSIFICATION with {len(field_classification)} fields")

        completeness_profiles = manager.generate_completeness_profiles()
        print(f"β Generated COMPLETENESS_PROFILES with {len(completeness_profiles)} profiles")

        validation_messages = manager.generate_validation_messages()
        print(f"β Generated VALIDATION_MESSAGES with {len(validation_messages)} messages")

        scoring_weights = manager.get_configurable_scoring_weights()
        print(f"β Generated SCORING_WEIGHTS with {len(scoring_weights)} sections")

        # Test field detection capabilities on a few representative fields
        test_fields = ['bomFormat', 'primaryPurpose', 'energyConsumption']
        for field_name in test_fields:
            field_info = manager.get_field_info(field_name)
            if field_info:
                jsonpath = field_info.get('jsonpath', 'N/A')
                category = field_info.get('category', 'N/A')
                tier = field_info.get('tier', 'N/A')
                print(f"β Field '{field_name}': {jsonpath} (category: {category}, tier: {tier})")
            else:
                print(f"β Field '{field_name}' not found in registry")

        # Test registry validation (integrity issues are reported but do
        # not fail this smoke test)
        validation_results = manager.validate_registry_integrity()
        if validation_results["valid"]:
            print("β Registry integrity validation passed")
        else:
            print("β οΈ Registry integrity validation issues found")
            for error in validation_results["errors"]:
                print(f"   Error: {error}")

        print("π Consolidated field registry manager test completed successfully!")
        return True

    except Exception as e:
        print(f"β Field registry manager test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
+
|
| 645 |
+
if __name__ == "__main__":
    # Test the consolidated manager when run directly (smoke test only;
    # exit status is not set from the boolean result)
    test_field_registry_manager()
| 648 |
+
|
src/aibom-generator/generator.py
CHANGED
|
@@ -1,13 +1,30 @@
|
|
| 1 |
import json
|
| 2 |
import uuid
|
| 3 |
import datetime
|
|
|
|
| 4 |
from typing import Dict, Optional, Any, List
|
| 5 |
|
| 6 |
-
|
| 7 |
from huggingface_hub import HfApi, ModelCard
|
|
|
|
| 8 |
from urllib.parse import urlparse
|
| 9 |
from .utils import calculate_completeness_score
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class AIBOMGenerator:
|
| 13 |
def __init__(
|
|
@@ -16,7 +33,7 @@ class AIBOMGenerator:
|
|
| 16 |
inference_model_url: Optional[str] = None,
|
| 17 |
use_inference: bool = True,
|
| 18 |
cache_dir: Optional[str] = None,
|
| 19 |
-
use_best_practices: bool = True, #
|
| 20 |
):
|
| 21 |
self.hf_api = HfApi(token=hf_token)
|
| 22 |
self.inference_model_url = inference_model_url
|
|
@@ -24,13 +41,48 @@ class AIBOMGenerator:
|
|
| 24 |
self.cache_dir = cache_dir
|
| 25 |
self.enhancement_report = None # Store enhancement report as instance variable
|
| 26 |
self.use_best_practices = use_best_practices # Store best practices flag
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def generate_aibom(
|
| 29 |
self,
|
| 30 |
model_id: str,
|
| 31 |
output_file: Optional[str] = None,
|
| 32 |
include_inference: Optional[bool] = None,
|
| 33 |
-
use_best_practices: Optional[bool] = None, #
|
| 34 |
) -> Dict[str, Any]:
|
| 35 |
try:
|
| 36 |
model_id = self._normalise_model_id(model_id)
|
|
@@ -43,12 +95,59 @@ class AIBOMGenerator:
|
|
| 43 |
|
| 44 |
# Store original metadata before any AI enhancement
|
| 45 |
original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# Create initial AIBOM with original metadata
|
| 48 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# Calculate initial score with industry-neutral approach if enabled
|
| 51 |
-
original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices)
|
|
|
|
| 52 |
|
| 53 |
# Final metadata starts with original metadata
|
| 54 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
|
@@ -74,12 +173,19 @@ class AIBOMGenerator:
|
|
| 74 |
except Exception as e:
|
| 75 |
print(f"Error during AI enhancement: {e}")
|
| 76 |
# Continue with original metadata if enhancement fails
|
| 77 |
-
|
|
|
|
| 78 |
# Create final AIBOM with potentially enhanced metadata
|
| 79 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
| 80 |
|
| 81 |
-
# Calculate final score with
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
if output_file:
|
|
@@ -98,8 +204,8 @@ class AIBOMGenerator:
|
|
| 98 |
# Return only the AIBOM to maintain compatibility with existing code
|
| 99 |
return aibom
|
| 100 |
except Exception as e:
|
| 101 |
-
print(f"Error generating
|
| 102 |
-
# Return a minimal valid
|
| 103 |
return self._create_minimal_aibom(model_id)
|
| 104 |
|
| 105 |
def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
|
|
@@ -156,7 +262,7 @@ class AIBOMGenerator:
|
|
| 156 |
print(f"Error fetching model info for {model_id}: {e}")
|
| 157 |
return {}
|
| 158 |
|
| 159 |
-
|
| 160 |
@staticmethod
|
| 161 |
def _normalise_model_id(raw_id: str) -> str:
|
| 162 |
"""
|
|
@@ -171,7 +277,7 @@ class AIBOMGenerator:
|
|
| 171 |
return "/".join(parts[:2])
|
| 172 |
return path
|
| 173 |
return raw_id
|
| 174 |
-
|
| 175 |
|
| 176 |
def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
|
| 177 |
try:
|
|
@@ -185,6 +291,12 @@ class AIBOMGenerator:
|
|
| 185 |
model_id: str,
|
| 186 |
metadata: Dict[str, Any],
|
| 187 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
# Extract owner and model name from model_id
|
| 189 |
parts = model_id.split("/")
|
| 190 |
group = parts[0] if len(parts) > 1 else ""
|
|
@@ -192,6 +304,9 @@ class AIBOMGenerator:
|
|
| 192 |
|
| 193 |
# Get version from metadata or use default
|
| 194 |
version = metadata.get("commit", "1.0")
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
aibom = {
|
| 197 |
"bomFormat": "CycloneDX",
|
|
@@ -206,7 +321,10 @@ class AIBOMGenerator:
|
|
| 206 |
"dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
|
| 207 |
}
|
| 208 |
]
|
| 209 |
-
}
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
# ALWAYS add root-level external references
|
| 212 |
aibom["externalReferences"] = [{
|
|
@@ -220,6 +338,7 @@ class AIBOMGenerator:
|
|
| 220 |
"url": metadata["commit_url"]
|
| 221 |
} )
|
| 222 |
|
|
|
|
| 223 |
return aibom
|
| 224 |
|
| 225 |
def _extract_structured_metadata(
|
|
@@ -228,6 +347,48 @@ class AIBOMGenerator:
|
|
| 228 |
model_info: Dict[str, Any],
|
| 229 |
model_card: Optional[ModelCard],
|
| 230 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
metadata = {}
|
| 232 |
|
| 233 |
if model_info:
|
|
@@ -248,7 +409,7 @@ class AIBOMGenerator:
|
|
| 248 |
"downloads": getattr(model_info, "downloads", 0),
|
| 249 |
"last_modified": getattr(model_info, "lastModified", None),
|
| 250 |
"commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
|
| 251 |
-
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None) else None,
|
| 252 |
})
|
| 253 |
except Exception as e:
|
| 254 |
print(f"Error extracting model info metadata: {e}")
|
|
@@ -290,6 +451,7 @@ class AIBOMGenerator:
|
|
| 290 |
print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
|
| 291 |
|
| 292 |
return {k: v for k, v in metadata.items() if v is not None}
|
|
|
|
| 293 |
|
| 294 |
|
| 295 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
|
|
@@ -301,6 +463,9 @@ class AIBOMGenerator:
|
|
| 301 |
|
| 302 |
|
| 303 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
| 304 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
| 305 |
|
| 306 |
# Get version from metadata or use default
|
|
@@ -358,24 +523,43 @@ class AIBOMGenerator:
|
|
| 358 |
|
| 359 |
# ALWAYS add critical fields for scoring
|
| 360 |
critical_fields = {
|
| 361 |
-
"primaryPurpose": metadata.get("primaryPurpose",
|
| 362 |
-
"suppliedBy": metadata.get("suppliedBy",
|
| 363 |
-
"typeOfModel": metadata.get("
|
| 364 |
}
|
| 365 |
-
|
| 366 |
-
# Add critical fields first
|
| 367 |
for key, value in critical_fields.items():
|
| 368 |
-
|
| 369 |
-
properties.append({"name": key, "value": str(value)})
|
| 370 |
|
| 371 |
-
# Add
|
| 372 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
for key, value in metadata.items():
|
| 374 |
-
|
|
|
|
|
|
|
| 375 |
if isinstance(value, (list, dict)):
|
| 376 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
value = json.dumps(value)
|
|
|
|
| 378 |
properties.append({"name": key, "value": str(value)})
|
|
|
|
| 379 |
|
| 380 |
# Assemble metadata section
|
| 381 |
metadata_section = {
|
|
@@ -388,6 +572,9 @@ class AIBOMGenerator:
|
|
| 388 |
return metadata_section
|
| 389 |
|
| 390 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
| 391 |
# Extract owner and model name from model_id
|
| 392 |
parts = model_id.split("/")
|
| 393 |
group = parts[0] if len(parts) > 1 else ""
|
|
@@ -412,7 +599,7 @@ class AIBOMGenerator:
|
|
| 412 |
"purl": purl
|
| 413 |
}
|
| 414 |
|
| 415 |
-
#
|
| 416 |
if metadata and "license" in metadata and metadata["license"]:
|
| 417 |
component["licenses"] = [{
|
| 418 |
"license": {
|
|
@@ -420,14 +607,48 @@ class AIBOMGenerator:
|
|
| 420 |
"url": self._get_license_url(metadata["license"])
|
| 421 |
}
|
| 422 |
}]
|
|
|
|
| 423 |
else:
|
| 424 |
-
# Add default license structure for consistency
|
| 425 |
component["licenses"] = [{
|
| 426 |
"license": {
|
| 427 |
-
"id": "
|
| 428 |
"url": "https://spdx.org/licenses/"
|
| 429 |
}
|
| 430 |
}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
# Debug
|
| 432 |
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
| 433 |
if "license" in metadata:
|
|
@@ -435,6 +656,21 @@ class AIBOMGenerator:
|
|
| 435 |
|
| 436 |
# ALWAYS add description
|
| 437 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
# Add external references
|
| 440 |
external_refs = [{
|
|
@@ -470,26 +706,70 @@ class AIBOMGenerator:
|
|
| 470 |
|
| 471 |
return component
|
| 472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
| 474 |
model_card_section = {}
|
| 475 |
|
| 476 |
# Add quantitative analysis section
|
| 477 |
if "eval_results" in metadata:
|
| 478 |
model_card_section["quantitativeAnalysis"] = {
|
| 479 |
-
"performanceMetrics": metadata["eval_results"],
|
| 480 |
"graphics": {} # Empty graphics object as in the example
|
| 481 |
}
|
| 482 |
else:
|
| 483 |
model_card_section["quantitativeAnalysis"] = {"graphics": {}}
|
| 484 |
|
| 485 |
-
# Add properties section
|
| 486 |
properties = []
|
| 487 |
-
for key, value in metadata.items():
|
| 488 |
-
if key in ["author", "library_name", "license", "downloads", "likes", "tags", "created_at", "last_modified"]:
|
| 489 |
-
properties.append({"name": key, "value": str(value)})
|
| 490 |
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
# Create model parameters section
|
| 495 |
model_parameters = {}
|
|
@@ -538,6 +818,25 @@ class AIBOMGenerator:
|
|
| 538 |
|
| 539 |
# Add model parameters to model card section
|
| 540 |
model_card_section["modelParameters"] = model_parameters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
# Add considerations section
|
| 543 |
considerations = {}
|
|
@@ -578,4 +877,112 @@ class AIBOMGenerator:
|
|
| 578 |
logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
|
| 579 |
return None
|
| 580 |
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
| 581 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import uuid
|
| 3 |
import datetime
|
| 4 |
+
import json
|
| 5 |
from typing import Dict, Optional, Any, List
|
| 6 |
|
|
|
|
| 7 |
from huggingface_hub import HfApi, ModelCard
|
| 8 |
+
from huggingface_hub.repocard_data import EvalResult
|
| 9 |
from urllib.parse import urlparse
|
| 10 |
from .utils import calculate_completeness_score
|
| 11 |
|
| 12 |
+
# Import registry-aware enhanced extraction if available
|
| 13 |
+
try:
|
| 14 |
+
from .enhanced_extractor import EnhancedExtractor
|
| 15 |
+
from .field_registry_manager import get_field_registry_manager
|
| 16 |
+
ENHANCED_EXTRACTION_AVAILABLE = True
|
| 17 |
+
print("β
Registry-aware enhanced extraction module loaded successfully")
|
| 18 |
+
except ImportError:
|
| 19 |
+
try:
|
| 20 |
+
from enhanced_extractor import EnhancedExtractor
|
| 21 |
+
from field_registry_manager import get_field_registry_manager
|
| 22 |
+
ENHANCED_EXTRACTION_AVAILABLE = True
|
| 23 |
+
print("β
Registry-aware enhanced extraction module loaded successfully (direct import)")
|
| 24 |
+
except ImportError:
|
| 25 |
+
ENHANCED_EXTRACTION_AVAILABLE = False
|
| 26 |
+
print("β οΈ Registry-aware enhanced extraction not available, using basic extraction")
|
| 27 |
+
|
| 28 |
|
| 29 |
class AIBOMGenerator:
|
| 30 |
def __init__(
|
|
|
|
| 33 |
inference_model_url: Optional[str] = None,
|
| 34 |
use_inference: bool = True,
|
| 35 |
cache_dir: Optional[str] = None,
|
| 36 |
+
use_best_practices: bool = True, # parameter for industry-neutral scoring
|
| 37 |
):
|
| 38 |
self.hf_api = HfApi(token=hf_token)
|
| 39 |
self.inference_model_url = inference_model_url
|
|
|
|
| 41 |
self.cache_dir = cache_dir
|
| 42 |
self.enhancement_report = None # Store enhancement report as instance variable
|
| 43 |
self.use_best_practices = use_best_practices # Store best practices flag
|
| 44 |
+
self._setup_enhanced_logging()
|
| 45 |
+
|
| 46 |
+
self.extraction_results = {} # Store extraction results for scoring
|
| 47 |
+
|
| 48 |
+
# Initialize registry manager for enhanced extraction
|
| 49 |
+
self.registry_manager = None
|
| 50 |
+
if ENHANCED_EXTRACTION_AVAILABLE:
|
| 51 |
+
try:
|
| 52 |
+
self.registry_manager = get_field_registry_manager()
|
| 53 |
+
print("β
Registry manager initialized for generator")
|
| 54 |
+
except Exception as e:
|
| 55 |
+
print(f"β οΈ Could not initialize registry manager: {e}")
|
| 56 |
+
self.registry_manager = None
|
| 57 |
+
|
| 58 |
+
def get_extraction_results(self):
    """Return the enhanced extraction results from the most recent run.

    Falls back to an empty dict when no extraction has been performed yet
    (the ``extraction_results`` attribute may not exist on
    partially-initialised instances).
    """
    try:
        return self.extraction_results
    except AttributeError:
        return {}
|
| 62 |
+
def _setup_enhanced_logging(self):
    """Setup enhanced logging for extraction tracking.

    Reconfigures the root logger; ``force=True`` removes any handlers that
    were already attached, so this overrides prior logging configuration
    process-wide.
    """
    import logging

    # Configure logging to show in HF Spaces
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        force=True  # Override any existing configuration
    )

    # Ensure logger shows up
    logger = logging.getLogger('enhanced_extractor')
    logger.setLevel(logging.INFO)

    print("π§ Enhanced logging configured for AI SBOM generation")
+
|
| 79 |
+
|
| 80 |
def generate_aibom(
|
| 81 |
self,
|
| 82 |
model_id: str,
|
| 83 |
output_file: Optional[str] = None,
|
| 84 |
include_inference: Optional[bool] = None,
|
| 85 |
+
use_best_practices: Optional[bool] = None, # parameter for industry-neutral scoring
|
| 86 |
) -> Dict[str, Any]:
|
| 87 |
try:
|
| 88 |
model_id = self._normalise_model_id(model_id)
|
|
|
|
| 95 |
|
| 96 |
# Store original metadata before any AI enhancement
|
| 97 |
original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
|
| 98 |
+
print(f"π ENHANCED EXTRACTION DEBUG: Returned {len(original_metadata)} fields:")
|
| 99 |
+
for key, value in original_metadata.items():
|
| 100 |
+
print(f" {key}: {value}")
|
| 101 |
+
print(f"π EXTRACTION RESULTS: {len(self.extraction_results) if hasattr(self, 'extraction_results') and self.extraction_results else 0} extraction results available")
|
| 102 |
|
| 103 |
# Create initial AIBOM with original metadata
|
| 104 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
| 105 |
+
|
| 106 |
+
print(f"π AI SBOM CREATION DEBUG: Checking what made it into AIBOM:")
|
| 107 |
+
if 'components' in original_aibom and original_aibom['components']:
|
| 108 |
+
component = original_aibom['components'][0]
|
| 109 |
+
if 'properties' in component:
|
| 110 |
+
print(f" Found {len(component['properties'])} properties in AIBOM:")
|
| 111 |
+
for prop in component['properties']:
|
| 112 |
+
print(f" {prop.get('name')}: {prop.get('value')}")
|
| 113 |
+
else:
|
| 114 |
+
print(" No properties found in component")
|
| 115 |
+
else:
|
| 116 |
+
print(" No components found in AI SBOM")
|
| 117 |
+
print(f"π FIELD PRESERVATION VERIFICATION:")
|
| 118 |
+
print(f" Enhanced extraction returned: {len(original_metadata)} fields")
|
| 119 |
+
|
| 120 |
+
# Count fields in final AIBOM
|
| 121 |
+
aibom_field_count = 0
|
| 122 |
+
|
| 123 |
+
# Count component properties
|
| 124 |
+
if 'components' in original_aibom and original_aibom['components']:
|
| 125 |
+
component = original_aibom['components'][0]
|
| 126 |
+
if 'properties' in component:
|
| 127 |
+
aibom_field_count += len(component['properties'])
|
| 128 |
+
|
| 129 |
+
# Count model card properties
|
| 130 |
+
if 'modelCard' in component and 'properties' in component['modelCard']:
|
| 131 |
+
aibom_field_count += len(component['modelCard']['properties'])
|
| 132 |
+
|
| 133 |
+
# Count metadata properties
|
| 134 |
+
if 'metadata' in original_aibom and 'properties' in original_aibom['metadata']:
|
| 135 |
+
aibom_field_count += len(original_aibom['metadata']['properties'])
|
| 136 |
+
|
| 137 |
+
print(f" Final AIBOM contains: {aibom_field_count} fields")
|
| 138 |
+
print(f" Field preservation rate: {(aibom_field_count/len(original_metadata)*100):.1f}%")
|
| 139 |
+
|
| 140 |
+
if aibom_field_count >= len(original_metadata) * 0.9: # 90% or better
|
| 141 |
+
print("β
EXCELLENT: Field preservation successful!")
|
| 142 |
+
elif aibom_field_count >= len(original_metadata) * 0.7: # 70% or better
|
| 143 |
+
print("β οΈ GOOD: Most fields preserved, some optimization possible")
|
| 144 |
+
else:
|
| 145 |
+
print("β POOR: Significant field loss detected")
|
| 146 |
+
|
| 147 |
|
| 148 |
# Calculate initial score with industry-neutral approach if enabled
|
| 149 |
+
original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices, extraction_results=self.extraction_results)
|
| 150 |
+
|
| 151 |
|
| 152 |
# Final metadata starts with original metadata
|
| 153 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
|
|
|
| 173 |
except Exception as e:
|
| 174 |
print(f"Error during AI enhancement: {e}")
|
| 175 |
# Continue with original metadata if enhancement fails
|
| 176 |
+
print("π¨ FALLBACK: Using _create_minimal_aibom due to error!")
|
| 177 |
+
print(f"π¨ ERROR DETAILS: {str(e)}")
|
| 178 |
# Create final AIBOM with potentially enhanced metadata
|
| 179 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
| 180 |
|
| 181 |
+
# Calculate final score with enhanced extraction results
|
| 182 |
+
extraction_results = self.get_extraction_results()
|
| 183 |
+
final_score = calculate_completeness_score(
|
| 184 |
+
aibom,
|
| 185 |
+
validate=True,
|
| 186 |
+
use_best_practices=use_best_practices,
|
| 187 |
+
extraction_results=extraction_results # Pass enhanced results
|
| 188 |
+
)
|
| 189 |
|
| 190 |
|
| 191 |
if output_file:
|
|
|
|
| 204 |
# Return only the AIBOM to maintain compatibility with existing code
|
| 205 |
return aibom
|
| 206 |
except Exception as e:
|
| 207 |
+
print(f"Error generating AI SBOM: {e}")
|
| 208 |
+
# Return a minimal valid AI SBOM structure in case of error
|
| 209 |
return self._create_minimal_aibom(model_id)
|
| 210 |
|
| 211 |
def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
|
|
|
|
| 262 |
print(f"Error fetching model info for {model_id}: {e}")
|
| 263 |
return {}
|
| 264 |
|
| 265 |
+
|
| 266 |
@staticmethod
|
| 267 |
def _normalise_model_id(raw_id: str) -> str:
|
| 268 |
"""
|
|
|
|
| 277 |
return "/".join(parts[:2])
|
| 278 |
return path
|
| 279 |
return raw_id
|
| 280 |
+
|
| 281 |
|
| 282 |
def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
|
| 283 |
try:
|
|
|
|
| 291 |
model_id: str,
|
| 292 |
metadata: Dict[str, Any],
|
| 293 |
) -> Dict[str, Any]:
|
| 294 |
+
# π CRASH DEBUG: troubleshoot where the process is crashing and falling back to minimal AIBOM
|
| 295 |
+
print(f"π CRASH_DEBUG: _create_aibom_structure called")
|
| 296 |
+
print(f"π CRASH_DEBUG: model_id = {model_id}")
|
| 297 |
+
print(f"π CRASH_DEBUG: metadata type = {type(metadata)}")
|
| 298 |
+
print(f"π CRASH_DEBUG: metadata keys = {list(metadata.keys()) if isinstance(metadata, dict) else 'NOT A DICT'}")
|
| 299 |
+
|
| 300 |
# Extract owner and model name from model_id
|
| 301 |
parts = model_id.split("/")
|
| 302 |
group = parts[0] if len(parts) > 1 else ""
|
|
|
|
| 304 |
|
| 305 |
# Get version from metadata or use default
|
| 306 |
version = metadata.get("commit", "1.0")
|
| 307 |
+
|
| 308 |
+
# π CRASH DEBUG: Check metadata before creating sections
|
| 309 |
+
print(f"π CRASH_DEBUG: About to create metadata section")
|
| 310 |
|
| 311 |
aibom = {
|
| 312 |
"bomFormat": "CycloneDX",
|
|
|
|
| 321 |
"dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
|
| 322 |
}
|
| 323 |
]
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
# π CRASH DEBUG: Check if we got this far
|
| 327 |
+
print(f"π CRASH_DEBUG: Successfully created basic AIBOM structure")
|
| 328 |
|
| 329 |
# ALWAYS add root-level external references
|
| 330 |
aibom["externalReferences"] = [{
|
|
|
|
| 338 |
"url": metadata["commit_url"]
|
| 339 |
} )
|
| 340 |
|
| 341 |
+
print(f"π CRASH_DEBUG: _create_aibom_structure completed successfully")
|
| 342 |
return aibom
|
| 343 |
|
| 344 |
def _extract_structured_metadata(
|
|
|
|
| 347 |
model_info: Dict[str, Any],
|
| 348 |
model_card: Optional[ModelCard],
|
| 349 |
) -> Dict[str, Any]:
|
| 350 |
+
|
| 351 |
+
# Use registry-aware enhanced extraction if available
|
| 352 |
+
if ENHANCED_EXTRACTION_AVAILABLE:
|
| 353 |
+
try:
|
| 354 |
+
print(f"π Using registry-aware enhanced extraction for: {model_id}")
|
| 355 |
+
|
| 356 |
+
# Create registry-aware enhanced extractor instance
|
| 357 |
+
extractor = EnhancedExtractor(self.hf_api, self.registry_manager)
|
| 358 |
+
|
| 359 |
+
# Get both metadata and extraction results
|
| 360 |
+
metadata = extractor.extract_metadata(model_id, model_info, model_card)
|
| 361 |
+
|
| 362 |
+
# Store extraction results for scoring
|
| 363 |
+
self.extraction_results = extractor.extraction_results
|
| 364 |
+
|
| 365 |
+
# Log extraction summary
|
| 366 |
+
if extractor.registry_fields:
|
| 367 |
+
registry_field_count = len(extractor.registry_fields)
|
| 368 |
+
extracted_count = len([k for k, v in metadata.items() if v is not None])
|
| 369 |
+
extraction_results_count = len(extractor.extraction_results)
|
| 370 |
+
|
| 371 |
+
print(f"β
Registry-driven extraction completed:")
|
| 372 |
+
print(f" π Registry fields available: {registry_field_count}")
|
| 373 |
+
print(f" π Fields attempted: {extraction_results_count}")
|
| 374 |
+
print(f" β
Fields extracted: {extracted_count}")
|
| 375 |
+
|
| 376 |
+
# Log field coverage
|
| 377 |
+
if registry_field_count > 0:
|
| 378 |
+
coverage = (extracted_count / registry_field_count) * 100
|
| 379 |
+
print(f" π Registry field coverage: {coverage:.1f}%")
|
| 380 |
+
else:
|
| 381 |
+
extracted_count = len([k for k, v in metadata.items() if v is not None])
|
| 382 |
+
print(f"β
Legacy extraction completed: {extracted_count} fields extracted")
|
| 383 |
+
|
| 384 |
+
return metadata
|
| 385 |
+
|
| 386 |
+
except Exception as e:
|
| 387 |
+
print(f"β Registry-aware enhanced extraction failed: {e}")
|
| 388 |
+
print("π Falling back to original extraction method")
|
| 389 |
+
# Fall back to original extraction code here
|
| 390 |
+
|
| 391 |
+
# ORIGINAL EXTRACTION METHOD (as fallback)
|
| 392 |
metadata = {}
|
| 393 |
|
| 394 |
if model_info:
|
|
|
|
| 409 |
"downloads": getattr(model_info, "downloads", 0),
|
| 410 |
"last_modified": getattr(model_info, "lastModified", None),
|
| 411 |
"commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
|
| 412 |
+
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None ) else None,
|
| 413 |
})
|
| 414 |
except Exception as e:
|
| 415 |
print(f"Error extracting model info metadata: {e}")
|
|
|
|
| 451 |
print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
|
| 452 |
|
| 453 |
return {k: v for k, v in metadata.items() if v is not None}
|
| 454 |
+
|
| 455 |
|
| 456 |
|
| 457 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
|
|
|
|
| 463 |
|
| 464 |
|
| 465 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 466 |
+
print(f"π CRASH_DEBUG: _create_metadata_section called")
|
| 467 |
+
print(f"π CRASH_DEBUG: metadata type in metadata_section = {type(metadata)}")
|
| 468 |
+
|
| 469 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
| 470 |
|
| 471 |
# Get version from metadata or use default
|
|
|
|
| 523 |
|
| 524 |
# ALWAYS add critical fields for scoring
|
| 525 |
critical_fields = {
|
| 526 |
+
"primaryPurpose": metadata.get("primaryPurpose", "text-generation"),
|
| 527 |
+
"suppliedBy": metadata.get("suppliedBy", "unknown"),
|
| 528 |
+
"typeOfModel": metadata.get("typeOfModel", "Transformer")
|
| 529 |
}
|
|
|
|
|
|
|
| 530 |
for key, value in critical_fields.items():
|
| 531 |
+
properties.append({"name": key, "value": str(value)})
|
|
|
|
| 532 |
|
| 533 |
+
# Add enhanced extraction fields to properties
|
| 534 |
+
# Organize fields by category for better AIBOM structure
|
| 535 |
+
component_fields = ["name", "author", "description", "commit"] # These go in component section
|
| 536 |
+
critical_fields = ["primaryPurpose", "suppliedBy", "typeOfModel"] # Always include these
|
| 537 |
+
|
| 538 |
+
# Add all other enhanced extraction fields (preserve everything!)
|
| 539 |
+
enhanced_fields = ["model_type", "tokenizer_class", "architectures", "library_name",
|
| 540 |
+
"pipeline_tag", "tags", "datasets", "base_model", "language",
|
| 541 |
+
"downloads", "last_modified", "commit_url", "ai:type", "ai:task",
|
| 542 |
+
"ai:framework", "eval_results"]
|
| 543 |
+
|
| 544 |
+
print(f"π CRASH_DEBUG: About to call .items() on metadata")
|
| 545 |
+
print(f"π CRASH_DEBUG: metadata type before .items() = {type(metadata)}")
|
| 546 |
+
|
| 547 |
for key, value in metadata.items():
|
| 548 |
+
# Skip component fields (handled elsewhere) but include everything else
|
| 549 |
+
if key not in component_fields and value is not None:
|
| 550 |
+
# Handle different data types properly
|
| 551 |
if isinstance(value, (list, dict)):
|
| 552 |
+
if isinstance(value, list) and len(value) > 0:
|
| 553 |
+
# Convert list to comma-separated string for better display
|
| 554 |
+
if all(isinstance(item, str) for item in value):
|
| 555 |
+
value = ", ".join(value)
|
| 556 |
+
else:
|
| 557 |
+
value = json.dumps(value)
|
| 558 |
+
elif isinstance(value, dict):
|
| 559 |
value = json.dumps(value)
|
| 560 |
+
|
| 561 |
properties.append({"name": key, "value": str(value)})
|
| 562 |
+
print(f"β
METADATA: Added {key} = {value} to properties")
|
| 563 |
|
| 564 |
# Assemble metadata section
|
| 565 |
metadata_section = {
|
|
|
|
| 572 |
return metadata_section
|
| 573 |
|
| 574 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 575 |
+
print(f"π CRASH_DEBUG: _create_component_section called")
|
| 576 |
+
print(f"π CRASH_DEBUG: metadata type in component_section = {type(metadata)}")
|
| 577 |
+
|
| 578 |
# Extract owner and model name from model_id
|
| 579 |
parts = model_id.split("/")
|
| 580 |
group = parts[0] if len(parts) > 1 else ""
|
|
|
|
| 599 |
"purl": purl
|
| 600 |
}
|
| 601 |
|
| 602 |
+
# Handle license
|
| 603 |
if metadata and "license" in metadata and metadata["license"]:
|
| 604 |
component["licenses"] = [{
|
| 605 |
"license": {
|
|
|
|
| 607 |
"url": self._get_license_url(metadata["license"])
|
| 608 |
}
|
| 609 |
}]
|
| 610 |
+
print(f"β
COMPONENT: Added license = {metadata['license']}")
|
| 611 |
else:
|
|
|
|
| 612 |
component["licenses"] = [{
|
| 613 |
"license": {
|
| 614 |
+
"id": "NOASSERTION",
|
| 615 |
"url": "https://spdx.org/licenses/"
|
| 616 |
}
|
| 617 |
}]
|
| 618 |
+
print(f"β οΈ COMPONENT: No license found, using NOASSERTION")
|
| 619 |
+
|
| 620 |
+
# ALWAYS add description
|
| 621 |
+
component["description"] = metadata.get("description", f"AI model {model_id}")
|
| 622 |
+
|
| 623 |
+
# Add enhanced technical properties to component
|
| 624 |
+
technical_properties = []
|
| 625 |
+
|
| 626 |
+
# Add model type information
|
| 627 |
+
if "model_type" in metadata:
|
| 628 |
+
technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
|
| 629 |
+
print(f"β
COMPONENT: Added model_type = {metadata['model_type']}")
|
| 630 |
+
|
| 631 |
+
# Add tokenizer information
|
| 632 |
+
if "tokenizer_class" in metadata:
|
| 633 |
+
technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
|
| 634 |
+
print(f"β
COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
|
| 635 |
+
|
| 636 |
+
# Add architecture information
|
| 637 |
+
if "architectures" in metadata:
|
| 638 |
+
arch_value = metadata["architectures"]
|
| 639 |
+
if isinstance(arch_value, list):
|
| 640 |
+
arch_value = ", ".join(arch_value)
|
| 641 |
+
technical_properties.append({"name": "architectures", "value": str(arch_value)})
|
| 642 |
+
print(f"β
COMPONENT: Added architectures = {arch_value}")
|
| 643 |
+
|
| 644 |
+
# Add library information
|
| 645 |
+
if "library_name" in metadata:
|
| 646 |
+
technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
|
| 647 |
+
print(f"β
COMPONENT: Added library_name = {metadata['library_name']}")
|
| 648 |
+
|
| 649 |
+
# Add technical properties to component if any exist
|
| 650 |
+
if technical_properties:
|
| 651 |
+
component["properties"] = technical_properties
|
| 652 |
# Debug
|
| 653 |
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
| 654 |
if "license" in metadata:
|
|
|
|
| 656 |
|
| 657 |
# ALWAYS add description
|
| 658 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
| 659 |
+
if metadata.get("license"):
|
| 660 |
+
component["licenses"] = [{
|
| 661 |
+
"license": {
|
| 662 |
+
"id": metadata["license"],
|
| 663 |
+
"url": self._get_license_url(metadata["license"])
|
| 664 |
+
}
|
| 665 |
+
}]
|
| 666 |
+
else:
|
| 667 |
+
component["licenses"] = [{
|
| 668 |
+
"license": {
|
| 669 |
+
"id": "unknown",
|
| 670 |
+
"url": "https://spdx.org/licenses/"
|
| 671 |
+
}
|
| 672 |
+
}]
|
| 673 |
+
|
| 674 |
|
| 675 |
# Add external references
|
| 676 |
external_refs = [{
|
|
|
|
| 706 |
|
| 707 |
return component
|
| 708 |
|
| 709 |
+
def _eval_results_to_json(self, eval_results: List[EvalResult]) -> List[Dict[str, str]]:
    """Serialise evaluation results into performance-metric dictionaries.

    Each result contributes ``{"type": metric_type, "value": str(metric_value)}``;
    entries missing either attribute are silently skipped.
    """
    return [
        {"type": item.metric_type, "value": str(item.metric_value)}
        for item in eval_results
        if hasattr(item, "metric_type") and hasattr(item, "metric_value")
    ]
|
| 715 |
+
|
| 716 |
+
|
| 717 |
def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 718 |
+
print(f"π CRASH_DEBUG: _create_model_card_section called")
|
| 719 |
+
print(f"π CRASH_DEBUG: metadata type in model_card_section = {type(metadata)}")
|
| 720 |
+
|
| 721 |
model_card_section = {}
|
| 722 |
|
| 723 |
# Add quantitative analysis section
|
| 724 |
if "eval_results" in metadata:
|
| 725 |
model_card_section["quantitativeAnalysis"] = {
|
| 726 |
+
"performanceMetrics": self._eval_results_to_json(metadata["eval_results"]),
|
| 727 |
"graphics": {} # Empty graphics object as in the example
|
| 728 |
}
|
| 729 |
else:
|
| 730 |
model_card_section["quantitativeAnalysis"] = {"graphics": {}}
|
| 731 |
|
| 732 |
+
# Add properties section with enhanced extraction fields
|
| 733 |
properties = []
|
|
|
|
|
|
|
|
|
|
| 734 |
|
| 735 |
+
# Component-level fields that shouldn't be duplicated in model card
|
| 736 |
+
component_level_fields = ["name", "author", "license", "description", "commit"]
|
| 737 |
+
|
| 738 |
+
# DEBUG: troubleshooting AIBOM generation
|
| 739 |
+
print(f"π DEBUG: About to iterate metadata.items()")
|
| 740 |
+
print(f"π DEBUG: metadata type = {type(metadata)}")
|
| 741 |
+
if isinstance(metadata, dict):
|
| 742 |
+
print(f"π DEBUG: metadata keys = {list(metadata.keys())}")
|
| 743 |
+
else:
|
| 744 |
+
print(f"π DEBUG: metadata value = {metadata}")
|
| 745 |
+
print(f"π DEBUG: This is the problem - metadata should be a dict!")
|
| 746 |
+
|
| 747 |
+
# Add all enhanced extraction fields to model card properties
|
| 748 |
+
try:
|
| 749 |
+
for key, value in metadata.items():
|
| 750 |
+
if key not in component_level_fields and value is not None:
|
| 751 |
+
# Handle different data types properly
|
| 752 |
+
if isinstance(value, (list, dict)):
|
| 753 |
+
if isinstance(value, list) and len(value) > 0:
|
| 754 |
+
# Convert list to readable format
|
| 755 |
+
if all(isinstance(item, str) for item in value):
|
| 756 |
+
value = ", ".join(value)
|
| 757 |
+
else:
|
| 758 |
+
value = json.dumps(value)
|
| 759 |
+
elif isinstance(value, dict):
|
| 760 |
+
value = json.dumps(value)
|
| 761 |
+
|
| 762 |
+
properties.append({"name": key, "value": str(value)})
|
| 763 |
+
print(f"β
MODEL_CARD: Added {key} = {value}")
|
| 764 |
+
except AttributeError as e:
|
| 765 |
+
print(f"β FOUND THE ERROR: {e}")
|
| 766 |
+
print(f"β metadata type: {type(metadata)}")
|
| 767 |
+
print(f"β metadata value: {metadata}")
|
| 768 |
+
raise e
|
| 769 |
+
|
| 770 |
+
# Always include properties section (even if empty for consistency)
|
| 771 |
+
model_card_section["properties"] = properties
|
| 772 |
+
print(f"β
MODEL_CARD: Added {len(properties)} properties to model card")
|
| 773 |
|
| 774 |
# Create model parameters section
|
| 775 |
model_parameters = {}
|
|
|
|
| 818 |
|
| 819 |
# Add model parameters to model card section
|
| 820 |
model_card_section["modelParameters"] = model_parameters
|
| 821 |
+
# Add enhanced technical parameters
|
| 822 |
+
if "model_type" in metadata or "tokenizer_class" in metadata or "architectures" in metadata:
|
| 823 |
+
technical_details = {}
|
| 824 |
+
|
| 825 |
+
if "model_type" in metadata:
|
| 826 |
+
technical_details["modelType"] = metadata["model_type"]
|
| 827 |
+
|
| 828 |
+
if "tokenizer_class" in metadata:
|
| 829 |
+
technical_details["tokenizerClass"] = metadata["tokenizer_class"]
|
| 830 |
+
|
| 831 |
+
if "architectures" in metadata:
|
| 832 |
+
technical_details["architectures"] = metadata["architectures"]
|
| 833 |
+
|
| 834 |
+
# Add to model parameters
|
| 835 |
+
model_parameters.update(technical_details)
|
| 836 |
+
print(f"β
MODEL_CARD: Added technical details: {list(technical_details.keys())}")
|
| 837 |
+
|
| 838 |
+
# Update model parameters with enhanced details
|
| 839 |
+
model_card_section["modelParameters"] = model_parameters
|
| 840 |
|
| 841 |
# Add considerations section
|
| 842 |
considerations = {}
|
|
|
|
| 877 |
logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
|
| 878 |
return None
|
| 879 |
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
| 880 |
+
return None
|
| 881 |
+
|
| 882 |
+
def validate_registry_integration(self) -> Dict[str, Any]:
    """
    Validate that the registry integration is working correctly.

    Helps debug registry-related issues by reporting whether the registry
    manager is present, whether its field registry loaded, and how many
    fields it exposes.

    Returns:
        Dict with keys ``registry_manager_available``,
        ``enhanced_extraction_available``, ``registry_fields_count``,
        ``registry_fields_loaded``, ``validation_status`` and, on failure,
        ``error``.

    Fix: the status messages previously contained mojibake-corrupted emoji;
    restored to the intended characters.
    """
    validation_results = {
        'registry_manager_available': bool(self.registry_manager),
        'enhanced_extraction_available': ENHANCED_EXTRACTION_AVAILABLE,
        'registry_fields_count': 0,
        'registry_fields_loaded': False,
        'validation_status': 'unknown'
    }

    try:
        if self.registry_manager:
            registry = self.registry_manager.registry
            registry_fields = registry.get('fields', {})
            validation_results['registry_fields_count'] = len(registry_fields)
            validation_results['registry_fields_loaded'] = len(registry_fields) > 0

            if len(registry_fields) > 0:
                validation_results['validation_status'] = 'success'
                print(f"✅ Registry validation successful: {len(registry_fields)} fields loaded")

                # Log sample fields for a quick eyeball check
                sample_fields = list(registry_fields.keys())[:5]
                print(f"📋 Sample registry fields: {', '.join(sample_fields)}")
            else:
                validation_results['validation_status'] = 'no_fields'
                print("⚠️ Registry loaded but no fields found")
        else:
            validation_results['validation_status'] = 'no_registry_manager'
            print("❌ Registry manager not available")

    except Exception as e:
        # Surface the failure in the report rather than raising: this is a
        # diagnostic helper and must never crash the generator.
        validation_results['validation_status'] = 'error'
        validation_results['error'] = str(e)
        print(f"❌ Registry validation failed: {e}")

    return validation_results
|
| 922 |
+
|
| 923 |
+
def test_registry_integration():
    """
    Smoke-test the registry integration end to end.

    Instantiates the generator, validates registry wiring, then attempts a
    live extraction against a sample model. Intended for manual debugging of
    registry-related issues; it performs network calls to Hugging Face.

    Fix: the module-level invocation below was NOT commented out even though
    its comment said "Uncomment this line to run" — importing generator.py
    triggered network calls as an import side effect. The call is now
    commented out so import stays side-effect free. Mojibake-corrupted emoji
    in the output were also restored.
    """
    print("🧪 Testing Registry Integration...")
    print("=" * 50)

    try:
        # Test generator initialization
        generator = AIBOMGenerator()

        # Validate registry integration
        validation_results = generator.validate_registry_integration()

        print("📊 Validation Results:")
        for key, value in validation_results.items():
            print(f"  {key}: {value}")

        # Test with a sample model
        test_model = "deepseek-ai/DeepSeek-R1"
        print(f"\n🔍 Testing extraction with model: {test_model}")

        try:
            # Test model info retrieval (network)
            model_info = generator.hf_api.model_info(test_model)
            model_card = ModelCard.load(test_model)

            # Test extraction through the registry-aware path when available
            if ENHANCED_EXTRACTION_AVAILABLE and generator.registry_manager:
                extractor = EnhancedExtractor(generator.hf_api, generator.registry_manager)
                metadata = extractor.extract_metadata(test_model, model_info, model_card)

                print(f"✅ Test extraction successful: {len(metadata)} fields extracted")

                # Show sample extracted fields
                sample_fields = dict(list(metadata.items())[:5])
                print("📋 Sample extracted fields:")
                for key, value in sample_fields.items():
                    print(f"  {key}: {value}")

                # Summarise extraction confidence distribution
                extraction_results = extractor.get_extraction_results()
                confidence_counts = {}
                for result in extraction_results.values():
                    conf = result.confidence.value
                    confidence_counts[conf] = confidence_counts.get(conf, 0) + 1

                print("📊 Extraction confidence distribution:")
                for conf, count in confidence_counts.items():
                    print(f"  {conf}: {count} fields")

            else:
                print("⚠️ Registry-aware extraction not available for testing")

        except Exception as e:
            print(f"❌ Test extraction failed: {e}")

    except Exception as e:
        print(f"❌ Registry integration test failed: {e}")

    print("=" * 50)
    print("🧪 Registry Integration Test Complete")


# Uncomment this line to run the test automatically when generator.py is imported
# test_registry_integration()
|
src/aibom-generator/utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
"""
|
| 4 |
|
| 5 |
import json
|
|
@@ -9,6 +9,14 @@ import re
|
|
| 9 |
import uuid
|
| 10 |
from typing import Dict, List, Optional, Any, Union, Tuple
|
| 11 |
from enum import Enum
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
|
@@ -18,98 +26,123 @@ class ValidationSeverity(Enum):
|
|
| 18 |
WARNING = "warning"
|
| 19 |
INFO = "info"
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
"
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
"
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
"
|
| 68 |
-
"
|
| 69 |
-
|
| 70 |
-
"
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
}
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
"downloadLocation": {
|
| 89 |
-
"missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
|
| 90 |
-
"recommendation": "Add information about where the model can be downloaded"
|
| 91 |
-
},
|
| 92 |
-
"primaryPurpose": {
|
| 93 |
-
"missing": "Missing critical field: primaryPurpose - important for understanding model intent",
|
| 94 |
-
"recommendation": "Add information about the primary purpose of this model"
|
| 95 |
-
},
|
| 96 |
-
"suppliedBy": {
|
| 97 |
-
"missing": "Missing critical field: suppliedBy - needed for provenance tracking",
|
| 98 |
-
"recommendation": "Add information about who supplied this model"
|
| 99 |
-
},
|
| 100 |
-
"energyConsumption": {
|
| 101 |
-
"missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
|
| 102 |
-
"recommendation": "Consider documenting energy consumption metrics for better transparency"
|
| 103 |
-
},
|
| 104 |
-
"hyperparameter": {
|
| 105 |
-
"missing": "Missing important field: hyperparameter - valuable for reproducibility",
|
| 106 |
-
"recommendation": "Document key hyperparameters used in training"
|
| 107 |
-
},
|
| 108 |
-
"limitation": {
|
| 109 |
-
"missing": "Missing important field: limitation - important for responsible use",
|
| 110 |
-
"recommendation": "Document known limitations of the model to guide appropriate usage"
|
| 111 |
}
|
| 112 |
-
}
|
| 113 |
|
| 114 |
|
| 115 |
def setup_logging(level=logging.INFO):
|
|
@@ -207,77 +240,53 @@ def check_field_in_aibom(aibom: Dict[str, Any], field: str) -> bool:
|
|
| 207 |
Returns:
|
| 208 |
True if the field is present, False otherwise
|
| 209 |
"""
|
| 210 |
-
# Check in root level
|
| 211 |
if field in aibom:
|
| 212 |
return True
|
| 213 |
-
|
| 214 |
-
# Check in metadata
|
| 215 |
if "metadata" in aibom:
|
| 216 |
metadata = aibom["metadata"]
|
| 217 |
if field in metadata:
|
| 218 |
return True
|
| 219 |
-
|
| 220 |
-
# Check in metadata properties
|
| 221 |
if "properties" in metadata:
|
| 222 |
for prop in metadata["properties"]:
|
| 223 |
-
|
|
|
|
| 224 |
return True
|
| 225 |
-
|
| 226 |
-
# Check in components
|
| 227 |
if "components" in aibom and aibom["components"]:
|
| 228 |
-
component = aibom["components"][0]
|
| 229 |
-
|
| 230 |
if field in component:
|
| 231 |
return True
|
| 232 |
-
|
| 233 |
-
# Check in component properties
|
| 234 |
if "properties" in component:
|
| 235 |
for prop in component["properties"]:
|
| 236 |
-
|
|
|
|
| 237 |
return True
|
| 238 |
-
|
| 239 |
-
# Check in model card
|
| 240 |
if "modelCard" in component:
|
| 241 |
model_card = component["modelCard"]
|
| 242 |
-
|
| 243 |
if field in model_card:
|
| 244 |
return True
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
if "modelParameters" in model_card:
|
| 248 |
-
if field in model_card["modelParameters"]:
|
| 249 |
-
return True
|
| 250 |
-
|
| 251 |
-
# Check in model parameters properties
|
| 252 |
-
if "properties" in model_card["modelParameters"]:
|
| 253 |
-
for prop in model_card["modelParameters"]["properties"]:
|
| 254 |
-
if prop.get("name") == f"spdx:{field}" or prop.get("name") == field:
|
| 255 |
-
return True
|
| 256 |
-
|
| 257 |
-
# Check in considerations
|
| 258 |
if "considerations" in model_card:
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
if
|
| 268 |
-
return True
|
| 269 |
-
if field == "energyConsumption" and section == "environmentalConsiderations":
|
| 270 |
return True
|
| 271 |
-
|
| 272 |
-
|
| 273 |
if field == "downloadLocation" and "externalReferences" in aibom:
|
| 274 |
for ref in aibom["externalReferences"]:
|
| 275 |
-
if ref.get("type") == "distribution":
|
| 276 |
return True
|
| 277 |
-
|
| 278 |
return False
|
| 279 |
|
| 280 |
|
|
|
|
| 281 |
def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
|
| 282 |
"""
|
| 283 |
Determine which completeness profile the AIBOM satisfies.
|
|
@@ -835,8 +844,113 @@ def get_validation_summary(report: Dict[str, Any]) -> str:
|
|
| 835 |
|
| 836 |
return summary
|
| 837 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 838 |
|
| 839 |
-
|
|
|
|
| 840 |
"""
|
| 841 |
Calculate completeness score using industry best practices with proper normalization and penalties.
|
| 842 |
|
|
@@ -875,8 +989,8 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 875 |
# Count total fields in this category
|
| 876 |
fields_by_category[category]["total"] += 1
|
| 877 |
|
| 878 |
-
#
|
| 879 |
-
is_present =
|
| 880 |
|
| 881 |
if is_present:
|
| 882 |
fields_by_category[category]["present"] += 1
|
|
@@ -898,6 +1012,19 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 898 |
category_scores[category] = round(raw_score, 1)
|
| 899 |
else:
|
| 900 |
category_scores[category] = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 901 |
|
| 902 |
# Calculate subtotal (sum of rounded category scores)
|
| 903 |
subtotal_score = sum(category_scores.values())
|
|
@@ -1033,7 +1160,7 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 1033 |
return result
|
| 1034 |
|
| 1035 |
|
| 1036 |
-
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True) -> Dict[str, Any]:
|
| 1037 |
"""
|
| 1038 |
Calculate completeness score for an AIBOM and optionally validate against AI requirements.
|
| 1039 |
Enhanced with industry best practices scoring.
|
|
@@ -1046,9 +1173,16 @@ def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, u
|
|
| 1046 |
Returns:
|
| 1047 |
Dictionary containing score and validation results
|
| 1048 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
# If using best practices scoring, use the enhanced industry-neutral approach
|
| 1050 |
if use_best_practices:
|
| 1051 |
-
result = calculate_industry_neutral_score(aibom)
|
| 1052 |
|
| 1053 |
# Add validation if requested
|
| 1054 |
if validate:
|
|
@@ -1525,4 +1659,64 @@ def format_score_summary(score_result: Dict[str, Any]) -> str:
|
|
| 1525 |
summary += f"\nCompleteness Profile: {profile['name']}\n"
|
| 1526 |
summary += f"Description: {profile['description']}\n"
|
| 1527 |
|
| 1528 |
-
return summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Mostly score calculation functions for the AI SBOM Generator.
|
| 3 |
"""
|
| 4 |
|
| 5 |
import json
|
|
|
|
| 9 |
import uuid
|
| 10 |
from typing import Dict, List, Optional, Any, Union, Tuple
|
| 11 |
from enum import Enum
|
| 12 |
+
from .field_registry_manager import (
|
| 13 |
+
get_field_registry_manager,
|
| 14 |
+
generate_field_classification,
|
| 15 |
+
generate_completeness_profiles,
|
| 16 |
+
generate_validation_messages,
|
| 17 |
+
get_configurable_scoring_weights,
|
| 18 |
+
DynamicFieldDetector # Compatibility wrapper
|
| 19 |
+
)
|
| 20 |
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
|
|
|
| 26 |
WARNING = "warning"
|
| 27 |
INFO = "info"
|
| 28 |
|
| 29 |
+
# Registry-driven field definitions
|
| 30 |
+
try:
|
| 31 |
+
REGISTRY_MANAGER = get_field_registry_manager()
|
| 32 |
+
FIELD_CLASSIFICATION = generate_field_classification()
|
| 33 |
+
COMPLETENESS_PROFILES = generate_completeness_profiles()
|
| 34 |
+
VALIDATION_MESSAGES = generate_validation_messages()
|
| 35 |
+
SCORING_WEIGHTS = get_configurable_scoring_weights()
|
| 36 |
+
|
| 37 |
+
print(f"β
Registry-driven configuration loaded: {len(FIELD_CLASSIFICATION)} fields")
|
| 38 |
+
REGISTRY_AVAILABLE = True
|
| 39 |
+
|
| 40 |
+
except Exception as e:
|
| 41 |
+
print(f"β Failed to load registry configuration: {e}")
|
| 42 |
+
print("π Falling back to hardcoded definitions...")
|
| 43 |
+
REGISTRY_AVAILABLE = False
|
| 44 |
+
|
| 45 |
+
# Hardcoded definitions as fallback
|
| 46 |
+
FIELD_CLASSIFICATION = {
|
| 47 |
+
# Critical fields (silently aligned with SPDX mandatory fields)
|
| 48 |
+
"bomFormat": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 49 |
+
"specVersion": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 50 |
+
"serialNumber": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 51 |
+
"version": {"tier": "critical", "weight": 3, "category": "required_fields"},
|
| 52 |
+
"name": {"tier": "critical", "weight": 4, "category": "component_basic"},
|
| 53 |
+
"downloadLocation": {"tier": "critical", "weight": 4, "category": "external_references"},
|
| 54 |
+
"primaryPurpose": {"tier": "critical", "weight": 3, "category": "metadata"},
|
| 55 |
+
"suppliedBy": {"tier": "critical", "weight": 4, "category": "metadata"},
|
| 56 |
+
|
| 57 |
+
# Important fields (aligned with key SPDX optional fields)
|
| 58 |
+
"type": {"tier": "important", "weight": 2, "category": "component_basic"},
|
| 59 |
+
"purl": {"tier": "important", "weight": 4, "category": "component_basic"},
|
| 60 |
+
"description": {"tier": "important", "weight": 4, "category": "component_basic"},
|
| 61 |
+
"licenses": {"tier": "important", "weight": 4, "category": "component_basic"},
|
| 62 |
+
"energyConsumption": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 63 |
+
"hyperparameter": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 64 |
+
"limitation": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 65 |
+
"safetyRiskAssessment": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 66 |
+
"typeOfModel": {"tier": "important", "weight": 3, "category": "component_model_card"},
|
| 67 |
+
|
| 68 |
+
# Supplementary fields (aligned with remaining SPDX optional fields)
|
| 69 |
+
"modelExplainability": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 70 |
+
"standardCompliance": {"tier": "supplementary", "weight": 2, "category": "metadata"},
|
| 71 |
+
"domain": {"tier": "supplementary", "weight": 2, "category": "metadata"},
|
| 72 |
+
"energyQuantity": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 73 |
+
"energyUnit": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 74 |
+
"informationAboutTraining": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 75 |
+
"informationAboutApplication": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 76 |
+
"metric": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 77 |
+
"metricDecisionThreshold": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 78 |
+
"modelDataPreprocessing": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
|
| 79 |
+
"autonomyType": {"tier": "supplementary", "weight": 1, "category": "metadata"},
|
| 80 |
+
"useSensitivePersonalInformation": {"tier": "supplementary", "weight": 2, "category": "component_model_card"}
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
# Completeness profiles (silently aligned with SPDX requirements)
|
| 84 |
+
COMPLETENESS_PROFILES = {
|
| 85 |
+
"basic": {
|
| 86 |
+
"description": "Minimal fields required for identification",
|
| 87 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
|
| 88 |
+
"minimum_score": 40
|
| 89 |
+
},
|
| 90 |
+
"standard": {
|
| 91 |
+
"description": "Comprehensive fields for proper documentation",
|
| 92 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 93 |
+
"downloadLocation", "primaryPurpose", "suppliedBy"],
|
| 94 |
+
"minimum_score": 70
|
| 95 |
+
},
|
| 96 |
+
"advanced": {
|
| 97 |
+
"description": "Extensive documentation for maximum transparency",
|
| 98 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
|
| 99 |
+
"downloadLocation", "primaryPurpose", "suppliedBy",
|
| 100 |
+
"type", "purl", "description", "licenses", "hyperparameter", "limitation",
|
| 101 |
+
"energyConsumption", "safetyRiskAssessment", "typeOfModel"],
|
| 102 |
+
"minimum_score": 85
|
| 103 |
+
}
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
# Validation messages framed as best practices
|
| 107 |
+
VALIDATION_MESSAGES = {
|
| 108 |
+
"name": {
|
| 109 |
+
"missing": "Missing critical field: name - essential for model identification",
|
| 110 |
+
"recommendation": "Add a descriptive name for the model"
|
| 111 |
+
},
|
| 112 |
+
"downloadLocation": {
|
| 113 |
+
"missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
|
| 114 |
+
"recommendation": "Add information about where the model can be downloaded"
|
| 115 |
+
},
|
| 116 |
+
"primaryPurpose": {
|
| 117 |
+
"missing": "Missing critical field: primaryPurpose - important for understanding model intent",
|
| 118 |
+
"recommendation": "Add information about the primary purpose of this model"
|
| 119 |
+
},
|
| 120 |
+
"suppliedBy": {
|
| 121 |
+
"missing": "Missing critical field: suppliedBy - needed for provenance tracking",
|
| 122 |
+
"recommendation": "Add information about who supplied this model"
|
| 123 |
+
},
|
| 124 |
+
"energyConsumption": {
|
| 125 |
+
"missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
|
| 126 |
+
"recommendation": "Consider documenting energy consumption metrics for better transparency"
|
| 127 |
+
},
|
| 128 |
+
"hyperparameter": {
|
| 129 |
+
"missing": "Missing important field: hyperparameter - valuable for reproducibility",
|
| 130 |
+
"recommendation": "Document key hyperparameters used in training"
|
| 131 |
+
},
|
| 132 |
+
"limitation": {
|
| 133 |
+
"missing": "Missing important field: limitation - important for responsible use",
|
| 134 |
+
"recommendation": "Document known limitations of the model to guide appropriate usage"
|
| 135 |
+
}
|
| 136 |
}
|
| 137 |
+
|
| 138 |
+
SCORING_WEIGHTS = {
|
| 139 |
+
"tier_weights": {"critical": 3, "important": 2, "supplementary": 1},
|
| 140 |
+
"category_weights": {
|
| 141 |
+
"required_fields": 20, "metadata": 20, "component_basic": 20,
|
| 142 |
+
"component_model_card": 30, "external_references": 10
|
| 143 |
+
},
|
| 144 |
+
"algorithm_config": {"type": "weighted_sum", "max_score": 100}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
}
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
def setup_logging(level=logging.INFO):
|
|
|
|
| 240 |
Returns:
|
| 241 |
True if the field is present, False otherwise
|
| 242 |
"""
|
|
|
|
| 243 |
if field in aibom:
|
| 244 |
return True
|
|
|
|
|
|
|
| 245 |
if "metadata" in aibom:
|
| 246 |
metadata = aibom["metadata"]
|
| 247 |
if field in metadata:
|
| 248 |
return True
|
|
|
|
|
|
|
| 249 |
if "properties" in metadata:
|
| 250 |
for prop in metadata["properties"]:
|
| 251 |
+
prop_name = prop.get("name", "")
|
| 252 |
+
if prop_name in {field, f"spdx:{field}"}:
|
| 253 |
return True
|
|
|
|
|
|
|
| 254 |
if "components" in aibom and aibom["components"]:
|
| 255 |
+
component = aibom["components"][0]
|
|
|
|
| 256 |
if field in component:
|
| 257 |
return True
|
|
|
|
|
|
|
| 258 |
if "properties" in component:
|
| 259 |
for prop in component["properties"]:
|
| 260 |
+
prop_name = prop.get("name", "")
|
| 261 |
+
if prop_name in {field, f"spdx:{field}"}:
|
| 262 |
return True
|
|
|
|
|
|
|
| 263 |
if "modelCard" in component:
|
| 264 |
model_card = component["modelCard"]
|
|
|
|
| 265 |
if field in model_card:
|
| 266 |
return True
|
| 267 |
+
if "modelParameters" in model_card and field in model_card["modelParameters"]:
|
| 268 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
if "considerations" in model_card:
|
| 270 |
+
considerations = model_card["considerations"]
|
| 271 |
+
field_mappings = {
|
| 272 |
+
"limitation": ["technicalLimitations", "limitations"],
|
| 273 |
+
"safetyRiskAssessment": ["ethicalConsiderations", "safetyRiskAssessment"],
|
| 274 |
+
"energyConsumption": ["environmentalConsiderations", "energyConsumption"]
|
| 275 |
+
}
|
| 276 |
+
if field in field_mappings:
|
| 277 |
+
for section in field_mappings[field]:
|
| 278 |
+
if section in considerations and considerations[section]:
|
|
|
|
|
|
|
| 279 |
return True
|
| 280 |
+
if field in considerations:
|
| 281 |
+
return True
|
| 282 |
if field == "downloadLocation" and "externalReferences" in aibom:
|
| 283 |
for ref in aibom["externalReferences"]:
|
| 284 |
+
if ref.get("type") == "distribution" and ref.get("url"):
|
| 285 |
return True
|
|
|
|
| 286 |
return False
|
| 287 |
|
| 288 |
|
| 289 |
+
|
| 290 |
def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
|
| 291 |
"""
|
| 292 |
Determine which completeness profile the AIBOM satisfies.
|
|
|
|
| 844 |
|
| 845 |
return summary
|
| 846 |
|
| 847 |
+
def check_field_with_enhanced_results(aibom: Dict[str, Any], field: str, extraction_results: Optional[Dict[str, Any]] = None) -> bool:
|
| 848 |
+
"""
|
| 849 |
+
Enhanced field detection using consolidated field registry manager.
|
| 850 |
+
|
| 851 |
+
Args:
|
| 852 |
+
aibom: The AIBOM to check
|
| 853 |
+
field: The field name to check (must match field registry)
|
| 854 |
+
extraction_results: Enhanced extraction results with confidence levels
|
| 855 |
+
|
| 856 |
+
Returns:
|
| 857 |
+
True if the field is present and should count toward score, False otherwise
|
| 858 |
+
"""
|
| 859 |
+
try:
|
| 860 |
+
# Initialize dynamic field detector (cached)
|
| 861 |
+
if not hasattr(check_field_with_enhanced_results, '_detector'):
|
| 862 |
+
try:
|
| 863 |
+
if REGISTRY_AVAILABLE:
|
| 864 |
+
# Use the consolidated registry manager
|
| 865 |
+
registry_manager = get_field_registry_manager()
|
| 866 |
+
check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
|
| 867 |
+
print(f"β
Dynamic field detector initialized with registry manager")
|
| 868 |
+
else:
|
| 869 |
+
# Create registry manager from path
|
| 870 |
+
from field_registry_manager import FieldRegistryManager
|
| 871 |
+
registry_path = os.path.join(current_dir, "field_registry.json")
|
| 872 |
+
registry_manager = FieldRegistryManager(registry_path)
|
| 873 |
+
check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
|
| 874 |
+
print(f"β
Dynamic field detector initialized with fallback registry manager")
|
| 875 |
+
|
| 876 |
+
except Exception as e:
|
| 877 |
+
print(f"β Failed to initialize dynamic field detector: {e}")
|
| 878 |
+
# Final fallback
|
| 879 |
+
import os
|
| 880 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 881 |
+
registry_path = os.path.join(current_dir, "field_registry.json")
|
| 882 |
+
try:
|
| 883 |
+
check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_path)
|
| 884 |
+
print(f"π Dynamic field detector initialized with emergency fallback")
|
| 885 |
+
except Exception as final_error:
|
| 886 |
+
print(f"β Complete failure to initialize dynamic field detector: {final_error}")
|
| 887 |
+
check_field_with_enhanced_results._detector = None
|
| 888 |
+
|
| 889 |
+
detector = check_field_with_enhanced_results._detector
|
| 890 |
+
|
| 891 |
+
if detector is None:
|
| 892 |
+
print(f"β οΈ No detector available, using fallback for {field}")
|
| 893 |
+
return check_field_in_aibom(aibom, field)
|
| 894 |
+
|
| 895 |
+
# First, try dynamic detection from AIBOM structure using ENHANCED REGISTRY FORMAT
|
| 896 |
+
field_found_in_registry = False
|
| 897 |
+
|
| 898 |
+
# Use the enhanced registry structure (registry['fields'][field_name])
|
| 899 |
+
fields = detector.registry.get('fields', {})
|
| 900 |
+
if field in fields:
|
| 901 |
+
field_found_in_registry = True
|
| 902 |
+
field_config = fields[field]
|
| 903 |
+
field_path = field_config.get('jsonpath', '')
|
| 904 |
+
|
| 905 |
+
if field_path:
|
| 906 |
+
# Use dynamic detection
|
| 907 |
+
is_present, value = detector.detect_field_presence(aibom, field_path)
|
| 908 |
+
|
| 909 |
+
if is_present:
|
| 910 |
+
print(f"β
DYNAMIC: Found {field} = {value}")
|
| 911 |
+
return True
|
| 912 |
+
else:
|
| 913 |
+
print(f"β DYNAMIC: Missing {field} at {field_path}")
|
| 914 |
+
else:
|
| 915 |
+
print(f"β οΈ Field '{field}' has no jsonpath defined in registry")
|
| 916 |
+
|
| 917 |
+
# If field not in registry, log warning but continue
|
| 918 |
+
if not field_found_in_registry:
|
| 919 |
+
print(f"β οΈ WARNING: Field '{field}' not found in field registry")
|
| 920 |
+
|
| 921 |
+
# Second, check extraction results (existing logic)
|
| 922 |
+
if extraction_results and field in extraction_results:
|
| 923 |
+
extraction_result = extraction_results[field]
|
| 924 |
+
|
| 925 |
+
# Check if this field has actual extracted data (not just placeholder)
|
| 926 |
+
if hasattr(extraction_result, 'confidence'):
|
| 927 |
+
# Don't count fields with 'none' confidence (placeholders like NOASSERTION)
|
| 928 |
+
if extraction_result.confidence.value == 'none':
|
| 929 |
+
print(f"β EXTRACTION: {field} has 'none' confidence")
|
| 930 |
+
return False
|
| 931 |
+
# Count fields with medium or high confidence
|
| 932 |
+
is_confident = extraction_result.confidence.value in ['medium', 'high']
|
| 933 |
+
print(f"{'β
' if is_confident else 'β'} EXTRACTION: {field} confidence = {extraction_result.confidence.value}")
|
| 934 |
+
return is_confident
|
| 935 |
+
elif hasattr(extraction_result, 'value'):
|
| 936 |
+
# For simple extraction results, check if value is meaningful
|
| 937 |
+
value = extraction_result.value
|
| 938 |
+
if value in ['NOASSERTION', 'NOT_FOUND', None, '']:
|
| 939 |
+
print(f"β EXTRACTION: {field} has placeholder value: {value}")
|
| 940 |
+
return False
|
| 941 |
+
print(f"β
EXTRACTION: {field} = {value}")
|
| 942 |
+
return True
|
| 943 |
+
|
| 944 |
+
# Third, fallback to original AIBOM detection
|
| 945 |
+
print(f"π FALLBACK: Using original detection for {field}")
|
| 946 |
+
return check_field_in_aibom(aibom, field)
|
| 947 |
+
|
| 948 |
+
except Exception as e:
|
| 949 |
+
print(f"β Error in enhanced field detection for {field}: {e}")
|
| 950 |
+
return check_field_in_aibom(aibom, field)
|
| 951 |
|
| 952 |
+
|
| 953 |
+
def calculate_industry_neutral_score(aibom: Dict[str, Any], extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 954 |
"""
|
| 955 |
Calculate completeness score using industry best practices with proper normalization and penalties.
|
| 956 |
|
|
|
|
| 989 |
# Count total fields in this category
|
| 990 |
fields_by_category[category]["total"] += 1
|
| 991 |
|
| 992 |
+
# Enhanced field detection using extraction results
|
| 993 |
+
is_present = check_field_with_enhanced_results(aibom, field, extraction_results)
|
| 994 |
|
| 995 |
if is_present:
|
| 996 |
fields_by_category[category]["present"] += 1
|
|
|
|
| 1012 |
category_scores[category] = round(raw_score, 1)
|
| 1013 |
else:
|
| 1014 |
category_scores[category] = 0.0
|
| 1015 |
+
|
| 1016 |
+
# Log field extraction summary
|
| 1017 |
+
total_fields = sum(counts["total"] for counts in fields_by_category.values())
|
| 1018 |
+
total_present = sum(counts["present"] for counts in fields_by_category.values())
|
| 1019 |
+
|
| 1020 |
+
print(f"π SCORING SUMMARY:")
|
| 1021 |
+
print(f" Total fields evaluated: {total_fields}")
|
| 1022 |
+
print(f" Fields successfully extracted: {total_present}")
|
| 1023 |
+
print(f" Extraction success rate: {round((total_present/total_fields)*100, 1)}%")
|
| 1024 |
+
print(f" Category breakdown:")
|
| 1025 |
+
for category, counts in fields_by_category.items():
|
| 1026 |
+
percentage = round((counts["present"]/counts["total"])*100, 1) if counts["total"] > 0 else 0
|
| 1027 |
+
print(f" {category}: {counts['present']}/{counts['total']} ({percentage}%)")
|
| 1028 |
|
| 1029 |
# Calculate subtotal (sum of rounded category scores)
|
| 1030 |
subtotal_score = sum(category_scores.values())
|
|
|
|
| 1160 |
return result
|
| 1161 |
|
| 1162 |
|
| 1163 |
+
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True, extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 1164 |
"""
|
| 1165 |
Calculate completeness score for an AIBOM and optionally validate against AI requirements.
|
| 1166 |
Enhanced with industry best practices scoring.
|
|
|
|
| 1173 |
Returns:
|
| 1174 |
Dictionary containing score and validation results
|
| 1175 |
"""
|
| 1176 |
+
print(f"π DEBUG: use_best_practices={use_best_practices}")
|
| 1177 |
+
print(f"π DEBUG: extraction_results is None: {extraction_results is None}")
|
| 1178 |
+
print(f"π DEBUG: extraction_results keys: {list(extraction_results.keys()) if extraction_results else 'None'}")
|
| 1179 |
+
|
| 1180 |
+
if use_best_practices:
|
| 1181 |
+
print("π DEBUG: Calling calculate_industry_neutral_score")
|
| 1182 |
+
result = calculate_industry_neutral_score(aibom, extraction_results)
|
| 1183 |
# If using best practices scoring, use the enhanced industry-neutral approach
|
| 1184 |
if use_best_practices:
|
| 1185 |
+
result = calculate_industry_neutral_score(aibom, extraction_results)
|
| 1186 |
|
| 1187 |
# Add validation if requested
|
| 1188 |
if validate:
|
|
|
|
| 1659 |
summary += f"\nCompleteness Profile: {profile['name']}\n"
|
| 1660 |
summary += f"Description: {profile['description']}\n"
|
| 1661 |
|
| 1662 |
+
return summary
|
| 1663 |
+
|
| 1664 |
+
def test_consolidated_integration():
|
| 1665 |
+
"""Test that consolidated field registry manager integration is working"""
|
| 1666 |
+
try:
|
| 1667 |
+
print("\nπ§ͺ Testing Consolidated Integration...")
|
| 1668 |
+
|
| 1669 |
+
# Test registry availability
|
| 1670 |
+
if REGISTRY_AVAILABLE:
|
| 1671 |
+
print("β
Consolidated registry manager available")
|
| 1672 |
+
|
| 1673 |
+
# Test registry manager
|
| 1674 |
+
manager = get_field_registry_manager()
|
| 1675 |
+
print(f"β
Registry manager initialized: {manager.registry_path}")
|
| 1676 |
+
|
| 1677 |
+
# Test field classification generation
|
| 1678 |
+
field_count = len(FIELD_CLASSIFICATION)
|
| 1679 |
+
print(f"β
FIELD_CLASSIFICATION loaded: {field_count} fields")
|
| 1680 |
+
|
| 1681 |
+
# Test completeness profiles
|
| 1682 |
+
profile_count = len(COMPLETENESS_PROFILES)
|
| 1683 |
+
print(f"β
COMPLETENESS_PROFILES loaded: {profile_count} profiles")
|
| 1684 |
+
|
| 1685 |
+
# Test validation messages
|
| 1686 |
+
message_count = len(VALIDATION_MESSAGES)
|
| 1687 |
+
print(f"β
VALIDATION_MESSAGES loaded: {message_count} messages")
|
| 1688 |
+
|
| 1689 |
+
# Test scoring weights
|
| 1690 |
+
tier_weights = SCORING_WEIGHTS.get("tier_weights", {})
|
| 1691 |
+
category_weights = SCORING_WEIGHTS.get("category_weights", {})
|
| 1692 |
+
print(f"β
SCORING_WEIGHTS loaded: {len(tier_weights)} tiers, {len(category_weights)} categories")
|
| 1693 |
+
|
| 1694 |
+
else:
|
| 1695 |
+
print("β οΈ Consolidated registry manager not available, using hardcoded definitions")
|
| 1696 |
+
|
| 1697 |
+
# Test dynamic field detector (DynamicFieldDetector)
|
| 1698 |
+
if hasattr(check_field_with_enhanced_results, '_detector') and check_field_with_enhanced_results._detector:
|
| 1699 |
+
print(f"β
Dynamic field detector ready")
|
| 1700 |
+
else:
|
| 1701 |
+
print(f"β οΈ Dynamic field detector not initialized")
|
| 1702 |
+
|
| 1703 |
+
# Test field lookup
|
| 1704 |
+
test_fields = ["bomFormat", "primaryPurpose", "energyConsumption"]
|
| 1705 |
+
for field in test_fields:
|
| 1706 |
+
if field in FIELD_CLASSIFICATION:
|
| 1707 |
+
field_info = FIELD_CLASSIFICATION[field]
|
| 1708 |
+
print(f"β
Field '{field}': tier={field_info['tier']}, category={field_info['category']}")
|
| 1709 |
+
else:
|
| 1710 |
+
print(f"β Field '{field}' not found in FIELD_CLASSIFICATION")
|
| 1711 |
+
|
| 1712 |
+
print("π Consolidated integration test completed!")
|
| 1713 |
+
return True
|
| 1714 |
+
|
| 1715 |
+
except Exception as e:
|
| 1716 |
+
print(f"β Consolidated integration test failed: {e}")
|
| 1717 |
+
import traceback
|
| 1718 |
+
traceback.print_exc()
|
| 1719 |
+
return False
|
| 1720 |
+
|
| 1721 |
+
# Uncomment this line to run the test automatically when utils.py is imported
|
| 1722 |
+
test_consolidated_integration()
|