e2hln committed on
Commit
fba8f57
·
verified ·
1 Parent(s): d18aa7a

Upload 2 files

Browse files
src/aibom-generator/enhanced_extractor.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AI SBOM Generator
4
 
5
  This module provides a fully configurable enhanced data extraction system that
6
  automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
@@ -176,6 +176,51 @@ class EnhancedExtractor:
176
  # Compile all patterns
177
  for category, pattern_list in self.patterns.items():
178
  self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
181
  """
@@ -333,6 +378,19 @@ class EnhancedExtractor:
333
  )
334
  extraction_methods.append("intelligent_inference")
335
  return inferred_value
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
  # Strategy 6: Fallback value (if configured)
338
  fallback_value = self._try_fallback_value(field_name, field_config)
@@ -372,7 +430,9 @@ class EnhancedExtractor:
372
  'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
373
  'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
374
  'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
375
- 'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main"
 
 
376
  }
377
 
378
  if field_name in api_mappings:
@@ -653,6 +713,12 @@ class EnhancedExtractor:
653
  tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
654
  if tokenizer_config:
655
  metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")
 
 
 
 
 
 
656
 
657
  except Exception as e:
658
  logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")
@@ -669,6 +735,15 @@ class EnhancedExtractor:
669
  if readme_content:
670
  extracted_info = self._extract_from_text(readme_content)
671
  metadata.update(extracted_info)
 
 
 
 
 
 
 
 
 
672
  except Exception as e:
673
  logger.warning(f"⚠️ Legacy Layer 3: Error in Smart Text Parsing: {e}")
674
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AIBOM Generator
4
 
5
  This module provides a fully configurable enhanced data extraction system that
6
  automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
 
176
  # Compile all patterns
177
  for category, pattern_list in self.patterns.items():
178
  self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
179
+
180
+
181
# SPDX identifiers for licence names commonly found in LICENSE file headers.
# Grouped per licence family as (aliases, SPDX id) and expanded into a flat
# alias -> SPDX lookup table. Alias order is preserved exactly, because the
# detection code iterates this mapping in insertion order.
_LICENSE_ALIAS_GROUPS = [
    (("mit", "mit license"), "MIT"),
    (
        (
            "apache license version 2.0",
            "apache license 2.0",
            "apache 2.0",
            "apache license, version 2.0",
        ),
        "Apache-2.0",
    ),
    (("bsd 3-clause", "bsd-3-clause"), "BSD-3-Clause"),
    (("bsd 2-clause", "bsd-2-clause"), "BSD-2-Clause"),
    (("gnu general public license v3", "gplv3"), "GPL-3.0-only"),
    (("gnu general public license v2", "gplv2"), "GPL-2.0-only"),
]
LICENSE_MAPPINGS = {
    alias: spdx_id
    for aliases, spdx_id in _LICENSE_ALIAS_GROUPS
    for alias in aliases
}
198
+
199
def _detect_license_from_file(self, model_id: str) -> Optional[str]:
    """
    Attempt to detect a licence by inspecting repository files.

    Downloads common licence filenames (e.g. LICENSE, LICENSE.md), reads a
    small snippet of each, and returns the matching SPDX identifier, or
    None if no known licence header is found.

    Args:
        model_id: Hugging Face repository id (e.g. "org/model").

    Returns:
        An SPDX licence identifier string, or None if detection fails.
    """
    license_filenames = ["LICENSE", "LICENSE.txt", "LICENSE.md", "LICENSE.rst", "COPYING"]
    # Try longer (more specific) aliases first, and match on word boundaries
    # so a short alias like "mit" cannot fire inside words such as
    # "limitations" or "permitted" that appear in other licence texts.
    ordered_mappings = sorted(
        self.LICENSE_MAPPINGS.items(), key=lambda item: len(item[0]), reverse=True
    )
    for filename in license_filenames:
        try:
            file_path = hf_hub_download(repo_id=model_id, filename=filename)
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                # Collapse all whitespace: licence headers commonly wrap
                # across lines (e.g. "Apache License\n Version 2.0"), which
                # would otherwise defeat multi-word alias matching.
                snippet = " ".join(f.read(4096).lower().split())
            for header, spdx_id in ordered_mappings:
                if re.search(r"(?<!\w)" + re.escape(header) + r"(?!\w)", snippet):
                    return spdx_id
        except (RepositoryNotFoundError, EntryNotFoundError):
            # This particular file doesn't exist in the repo; try the next name.
            continue
        except Exception as e:
            # Best-effort detection: record which file failed and keep going.
            logger.debug(f"Licence detection error reading {filename}: {e}")
            continue
    return None
222
+
223
+
224
 
225
  def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
226
  """
 
378
  )
379
  extraction_methods.append("intelligent_inference")
380
  return inferred_value
381
+
382
+ # detect licence from repository files if the field is licence/ licences
383
+ if field_name in {"license", "licenses"}:
384
+ detected = self._detect_license_from_file(context["model_id"])
385
+ if detected:
386
+ self.extraction_results[field_name] = ExtractionResult(
387
+ value=detected,
388
+ source=DataSource.REPOSITORY_FILES,
389
+ confidence=ConfidenceLevel.MEDIUM,
390
+ extraction_method="license_file",
391
+ fallback_chain=extraction_methods,
392
+ )
393
+ return detected
394
 
395
  # Strategy 6: Fallback value (if configured)
396
  fallback_value = self._try_fallback_value(field_name, field_config)
 
430
  'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
431
  'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
432
  'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
433
+ 'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main",
434
+ 'license': lambda info: getattr(info.card_data, 'license', None) if hasattr(info, 'card_data') and info.card_data else None,
435
+ 'licenses': lambda info: getattr(info.card_data, 'license', None) if hasattr(info, 'card_data') and info.card_data else None
436
  }
437
 
438
  if field_name in api_mappings:
 
713
  tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
714
  if tokenizer_config:
715
  metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")
716
+
717
+ # try to detect licence from repository files if licence is missing
718
+ if "license" not in metadata or not metadata["license"]:
719
+ detected_license = self._detect_license_from_file(model_id)
720
+ if detected_license:
721
+ metadata["license"] = detected_license
722
 
723
  except Exception as e:
724
  logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")
 
735
  if readme_content:
736
  extracted_info = self._extract_from_text(readme_content)
737
  metadata.update(extracted_info)
738
+
739
+ # promote licence found in README into main metadata if no licence exists yet
740
+ license_from_text = extracted_info.get("license_from_text")
741
+ if license_from_text and not metadata.get("license"):
742
+ if isinstance(license_from_text, list):
743
+ metadata["license"] = license_from_text[0]
744
+ else:
745
+ metadata["license"] = license_from_text
746
+
747
  except Exception as e:
748
  logger.warning(f"⚠️ Legacy Layer 3: Error in Smart Text Parsing: {e}")
749
 
src/aibom-generator/generator.py CHANGED
@@ -484,6 +484,7 @@ class AIBOMGenerator:
484
  }]
485
  }
486
 
 
487
  # Create authors array
488
  authors = []
489
  if "author" in metadata and metadata["author"]:
@@ -600,14 +601,22 @@ class AIBOMGenerator:
600
  }
601
 
602
  # Handle license
603
- if metadata and "license" in metadata and metadata["license"]:
 
 
 
 
 
 
 
 
604
  component["licenses"] = [{
605
  "license": {
606
- "id": metadata["license"],
607
- "url": self._get_license_url(metadata["license"])
608
  }
609
  }]
610
- print(f"βœ… COMPONENT: Added license = {metadata['license']}")
611
  else:
612
  component["licenses"] = [{
613
  "license": {
@@ -620,57 +629,35 @@ class AIBOMGenerator:
620
  # ALWAYS add description
621
  component["description"] = metadata.get("description", f"AI model {model_id}")
622
 
623
- # Add enhanced technical properties to component
624
- technical_properties = []
625
-
626
- # Add model type information
627
- if "model_type" in metadata:
628
- technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
629
- print(f"βœ… COMPONENT: Added model_type = {metadata['model_type']}")
630
-
631
- # Add tokenizer information
632
- if "tokenizer_class" in metadata:
633
- technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
634
- print(f"βœ… COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
635
-
636
- # Add architecture information
637
- if "architectures" in metadata:
638
- arch_value = metadata["architectures"]
639
- if isinstance(arch_value, list):
640
- arch_value = ", ".join(arch_value)
641
- technical_properties.append({"name": "architectures", "value": str(arch_value)})
642
- print(f"βœ… COMPONENT: Added architectures = {arch_value}")
643
-
644
- # Add library information
645
- if "library_name" in metadata:
646
- technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
647
- print(f"βœ… COMPONENT: Added library_name = {metadata['library_name']}")
648
-
649
- # Add technical properties to component if any exist
650
- if technical_properties:
651
  component["properties"] = technical_properties
652
- # Debug
653
- print(f"DEBUG: License in metadata: {'license' in metadata}" )
654
- if "license" in metadata:
655
- print(f"DEBUG: Adding licenses = {metadata['license']}")
656
-
657
- # ALWAYS add description
658
- component["description"] = metadata.get("description", f"AI model {model_id}")
659
- if metadata.get("license"):
660
- component["licenses"] = [{
661
- "license": {
662
- "id": metadata["license"],
663
- "url": self._get_license_url(metadata["license"])
664
- }
665
- }]
666
- else:
667
- component["licenses"] = [{
668
- "license": {
669
- "id": "unknown",
670
- "url": "https://spdx.org/licenses/"
671
- }
672
- }]
673
-
674
 
675
  # Add external references
676
  external_refs = [{
 
484
  }]
485
  }
486
 
487
+
488
  # Create authors array
489
  authors = []
490
  if "author" in metadata and metadata["author"]:
 
601
  }
602
 
603
  # Handle license
604
+ license_value = None
605
+ if metadata and "licenses" in metadata and metadata["licenses"]:
606
+ license_value = metadata["licenses"]
607
+ print(f"βœ… COMPONENT: Found licenses = {license_value}")
608
+ elif metadata and "license" in metadata and metadata["license"]:
609
+ license_value = metadata["license"]
610
+ print(f"βœ… COMPONENT: Found license = {license_value}")
611
+
612
+ if license_value:
613
  component["licenses"] = [{
614
  "license": {
615
+ "id": license_value,
616
+ "url": self._get_license_url(license_value)
617
  }
618
  }]
619
+ print(f"βœ… COMPONENT: Added license = {license_value}")
620
  else:
621
  component["licenses"] = [{
622
  "license": {
 
629
  # ALWAYS add description
630
  component["description"] = metadata.get("description", f"AI model {model_id}")
631
 
632
+ # Add enhanced technical properties to component
633
+ technical_properties = []
634
+
635
+ # Add model type information
636
+ if "model_type" in metadata:
637
+ technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
638
+ print(f"βœ… COMPONENT: Added model_type = {metadata['model_type']}")
639
+
640
+ # Add tokenizer information
641
+ if "tokenizer_class" in metadata:
642
+ technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
643
+ print(f"βœ… COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
644
+
645
+ # Add architecture information
646
+ if "architectures" in metadata:
647
+ arch_value = metadata["architectures"]
648
+ if isinstance(arch_value, list):
649
+ arch_value = ", ".join(arch_value)
650
+ technical_properties.append({"name": "architectures", "value": str(arch_value)})
651
+ print(f"βœ… COMPONENT: Added architectures = {arch_value}")
652
+
653
+ # Add library information
654
+ if "library_name" in metadata:
655
+ technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
656
+ print(f"βœ… COMPONENT: Added library_name = {metadata['library_name']}")
657
+
658
+ # Add technical properties to component if any exist
659
+ if technical_properties:
660
  component["properties"] = technical_properties
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
 
662
  # Add external references
663
  external_refs = [{