Upload 2 files
Browse files
src/aibom-generator/enhanced_extractor.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for
|
| 4 |
|
| 5 |
This module provides a fully configurable enhanced data extraction system that
|
| 6 |
automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
|
|
@@ -176,6 +176,51 @@ class EnhancedExtractor:
|
|
| 176 |
# Compile all patterns
|
| 177 |
for category, pattern_list in self.patterns.items():
|
| 178 |
self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
|
| 181 |
"""
|
|
@@ -333,6 +378,19 @@ class EnhancedExtractor:
|
|
| 333 |
)
|
| 334 |
extraction_methods.append("intelligent_inference")
|
| 335 |
return inferred_value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
# Strategy 6: Fallback value (if configured)
|
| 338 |
fallback_value = self._try_fallback_value(field_name, field_config)
|
|
@@ -372,7 +430,9 @@ class EnhancedExtractor:
|
|
| 372 |
'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
|
| 373 |
'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
| 374 |
'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
|
| 375 |
-
'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main"
|
|
|
|
|
|
|
| 376 |
}
|
| 377 |
|
| 378 |
if field_name in api_mappings:
|
|
@@ -653,6 +713,12 @@ class EnhancedExtractor:
|
|
| 653 |
tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
|
| 654 |
if tokenizer_config:
|
| 655 |
metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
|
| 657 |
except Exception as e:
|
| 658 |
logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")
|
|
@@ -669,6 +735,15 @@ class EnhancedExtractor:
|
|
| 669 |
if readme_content:
|
| 670 |
extracted_info = self._extract_from_text(readme_content)
|
| 671 |
metadata.update(extracted_info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
except Exception as e:
|
| 673 |
logger.warning(f"⚠️ Legacy Layer 3: Error in Smart Text Parsing: {e}")
|
| 674 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AIBOM Generator
|
| 4 |
|
| 5 |
This module provides a fully configurable enhanced data extraction system that
|
| 6 |
automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
|
|
|
|
| 176 |
# Compile all patterns
|
| 177 |
for category, pattern_list in self.patterns.items():
|
| 178 |
self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# SPDX mappings for common licences
|
| 182 |
+
LICENSE_MAPPINGS = {
|
| 183 |
+
"mit": "MIT",
|
| 184 |
+
"mit license": "MIT",
|
| 185 |
+
"apache license version 2.0": "Apache-2.0",
|
| 186 |
+
"apache license 2.0": "Apache-2.0",
|
| 187 |
+
"apache 2.0": "Apache-2.0",
|
| 188 |
+
"apache license, version 2.0": "Apache-2.0",
|
| 189 |
+
"bsd 3-clause": "BSD-3-Clause",
|
| 190 |
+
"bsd-3-clause": "BSD-3-Clause",
|
| 191 |
+
"bsd 2-clause": "BSD-2-Clause",
|
| 192 |
+
"bsd-2-clause": "BSD-2-Clause",
|
| 193 |
+
"gnu general public license v3": "GPL-3.0-only",
|
| 194 |
+
"gplv3": "GPL-3.0-only",
|
| 195 |
+
"gnu general public license v2": "GPL-2.0-only",
|
| 196 |
+
"gplv2": "GPL-2.0-only",
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
def _detect_license_from_file(self, model_id: str) -> Optional[str]:
|
| 200 |
+
"""
|
| 201 |
+
Attempt to detect a licence by looking at repository files.
|
| 202 |
+
Downloads common licence filenames (e.g. LICENSE, LICENSE.md),
|
| 203 |
+
reads a small snippet, and returns the matching SPDX identifier,
|
| 204 |
+
or None if none match.
|
| 205 |
+
"""
|
| 206 |
+
license_filenames = ["LICENSE", "LICENSE.txt", "LICENSE.md", "LICENSE.rst", "COPYING"]
|
| 207 |
+
for filename in license_filenames:
|
| 208 |
+
try:
|
| 209 |
+
file_path = hf_hub_download(repo_id=model_id, filename=filename)
|
| 210 |
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 211 |
+
snippet = f.read(4096).lower()
|
| 212 |
+
for header, spdx_id in self.LICENSE_MAPPINGS.items():
|
| 213 |
+
if header in snippet:
|
| 214 |
+
return spdx_id
|
| 215 |
+
except (RepositoryNotFoundError, EntryNotFoundError):
|
| 216 |
+
# file doesn't exist; continue
|
| 217 |
+
continue
|
| 218 |
+
except Exception as e:
|
| 219 |
+
logger.debug(f"Licence detection error reading {filename}: {e}")
|
| 220 |
+
continue
|
| 221 |
+
return None
|
| 222 |
+
|
| 223 |
+
|
| 224 |
|
| 225 |
def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
|
| 226 |
"""
|
|
|
|
| 378 |
)
|
| 379 |
extraction_methods.append("intelligent_inference")
|
| 380 |
return inferred_value
|
| 381 |
+
|
| 382 |
+
# detect licence from repository files if the field is licence/ licences
|
| 383 |
+
if field_name in {"license", "licenses"}:
|
| 384 |
+
detected = self._detect_license_from_file(context["model_id"])
|
| 385 |
+
if detected:
|
| 386 |
+
self.extraction_results[field_name] = ExtractionResult(
|
| 387 |
+
value=detected,
|
| 388 |
+
source=DataSource.REPOSITORY_FILES,
|
| 389 |
+
confidence=ConfidenceLevel.MEDIUM,
|
| 390 |
+
extraction_method="license_file",
|
| 391 |
+
fallback_chain=extraction_methods,
|
| 392 |
+
)
|
| 393 |
+
return detected
|
| 394 |
|
| 395 |
# Strategy 6: Fallback value (if configured)
|
| 396 |
fallback_value = self._try_fallback_value(field_name, field_config)
|
|
|
|
| 430 |
'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
|
| 431 |
'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
| 432 |
'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
|
| 433 |
+
'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main",
|
| 434 |
+
'license': lambda info: getattr(info.card_data, 'license', None) if hasattr(info, 'card_data') and info.card_data else None,
|
| 435 |
+
'licenses': lambda info: getattr(info.card_data, 'license', None) if hasattr(info, 'card_data') and info.card_data else None
|
| 436 |
}
|
| 437 |
|
| 438 |
if field_name in api_mappings:
|
|
|
|
| 713 |
tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
|
| 714 |
if tokenizer_config:
|
| 715 |
metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")
|
| 716 |
+
|
| 717 |
+
# try to detect licence from repository files if licence is missing
|
| 718 |
+
if "license" not in metadata or not metadata["license"]:
|
| 719 |
+
detected_license = self._detect_license_from_file(model_id)
|
| 720 |
+
if detected_license:
|
| 721 |
+
metadata["license"] = detected_license
|
| 722 |
|
| 723 |
except Exception as e:
|
| 724 |
logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")
|
|
|
|
| 735 |
if readme_content:
|
| 736 |
extracted_info = self._extract_from_text(readme_content)
|
| 737 |
metadata.update(extracted_info)
|
| 738 |
+
|
| 739 |
+
# promote licence found in README into main metadata if no licence exists yet
|
| 740 |
+
license_from_text = extracted_info.get("license_from_text")
|
| 741 |
+
if license_from_text and not metadata.get("license"):
|
| 742 |
+
if isinstance(license_from_text, list):
|
| 743 |
+
metadata["license"] = license_from_text[0]
|
| 744 |
+
else:
|
| 745 |
+
metadata["license"] = license_from_text
|
| 746 |
+
|
| 747 |
except Exception as e:
|
| 748 |
logger.warning(f"⚠️ Legacy Layer 3: Error in Smart Text Parsing: {e}")
|
| 749 |
|
src/aibom-generator/generator.py
CHANGED
|
@@ -484,6 +484,7 @@ class AIBOMGenerator:
|
|
| 484 |
}]
|
| 485 |
}
|
| 486 |
|
|
|
|
| 487 |
# Create authors array
|
| 488 |
authors = []
|
| 489 |
if "author" in metadata and metadata["author"]:
|
|
@@ -600,14 +601,22 @@ class AIBOMGenerator:
|
|
| 600 |
}
|
| 601 |
|
| 602 |
# Handle license
|
| 603 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
component["licenses"] = [{
|
| 605 |
"license": {
|
| 606 |
-
"id":
|
| 607 |
-
"url": self._get_license_url(
|
| 608 |
}
|
| 609 |
}]
|
| 610 |
-
print(f"✅ COMPONENT: Added license = {
|
| 611 |
else:
|
| 612 |
component["licenses"] = [{
|
| 613 |
"license": {
|
|
@@ -620,57 +629,35 @@ class AIBOMGenerator:
|
|
| 620 |
# ALWAYS add description
|
| 621 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
| 622 |
|
| 623 |
-
# Add enhanced technical properties to component
|
| 624 |
-
technical_properties = []
|
| 625 |
-
|
| 626 |
-
# Add model type information
|
| 627 |
-
if "model_type" in metadata:
|
| 628 |
-
technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
|
| 629 |
-
print(f"✅ COMPONENT: Added model_type = {metadata['model_type']}")
|
| 630 |
-
|
| 631 |
-
# Add tokenizer information
|
| 632 |
-
if "tokenizer_class" in metadata:
|
| 633 |
-
technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
|
| 634 |
-
print(f"✅ COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
|
| 635 |
-
|
| 636 |
-
# Add architecture information
|
| 637 |
-
if "architectures" in metadata:
|
| 638 |
-
arch_value = metadata["architectures"]
|
| 639 |
-
if isinstance(arch_value, list):
|
| 640 |
-
arch_value = ", ".join(arch_value)
|
| 641 |
-
technical_properties.append({"name": "architectures", "value": str(arch_value)})
|
| 642 |
-
print(f"✅ COMPONENT: Added architectures = {arch_value}")
|
| 643 |
-
|
| 644 |
-
# Add library information
|
| 645 |
-
if "library_name" in metadata:
|
| 646 |
-
technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
|
| 647 |
-
print(f"✅ COMPONENT: Added library_name = {metadata['library_name']}")
|
| 648 |
-
|
| 649 |
-
# Add technical properties to component if any exist
|
| 650 |
-
if technical_properties:
|
| 651 |
component["properties"] = technical_properties
|
| 652 |
-
# Debug
|
| 653 |
-
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
| 654 |
-
if "license" in metadata:
|
| 655 |
-
print(f"DEBUG: Adding licenses = {metadata['license']}")
|
| 656 |
-
|
| 657 |
-
# ALWAYS add description
|
| 658 |
-
component["description"] = metadata.get("description", f"AI model {model_id}")
|
| 659 |
-
if metadata.get("license"):
|
| 660 |
-
component["licenses"] = [{
|
| 661 |
-
"license": {
|
| 662 |
-
"id": metadata["license"],
|
| 663 |
-
"url": self._get_license_url(metadata["license"])
|
| 664 |
-
}
|
| 665 |
-
}]
|
| 666 |
-
else:
|
| 667 |
-
component["licenses"] = [{
|
| 668 |
-
"license": {
|
| 669 |
-
"id": "unknown",
|
| 670 |
-
"url": "https://spdx.org/licenses/"
|
| 671 |
-
}
|
| 672 |
-
}]
|
| 673 |
-
|
| 674 |
|
| 675 |
# Add external references
|
| 676 |
external_refs = [{
|
|
|
|
| 484 |
}]
|
| 485 |
}
|
| 486 |
|
| 487 |
+
|
| 488 |
# Create authors array
|
| 489 |
authors = []
|
| 490 |
if "author" in metadata and metadata["author"]:
|
|
|
|
| 601 |
}
|
| 602 |
|
| 603 |
# Handle license
|
| 604 |
+
license_value = None
|
| 605 |
+
if metadata and "licenses" in metadata and metadata["licenses"]:
|
| 606 |
+
license_value = metadata["licenses"]
|
| 607 |
+
print(f"✅ COMPONENT: Found licenses = {license_value}")
|
| 608 |
+
elif metadata and "license" in metadata and metadata["license"]:
|
| 609 |
+
license_value = metadata["license"]
|
| 610 |
+
print(f"✅ COMPONENT: Found license = {license_value}")
|
| 611 |
+
|
| 612 |
+
if license_value:
|
| 613 |
component["licenses"] = [{
|
| 614 |
"license": {
|
| 615 |
+
"id": license_value,
|
| 616 |
+
"url": self._get_license_url(license_value)
|
| 617 |
}
|
| 618 |
}]
|
| 619 |
+
print(f"✅ COMPONENT: Added license = {license_value}")
|
| 620 |
else:
|
| 621 |
component["licenses"] = [{
|
| 622 |
"license": {
|
|
|
|
| 629 |
# ALWAYS add description
|
| 630 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
| 631 |
|
| 632 |
+
# Add enhanced technical properties to component
|
| 633 |
+
technical_properties = []
|
| 634 |
+
|
| 635 |
+
# Add model type information
|
| 636 |
+
if "model_type" in metadata:
|
| 637 |
+
technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
|
| 638 |
+
print(f"✅ COMPONENT: Added model_type = {metadata['model_type']}")
|
| 639 |
+
|
| 640 |
+
# Add tokenizer information
|
| 641 |
+
if "tokenizer_class" in metadata:
|
| 642 |
+
technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
|
| 643 |
+
print(f"✅ COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
|
| 644 |
+
|
| 645 |
+
# Add architecture information
|
| 646 |
+
if "architectures" in metadata:
|
| 647 |
+
arch_value = metadata["architectures"]
|
| 648 |
+
if isinstance(arch_value, list):
|
| 649 |
+
arch_value = ", ".join(arch_value)
|
| 650 |
+
technical_properties.append({"name": "architectures", "value": str(arch_value)})
|
| 651 |
+
print(f"✅ COMPONENT: Added architectures = {arch_value}")
|
| 652 |
+
|
| 653 |
+
# Add library information
|
| 654 |
+
if "library_name" in metadata:
|
| 655 |
+
technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
|
| 656 |
+
print(f"✅ COMPONENT: Added library_name = {metadata['library_name']}")
|
| 657 |
+
|
| 658 |
+
# Add technical properties to component if any exist
|
| 659 |
+
if technical_properties:
|
| 660 |
component["properties"] = technical_properties
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
|
| 662 |
# Add external references
|
| 663 |
external_refs = [{
|