e2hln committed on
Commit
fba8f57
·
verified ·
1 Parent(s): d18aa7a

Upload 2 files

Browse files
src/aibom-generator/enhanced_extractor.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AI SBOM Generator
4
 
5
  This module provides a fully configurable enhanced data extraction system that
6
  automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
@@ -176,6 +176,51 @@ class EnhancedExtractor:
176
  # Compile all patterns
177
  for category, pattern_list in self.patterns.items():
178
  self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
181
  """
@@ -333,6 +378,19 @@ class EnhancedExtractor:
333
  )
334
  extraction_methods.append("intelligent_inference")
335
  return inferred_value
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
  # Strategy 6: Fallback value (if configured)
338
  fallback_value = self._try_fallback_value(field_name, field_config)
@@ -372,7 +430,9 @@ class EnhancedExtractor:
372
  'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
373
  'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
374
  'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
375
- 'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main"
 
 
376
  }
377
 
378
  if field_name in api_mappings:
@@ -653,6 +713,12 @@ class EnhancedExtractor:
653
  tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
654
  if tokenizer_config:
655
  metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")
 
 
 
 
 
 
656
 
657
  except Exception as e:
658
  logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")
@@ -669,6 +735,15 @@ class EnhancedExtractor:
669
  if readme_content:
670
  extracted_info = self._extract_from_text(readme_content)
671
  metadata.update(extracted_info)
 
 
 
 
 
 
 
 
 
672
  except Exception as e:
673
  logger.warning(f"⚠️ Legacy Layer 3: Error in Smart Text Parsing: {e}")
674
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AIBOM Generator
4
 
5
  This module provides a fully configurable enhanced data extraction system that
6
  automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
 
176
  # Compile all patterns
177
  for category, pattern_list in self.patterns.items():
178
  self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
179
+
180
+
181
# SPDX identifiers for licence names commonly found in LICENSE file headers.
# Grouped per licence family as (aliases, SPDX id) and expanded into a flat
# alias -> SPDX lookup table. Alias order is preserved exactly, because the
# detection code iterates this mapping in insertion order.
_LICENSE_ALIAS_GROUPS = [
    (("mit", "mit license"), "MIT"),
    (
        (
            "apache license version 2.0",
            "apache license 2.0",
            "apache 2.0",
            "apache license, version 2.0",
        ),
        "Apache-2.0",
    ),
    (("bsd 3-clause", "bsd-3-clause"), "BSD-3-Clause"),
    (("bsd 2-clause", "bsd-2-clause"), "BSD-2-Clause"),
    (("gnu general public license v3", "gplv3"), "GPL-3.0-only"),
    (("gnu general public license v2", "gplv2"), "GPL-2.0-only"),
]
LICENSE_MAPPINGS = {
    alias: spdx_id
    for aliases, spdx_id in _LICENSE_ALIAS_GROUPS
    for alias in aliases
}
198
+
199
def _detect_license_from_file(self, model_id: str) -> Optional[str]:
    """
    Attempt to detect a licence by inspecting repository files.

    Downloads common licence filenames (e.g. LICENSE, LICENSE.md), reads a
    small snippet of each, and returns the matching SPDX identifier, or
    None if no known licence header is found.

    Args:
        model_id: Hugging Face repository id (e.g. "org/model").

    Returns:
        An SPDX licence identifier string, or None if detection fails.
    """
    license_filenames = ["LICENSE", "LICENSE.txt", "LICENSE.md", "LICENSE.rst", "COPYING"]
    # Try longer (more specific) aliases first, and match on word boundaries
    # so a short alias like "mit" cannot fire inside words such as
    # "limitations" or "permitted" that appear in other licence texts.
    ordered_mappings = sorted(
        self.LICENSE_MAPPINGS.items(), key=lambda item: len(item[0]), reverse=True
    )
    for filename in license_filenames:
        try:
            file_path = hf_hub_download(repo_id=model_id, filename=filename)
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                # Collapse all whitespace: licence headers commonly wrap
                # across lines (e.g. "Apache License\n Version 2.0"), which
                # would otherwise defeat multi-word alias matching.
                snippet = " ".join(f.read(4096).lower().split())
            for header, spdx_id in ordered_mappings:
                if re.search(r"(?<!\w)" + re.escape(header) + r"(?!\w)", snippet):
                    return spdx_id
        except (RepositoryNotFoundError, EntryNotFoundError):
            # This particular file doesn't exist in the repo; try the next name.
            continue
        except Exception as e:
            # Best-effort detection: record which file failed and keep going.
            logger.debug(f"Licence detection error reading {filename}: {e}")
            continue
    return None
222
+
223
+
224
 
225
  def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
226
  """
 
378
  )
379
  extraction_methods.append("intelligent_inference")
380
  return inferred_value
381
+
382
+ # detect licence from repository files if the field is licence/ licences
383
+ if field_name in {"license", "licenses"}:
384
+ detected = self._detect_license_from_file(context["model_id"])
385
+ if detected:
386
+ self.extraction_results[field_name] = ExtractionResult(
387
+ value=detected,
388
+ source=DataSource.REPOSITORY_FILES,
389
+ confidence=ConfidenceLevel.MEDIUM,
390
+ extraction_method="license_file",
391
+ fallback_chain=extraction_methods,
392
+ )
393
+ return detected
394
 
395
  # Strategy 6: Fallback value (if configured)
396
  fallback_value = self._try_fallback_value(field_name, field_config)
 
430
  'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
431
  'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
432
  'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
433
+ 'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main",
434
+ 'license': lambda info: getattr(info.card_data, 'license', None) if hasattr(info, 'card_data') and info.card_data else None,
435
+ 'licenses': lambda info: getattr(info.card_data, 'license', None) if hasattr(info, 'card_data') and info.card_data else None
436
  }
437
 
438
  if field_name in api_mappings:
 
713
  tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
714
  if tokenizer_config:
715
  metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")
716
+
717
+ # try to detect licence from repository files if licence is missing
718
+ if "license" not in metadata or not metadata["license"]:
719
+ detected_license = self._detect_license_from_file(model_id)
720
+ if detected_license:
721
+ metadata["license"] = detected_license
722
 
723
  except Exception as e:
724
  logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")
 
735
  if readme_content:
736
  extracted_info = self._extract_from_text(readme_content)
737
  metadata.update(extracted_info)
738
+
739
+ # promote licence found in README into main metadata if no licence exists yet
740
+ license_from_text = extracted_info.get("license_from_text")
741
+ if license_from_text and not metadata.get("license"):
742
+ if isinstance(license_from_text, list):
743
+ metadata["license"] = license_from_text[0]
744
+ else:
745
+ metadata["license"] = license_from_text
746
+
747
  except Exception as e:
748
  logger.warning(f"⚠️ Legacy Layer 3: Error in Smart Text Parsing: {e}")
749
 
src/aibom-generator/generator.py CHANGED
@@ -484,6 +484,7 @@ class AIBOMGenerator:
484
  }]
485
  }
486
 
 
487
  # Create authors array
488
  authors = []
489
  if "author" in metadata and metadata["author"]:
@@ -600,14 +601,22 @@ class AIBOMGenerator:
600
  }
601
 
602
  # Handle license
603
- if metadata and "license" in metadata and metadata["license"]:
 
 
 
 
 
 
 
 
604
  component["licenses"] = [{
605
  "license": {
606
- "id": metadata["license"],
607
- "url": self._get_license_url(metadata["license"])
608
  }
609
  }]
610
- print(f"βœ… COMPONENT: Added license = {metadata['license']}")
611
  else:
612
  component["licenses"] = [{
613
  "license": {
@@ -620,57 +629,35 @@ class AIBOMGenerator:
620
  # ALWAYS add description
621
  component["description"] = metadata.get("description", f"AI model {model_id}")
622
 
623
- # Add enhanced technical properties to component
624
- technical_properties = []
625
-
626
- # Add model type information
627
- if "model_type" in metadata:
628
- technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
629
- print(f"βœ… COMPONENT: Added model_type = {metadata['model_type']}")
630
-
631
- # Add tokenizer information
632
- if "tokenizer_class" in metadata:
633
- technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
634
- print(f"βœ… COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
635
-
636
- # Add architecture information
637
- if "architectures" in metadata:
638
- arch_value = metadata["architectures"]
639
- if isinstance(arch_value, list):
640
- arch_value = ", ".join(arch_value)
641
- technical_properties.append({"name": "architectures", "value": str(arch_value)})
642
- print(f"βœ… COMPONENT: Added architectures = {arch_value}")
643
-
644
- # Add library information
645
- if "library_name" in metadata:
646
- technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
647
- print(f"βœ… COMPONENT: Added library_name = {metadata['library_name']}")
648
-
649
- # Add technical properties to component if any exist
650
- if technical_properties:
651
  component["properties"] = technical_properties
652
- # Debug
653
- print(f"DEBUG: License in metadata: {'license' in metadata}" )
654
- if "license" in metadata:
655
- print(f"DEBUG: Adding licenses = {metadata['license']}")
656
-
657
- # ALWAYS add description
658
- component["description"] = metadata.get("description", f"AI model {model_id}")
659
- if metadata.get("license"):
660
- component["licenses"] = [{
661
- "license": {
662
- "id": metadata["license"],
663
- "url": self._get_license_url(metadata["license"])
664
- }
665
- }]
666
- else:
667
- component["licenses"] = [{
668
- "license": {
669
- "id": "unknown",
670
- "url": "https://spdx.org/licenses/"
671
- }
672
- }]
673
-
674
 
675
  # Add external references
676
  external_refs = [{
 
484
  }]
485
  }
486
 
487
+
488
  # Create authors array
489
  authors = []
490
  if "author" in metadata and metadata["author"]:
 
601
  }
602
 
603
  # Handle license
604
+ license_value = None
605
+ if metadata and "licenses" in metadata and metadata["licenses"]:
606
+ license_value = metadata["licenses"]
607
+ print(f"βœ… COMPONENT: Found licenses = {license_value}")
608
+ elif metadata and "license" in metadata and metadata["license"]:
609
+ license_value = metadata["license"]
610
+ print(f"βœ… COMPONENT: Found license = {license_value}")
611
+
612
+ if license_value:
613
  component["licenses"] = [{
614
  "license": {
615
+ "id": license_value,
616
+ "url": self._get_license_url(license_value)
617
  }
618
  }]
619
+ print(f"βœ… COMPONENT: Added license = {license_value}")
620
  else:
621
  component["licenses"] = [{
622
  "license": {
 
629
  # ALWAYS add description
630
  component["description"] = metadata.get("description", f"AI model {model_id}")
631
 
632
+ # Add enhanced technical properties to component
633
+ technical_properties = []
634
+
635
+ # Add model type information
636
+ if "model_type" in metadata:
637
+ technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
638
+ print(f"βœ… COMPONENT: Added model_type = {metadata['model_type']}")
639
+
640
+ # Add tokenizer information
641
+ if "tokenizer_class" in metadata:
642
+ technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
643
+ print(f"βœ… COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
644
+
645
+ # Add architecture information
646
+ if "architectures" in metadata:
647
+ arch_value = metadata["architectures"]
648
+ if isinstance(arch_value, list):
649
+ arch_value = ", ".join(arch_value)
650
+ technical_properties.append({"name": "architectures", "value": str(arch_value)})
651
+ print(f"βœ… COMPONENT: Added architectures = {arch_value}")
652
+
653
+ # Add library information
654
+ if "library_name" in metadata:
655
+ technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
656
+ print(f"βœ… COMPONENT: Added library_name = {metadata['library_name']}")
657
+
658
+ # Add technical properties to component if any exist
659
+ if technical_properties:
660
  component["properties"] = technical_properties
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
 
662
  # Add external references
663
  external_refs = [{