""" SPDX License Tracking for CASCADE Industry standard license tracking based on: - SPDX (Software Package Data Exchange) - Linux Foundation - HuggingFace Dataset Cards license field - Croissant metadata license property License Compatibility Rules: - Permissive (MIT, Apache-2.0) → Can derive into restrictive - Copyleft (GPL-3.0) → Derivatives must also be copyleft - NonCommercial (CC-BY-NC-*) → Propagates non-commercial restriction - ShareAlike (CC-BY-SA-*) → Derivatives must use same license - NoDerivatives (CC-BY-ND-*) → Cannot create derivatives References: - https://spdx.org/licenses/ - https://creativecommons.org/licenses/ """ from dataclasses import dataclass, field from enum import Enum from typing import Dict, List, Optional, Set, Tuple, Any class LicenseCategory(Enum): """License categories for compatibility analysis.""" PERMISSIVE = "permissive" # MIT, Apache, BSD WEAK_COPYLEFT = "weak-copyleft" # LGPL, MPL STRONG_COPYLEFT = "strong-copyleft" # GPL, AGPL CREATIVE_COMMONS = "creative-commons" PUBLIC_DOMAIN = "public-domain" # CC0, Unlicense PROPRIETARY = "proprietary" UNKNOWN = "unknown" class LicenseRestriction(Enum): """License restrictions that propagate to derivatives.""" NONE = "none" ATTRIBUTION = "attribution" # Must credit original SHARE_ALIKE = "share-alike" # Derivatives same license NON_COMMERCIAL = "non-commercial" # No commercial use NO_DERIVATIVES = "no-derivatives" # Cannot modify COPYLEFT = "copyleft" # Must open source derivatives @dataclass class SPDXLicense: """ SPDX License Information. Based on SPDX License List: https://spdx.org/licenses/ """ id: str # SPDX identifier (e.g., "MIT", "Apache-2.0") name: str # Full name category: LicenseCategory = LicenseCategory.UNKNOWN restrictions: Set[LicenseRestriction] = field(default_factory=set) osi_approved: bool = False # Open Source Initiative approved fsf_libre: bool = False # FSF Free/Libre url: Optional[str] = None # License text URL def allows_commercial(self) -> bool: """Check if license allows commercial use.""" return LicenseRestriction.NON_COMMERCIAL not in self.restrictions def allows_derivatives(self) -> bool: """Check if license allows creating derivatives.""" return LicenseRestriction.NO_DERIVATIVES not in self.restrictions def requires_attribution(self) -> bool: """Check if license requires attribution.""" return LicenseRestriction.ATTRIBUTION in self.restrictions def requires_share_alike(self) -> bool: """Check if license requires same license for derivatives.""" return ( LicenseRestriction.SHARE_ALIKE in self.restrictions or LicenseRestriction.COPYLEFT in self.restrictions ) def to_dict(self) -> Dict[str, Any]: return { "spdx_id": self.id, "name": self.name, "category": self.category.value, "restrictions": [r.value for r in self.restrictions], "osi_approved": self.osi_approved, "fsf_libre": self.fsf_libre, "url": self.url, } # SPDX License Registry - Common ML/Data licenses SPDX_LICENSES: Dict[str, SPDXLicense] = { # Public Domain "CC0-1.0": SPDXLicense( id="CC0-1.0", name="Creative Commons Zero v1.0 Universal", category=LicenseCategory.PUBLIC_DOMAIN, restrictions=set(), osi_approved=False, fsf_libre=True, url="https://creativecommons.org/publicdomain/zero/1.0/", ), "Unlicense": SPDXLicense( id="Unlicense", name="The Unlicense", category=LicenseCategory.PUBLIC_DOMAIN, restrictions=set(), osi_approved=True, fsf_libre=True, url="https://unlicense.org/", ), # Permissive "MIT": SPDXLicense( id="MIT", name="MIT License", category=LicenseCategory.PERMISSIVE, restrictions={LicenseRestriction.ATTRIBUTION}, osi_approved=True, fsf_libre=True, url="https://opensource.org/licenses/MIT", ), "Apache-2.0": SPDXLicense( id="Apache-2.0", name="Apache License 2.0", category=LicenseCategory.PERMISSIVE, restrictions={LicenseRestriction.ATTRIBUTION}, osi_approved=True, fsf_libre=True, url="https://www.apache.org/licenses/LICENSE-2.0", ), "BSD-2-Clause": SPDXLicense( id="BSD-2-Clause", name='BSD 2-Clause "Simplified" License', category=LicenseCategory.PERMISSIVE, restrictions={LicenseRestriction.ATTRIBUTION}, osi_approved=True, fsf_libre=True, url="https://opensource.org/licenses/BSD-2-Clause", ), "BSD-3-Clause": SPDXLicense( id="BSD-3-Clause", name='BSD 3-Clause "New" or "Revised" License', category=LicenseCategory.PERMISSIVE, restrictions={LicenseRestriction.ATTRIBUTION}, osi_approved=True, fsf_libre=True, url="https://opensource.org/licenses/BSD-3-Clause", ), # Creative Commons "CC-BY-4.0": SPDXLicense( id="CC-BY-4.0", name="Creative Commons Attribution 4.0", category=LicenseCategory.CREATIVE_COMMONS, restrictions={LicenseRestriction.ATTRIBUTION}, osi_approved=False, fsf_libre=True, url="https://creativecommons.org/licenses/by/4.0/", ), "CC-BY-SA-4.0": SPDXLicense( id="CC-BY-SA-4.0", name="Creative Commons Attribution ShareAlike 4.0", category=LicenseCategory.CREATIVE_COMMONS, restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.SHARE_ALIKE}, osi_approved=False, fsf_libre=True, url="https://creativecommons.org/licenses/by-sa/4.0/", ), "CC-BY-NC-4.0": SPDXLicense( id="CC-BY-NC-4.0", name="Creative Commons Attribution NonCommercial 4.0", category=LicenseCategory.CREATIVE_COMMONS, restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NON_COMMERCIAL}, osi_approved=False, fsf_libre=False, url="https://creativecommons.org/licenses/by-nc/4.0/", ), "CC-BY-NC-SA-4.0": SPDXLicense( id="CC-BY-NC-SA-4.0", name="Creative Commons Attribution NonCommercial ShareAlike 4.0", category=LicenseCategory.CREATIVE_COMMONS, restrictions={ LicenseRestriction.ATTRIBUTION, LicenseRestriction.NON_COMMERCIAL, LicenseRestriction.SHARE_ALIKE, }, osi_approved=False, fsf_libre=False, url="https://creativecommons.org/licenses/by-nc-sa/4.0/", ), "CC-BY-ND-4.0": SPDXLicense( id="CC-BY-ND-4.0", name="Creative Commons Attribution NoDerivatives 4.0", category=LicenseCategory.CREATIVE_COMMONS, restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NO_DERIVATIVES}, osi_approved=False, fsf_libre=False, url="https://creativecommons.org/licenses/by-nd/4.0/", ), # Weak Copyleft "LGPL-3.0": SPDXLicense( id="LGPL-3.0", name="GNU Lesser General Public License v3.0", category=LicenseCategory.WEAK_COPYLEFT, restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT}, osi_approved=True, fsf_libre=True, url="https://www.gnu.org/licenses/lgpl-3.0.html", ), "MPL-2.0": SPDXLicense( id="MPL-2.0", name="Mozilla Public License 2.0", category=LicenseCategory.WEAK_COPYLEFT, restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT}, osi_approved=True, fsf_libre=True, url="https://www.mozilla.org/en-US/MPL/2.0/", ), # Strong Copyleft "GPL-3.0": SPDXLicense( id="GPL-3.0", name="GNU General Public License v3.0", category=LicenseCategory.STRONG_COPYLEFT, restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT}, osi_approved=True, fsf_libre=True, url="https://www.gnu.org/licenses/gpl-3.0.html", ), "AGPL-3.0": SPDXLicense( id="AGPL-3.0", name="GNU Affero General Public License v3.0", category=LicenseCategory.STRONG_COPYLEFT, restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT}, osi_approved=True, fsf_libre=True, url="https://www.gnu.org/licenses/agpl-3.0.html", ), # ML-Specific "OpenRAIL": SPDXLicense( id="OpenRAIL", name="Open RAIL License", category=LicenseCategory.PERMISSIVE, restrictions={LicenseRestriction.ATTRIBUTION}, osi_approved=False, fsf_libre=False, url="https://huggingface.co/blog/open_rail", ), "OpenRAIL-M": SPDXLicense( id="OpenRAIL-M", name="Open RAIL-M License", category=LicenseCategory.PERMISSIVE, restrictions={LicenseRestriction.ATTRIBUTION}, osi_approved=False, fsf_libre=False, url="https://www.licenses.ai/blog/2022/8/26/bigscience-open-rail-m-license", ), # Special "other": SPDXLicense( id="other", name="Other/Custom License", category=LicenseCategory.UNKNOWN, restrictions=set(), osi_approved=False, fsf_libre=False, url=None, ), "unknown": SPDXLicense( id="unknown", name="Unknown License", category=LicenseCategory.UNKNOWN, restrictions=set(), osi_approved=False, fsf_libre=False, url=None, ), } def get_license(spdx_id: str) -> SPDXLicense: """ Get license by SPDX identifier. Args: spdx_id: SPDX license identifier (case-insensitive) Returns: SPDXLicense object (unknown if not found) """ # Normalize common variants normalized = spdx_id.strip() # Direct lookup if normalized in SPDX_LICENSES: return SPDX_LICENSES[normalized] # Case-insensitive lookup for key, license in SPDX_LICENSES.items(): if key.lower() == normalized.lower(): return license # Common aliases aliases = { "mit": "MIT", "apache": "Apache-2.0", "apache2": "Apache-2.0", "gpl": "GPL-3.0", "gpl3": "GPL-3.0", "lgpl": "LGPL-3.0", "bsd": "BSD-3-Clause", "cc0": "CC0-1.0", "cc-by": "CC-BY-4.0", "cc-by-sa": "CC-BY-SA-4.0", "cc-by-nc": "CC-BY-NC-4.0", "cc-by-nc-sa": "CC-BY-NC-SA-4.0", "cc-by-nd": "CC-BY-ND-4.0", "unlicense": "Unlicense", "public domain": "CC0-1.0", "openrail": "OpenRAIL", } lower_id = normalized.lower().replace("_", "-").replace(" ", "-") if lower_id in aliases: return SPDX_LICENSES[aliases[lower_id]] # Return unknown return SPDX_LICENSES["unknown"] @dataclass class LicenseCompatibility: """Result of license compatibility check.""" compatible: bool derived_license: Optional[SPDXLicense] = None issues: List[str] = field(default_factory=list) warnings: List[str] = field(default_factory=list) attribution_required: List[str] = field(default_factory=list) # Source IDs requiring attribution class LicenseAnalyzer: """ Analyze license compatibility for dataset derivation. Rules: 1. No-Derivatives: Cannot create derivatives 2. Share-Alike: Must use same license 3. Copyleft: Must use compatible copyleft license 4. Non-Commercial: Restriction propagates 5. Attribution: Must credit all sources """ # License compatibility matrix (can this → derive into that?) # Rows: source license category, Columns: derived license category COMPATIBILITY_MATRIX = { LicenseCategory.PUBLIC_DOMAIN: { LicenseCategory.PUBLIC_DOMAIN: True, LicenseCategory.PERMISSIVE: True, LicenseCategory.CREATIVE_COMMONS: True, LicenseCategory.WEAK_COPYLEFT: True, LicenseCategory.STRONG_COPYLEFT: True, LicenseCategory.PROPRIETARY: True, }, LicenseCategory.PERMISSIVE: { LicenseCategory.PUBLIC_DOMAIN: False, LicenseCategory.PERMISSIVE: True, LicenseCategory.CREATIVE_COMMONS: True, LicenseCategory.WEAK_COPYLEFT: True, LicenseCategory.STRONG_COPYLEFT: True, LicenseCategory.PROPRIETARY: True, }, LicenseCategory.CREATIVE_COMMONS: { LicenseCategory.PUBLIC_DOMAIN: False, LicenseCategory.PERMISSIVE: False, # Depends on specific CC LicenseCategory.CREATIVE_COMMONS: True, # Depends on specific CC LicenseCategory.WEAK_COPYLEFT: False, LicenseCategory.STRONG_COPYLEFT: False, LicenseCategory.PROPRIETARY: False, }, LicenseCategory.WEAK_COPYLEFT: { LicenseCategory.PUBLIC_DOMAIN: False, LicenseCategory.PERMISSIVE: False, LicenseCategory.CREATIVE_COMMONS: False, LicenseCategory.WEAK_COPYLEFT: True, LicenseCategory.STRONG_COPYLEFT: True, LicenseCategory.PROPRIETARY: False, }, LicenseCategory.STRONG_COPYLEFT: { LicenseCategory.PUBLIC_DOMAIN: False, LicenseCategory.PERMISSIVE: False, LicenseCategory.CREATIVE_COMMONS: False, LicenseCategory.WEAK_COPYLEFT: False, LicenseCategory.STRONG_COPYLEFT: True, LicenseCategory.PROPRIETARY: False, }, } def check_compatibility( self, source_licenses: List[Tuple[str, str]], # List of (entity_id, spdx_id) target_license: Optional[str] = None, ) -> LicenseCompatibility: """ Check if source licenses allow derivation. Args: source_licenses: List of (entity_id, license_id) tuples target_license: Intended license for derived work (optional) Returns: LicenseCompatibility result """ if not source_licenses: return LicenseCompatibility( compatible=True, derived_license=SPDX_LICENSES["unknown"], ) issues = [] warnings = [] attribution_required = [] # Collect all restrictions all_restrictions: Set[LicenseRestriction] = set() licenses = [] for entity_id, spdx_id in source_licenses: lic = get_license(spdx_id) licenses.append((entity_id, lic)) all_restrictions.update(lic.restrictions) # Track attribution requirements if lic.requires_attribution(): attribution_required.append(entity_id) # Check No-Derivatives for entity_id, lic in licenses: if LicenseRestriction.NO_DERIVATIVES in lic.restrictions: issues.append( f"Cannot derive from '{entity_id}': license '{lic.id}' prohibits derivatives" ) if issues: return LicenseCompatibility( compatible=False, issues=issues, warnings=warnings, attribution_required=attribution_required, ) # Determine derived license derived = self._compute_derived_license(licenses, all_restrictions) # Check target license compatibility if target_license: target = get_license(target_license) if not self._can_relicense(derived, target): issues.append( f"Cannot license derived work as '{target.id}': " f"must use '{derived.id}' or compatible license" ) # Add warnings if LicenseRestriction.NON_COMMERCIAL in all_restrictions: warnings.append("Derived work restricted to non-commercial use only") if LicenseRestriction.SHARE_ALIKE in all_restrictions: warnings.append(f"Derived work must use ShareAlike-compatible license: {derived.id}") if LicenseRestriction.COPYLEFT in all_restrictions: warnings.append(f"Derived work must use copyleft license: {derived.id}") return LicenseCompatibility( compatible=len(issues) == 0, derived_license=derived, issues=issues, warnings=warnings, attribution_required=attribution_required, ) def _compute_derived_license( self, licenses: List[Tuple[str, SPDXLicense]], all_restrictions: Set[LicenseRestriction], ) -> SPDXLicense: """ Compute the most restrictive license for derived work. The derived license is the "lowest common denominator" that satisfies all source license requirements. """ # Priority: Strong Copyleft > Weak Copyleft > CC-SA > CC-NC > Permissive > Public Domain has_strong_copyleft = any( lic.category == LicenseCategory.STRONG_COPYLEFT for _, lic in licenses ) has_weak_copyleft = any( lic.category == LicenseCategory.WEAK_COPYLEFT for _, lic in licenses ) has_share_alike = LicenseRestriction.SHARE_ALIKE in all_restrictions has_non_commercial = LicenseRestriction.NON_COMMERCIAL in all_restrictions # Strong copyleft dominates if has_strong_copyleft: for _, lic in licenses: if lic.category == LicenseCategory.STRONG_COPYLEFT: return lic # Weak copyleft next if has_weak_copyleft: for _, lic in licenses: if lic.category == LicenseCategory.WEAK_COPYLEFT: return lic # CC with restrictions if has_share_alike and has_non_commercial: return SPDX_LICENSES["CC-BY-NC-SA-4.0"] elif has_share_alike: return SPDX_LICENSES["CC-BY-SA-4.0"] elif has_non_commercial: return SPDX_LICENSES["CC-BY-NC-4.0"] # Most permissive with attribution if LicenseRestriction.ATTRIBUTION in all_restrictions: # Check if any source requires specific license for _, lic in licenses: if lic.category == LicenseCategory.CREATIVE_COMMONS: return lic return SPDX_LICENSES["CC-BY-4.0"] # Public domain return SPDX_LICENSES["CC0-1.0"] def _can_relicense(self, source: SPDXLicense, target: SPDXLicense) -> bool: """Check if source license allows relicensing to target.""" # Same license is always OK if source.id == target.id: return True # No relicensing from copyleft to non-copyleft if LicenseRestriction.COPYLEFT in source.restrictions: if LicenseRestriction.COPYLEFT not in target.restrictions: return False # No relicensing from share-alike to non-share-alike if LicenseRestriction.SHARE_ALIKE in source.restrictions: if LicenseRestriction.SHARE_ALIKE not in target.restrictions: return False # Non-commercial must propagate if LicenseRestriction.NON_COMMERCIAL in source.restrictions: if LicenseRestriction.NON_COMMERCIAL not in target.restrictions: return False return True def generate_attribution( self, sources: List[Tuple[str, str, str]], # (entity_id, license_id, name) ) -> str: """ Generate attribution text for derived work. Args: sources: List of (entity_id, license_id, name) tuples Returns: Attribution text """ lines = [ "## Attribution", "", "This dataset is derived from the following sources:", "", ] for entity_id, license_id, name in sources: lic = get_license(license_id) if lic.requires_attribution(): line = f"- **{name}** (`{entity_id}`)" if lic.url: line += f" - Licensed under [{lic.id}]({lic.url})" else: line += f" - Licensed under {lic.id}" lines.append(line) if len(lines) == 4: # No attributions needed return "" lines.append("") return "\n".join(lines) # Singleton analyzer _analyzer = LicenseAnalyzer() def check_license_compatibility( sources: List[Tuple[str, str]], target: Optional[str] = None, ) -> LicenseCompatibility: """ Convenience function to check license compatibility. Args: sources: List of (entity_id, license_id) tuples target: Intended license for derived work Returns: LicenseCompatibility result """ return _analyzer.check_compatibility(sources, target) def get_derived_license(sources: List[str]) -> SPDXLicense: """ Get the appropriate license for a work derived from given sources. Args: sources: List of SPDX license identifiers Returns: SPDXLicense for the derived work """ result = _analyzer.check_compatibility([ (f"source_{i}", lic) for i, lic in enumerate(sources) ]) return result.derived_license or SPDX_LICENSES["unknown"]