# Spaces: Mead0w1ark / MicroHS (status: Sleeping)
# File size: 11,584 Bytes - revision 79f9b3a

"""
Harmonized System dataset integration.

Loads the official HS code dataset from:
https://github.com/datasets/harmonized-system

Provides:
- Full HS code lookup (2, 4, 6 digit)
- Section/chapter/heading/subheading hierarchy
- HTS extension support (country-specific 7-10 digit codes)
- Search by description
"""

import csv
import json
import os
import re
from pathlib import Path
from typing import Optional

# Directory containing this module; all bundled data files live beneath it.
PROJECT_DIR = Path(__file__).parent
# CSV export of the HS nomenclature (read by HSDataset.load).
HS_DATA_PATH = PROJECT_DIR.joinpath("data", "harmonized-system", "harmonized-system.csv")
# Pre-built JSON lookup of US HTS extensions keyed by 6-digit HS code.
US_HTS_LOOKUP_PATH = PROJECT_DIR.joinpath("data", "hts", "us_hts_lookup.json")


class HSDataset:
    """In-memory index of the Harmonized System (HS) code dataset.

    Call :meth:`load` once to read the CSV at ``HS_DATA_PATH``; the other
    methods only read the dictionaries filled by that call and do not load
    the data themselves.
    """

    # Word tokenizer used by search(); splits on anything that is not a
    # letter, digit or underscore so punctuation never sticks to a word.
    _WORD_RE = re.compile(r'\w+')

    def __init__(self):
        self.codes = {}        # hscode -> {section, description, parent, level}
        self.sections = {}     # section number -> section name (not filled by load())
        self.chapters = {}     # 2-digit -> description
        self.headings = {}     # 4-digit -> description
        self.subheadings = {}  # 6-digit -> description
        self._loaded = False

    @staticmethod
    def _normalize(hscode: str) -> str:
        """Strip surrounding whitespace and remove dots and inner spaces."""
        return hscode.strip().replace('.', '').replace(' ', '')

    def load(self) -> bool:
        """Load the HS dataset from CSV.

        The file must provide the columns ``hscode``, ``description``,
        ``section``, ``parent`` and ``level``.

        Returns:
            True when the data is (or already was) loaded, False when the
            CSV file does not exist.

        Raises:
            KeyError: if a required column is missing from a row.
            ValueError: if a ``level`` value is not an integer.
        """
        if self._loaded:
            return True

        if not HS_DATA_PATH.exists():
            print(f"HS dataset not found at {HS_DATA_PATH}")
            return False

        # newline='' lets the csv module handle quoted fields that contain
        # line breaks, as the csv documentation requires.
        with open(HS_DATA_PATH, 'r', encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                hscode = row['hscode'].strip()
                desc = row['description'].strip()
                level = int(row['level'])

                self.codes[hscode] = {
                    'section': row['section'].strip(),
                    'description': desc,
                    'parent': row['parent'].strip(),
                    'level': level,
                }

                # Per-level indexes for direct description lookups.
                if level == 2:
                    self.chapters[hscode] = desc
                elif level == 4:
                    self.headings[hscode] = desc
                elif level == 6:
                    self.subheadings[hscode] = desc

        self._loaded = True
        print(f"Loaded HS dataset: {len(self.chapters)} chapters, "
              f"{len(self.headings)} headings, {len(self.subheadings)} subheadings")
        return True

    def lookup(self, hscode: str) -> Optional[dict]:
        """Look up an HS code and return its entry plus full hierarchy.

        Args:
            hscode: HS code; dots and spaces are ignored ("0101.21" works).

        Returns:
            A copy of the stored entry extended with ``hierarchy`` (list of
            ancestors from the top down, ending with the code itself),
            ``hscode``, and chapter/heading/subheading descriptions where
            the code is long enough; None when the code is unknown.
        """
        hscode = self._normalize(hscode)

        if hscode not in self.codes:
            return None

        entry = self.codes[hscode].copy()

        # Walk the parent links upwards. `visited` guards against malformed
        # data whose parent chain loops back on itself, which would
        # otherwise never terminate.
        hierarchy = []
        visited = set()
        current = hscode
        while (current and current in self.codes and current != 'TOTAL'
               and current not in visited):
            visited.add(current)
            node = self.codes[current]
            hierarchy.append({
                'code': current,
                'description': node['description'],
                'level': node['level'],
            })
            current = node['parent']
        hierarchy.reverse()  # collected bottom-up, reported top-down

        entry['hierarchy'] = hierarchy
        entry['hscode'] = hscode

        # Get chapter and heading descriptions
        if len(hscode) >= 2:
            ch = hscode[:2]
            entry['chapter'] = self.chapters.get(ch, '')
            entry['chapter_code'] = ch
        if len(hscode) >= 4:
            hd = hscode[:4]
            entry['heading'] = self.headings.get(hd, '')
            entry['heading_code'] = hd
        if len(hscode) == 6:
            entry['subheading'] = self.subheadings.get(hscode, '')

        return entry

    def search(self, query: str, max_results: int = 20) -> list[dict]:
        """Search 6-digit HS codes by description text.

        Matching is case-insensitive and word based. Words are extracted
        with a ``\\w+`` tokenizer so punctuation in descriptions (for
        example "Horses; live, ...") does not prevent a match.

        Args:
            query: Free-text search string.
            max_results: Maximum number of results to return.

        Returns:
            Dicts with ``hscode``, ``description``, ``section`` and
            ``score``, best score first. The score is the fraction of query
            words found in the description, plus 1.0 when the whole
            (trimmed) query occurs as a substring. Empty when the query
            contains no words.
        """
        query_lower = query.lower().strip()
        query_words = set(self._WORD_RE.findall(query_lower))
        if not query_words:
            return []

        results = []
        for hscode, info in self.codes.items():
            if info['level'] != 6:
                continue

            desc_lower = info['description'].lower()

            # Score by word overlap
            overlap = query_words & set(self._WORD_RE.findall(desc_lower))
            if not overlap:
                continue

            score = len(overlap) / len(query_words)
            # Bonus for exact substring match
            if query_lower in desc_lower:
                score += 1.0

            results.append({
                'hscode': hscode,
                'description': info['description'],
                'section': info['section'],
                'score': score,
            })

        # Stable sort: equally scored entries keep dataset order.
        results.sort(key=lambda item: item['score'], reverse=True)
        return results[:max_results]

    def get_chapter_name(self, chapter_code: str) -> str:
        """Get chapter description from 2-digit code ('Unknown' if absent).

        Single-digit codes are left-padded with a zero before the lookup.
        """
        return self.chapters.get(chapter_code.zfill(2), 'Unknown')

    def validate_hs_code(self, hscode: str) -> dict:
        """Validate an HS code and return info about its validity.

        Args:
            hscode: HS code; dots and spaces are ignored.

        Returns:
            Dict with ``valid``, ``code`` (normalized), ``level``,
            ``description`` and ``message``. ``message`` is always
            non-empty: it either confirms the code, explains a format
            error, or names the closest existing ancestor of an unknown
            code.
        """
        hscode = self._normalize(hscode)

        result = {
            'valid': False,
            'code': hscode,
            'level': None,
            'description': None,
            'message': '',
        }

        if not re.match(r'^\d{2,6}$', hscode):
            result['message'] = 'HS code must be 2-6 digits'
            return result

        info = self.codes.get(hscode)
        if info is not None:
            result['valid'] = True
            result['level'] = info['level']
            result['description'] = info['description']
            result['message'] = f'Valid {info["level"]}-digit HS code'
            return result

        # Unknown code: report the deepest ancestor that does exist so the
        # caller can see where the code went wrong. This applies to every
        # length, not only 6-digit codes.
        heading = hscode[:4]
        chapter = hscode[:2]
        if len(hscode) > 4 and heading in self.codes:
            kind = 'subheading' if len(hscode) == 6 else 'code'
            result['message'] = f'Heading {heading} exists but {kind} {hscode} not found'
        elif len(hscode) > 2 and chapter in self.codes:
            result['message'] = f'Chapter {chapter} exists but code {hscode} not found'
        else:
            result['message'] = f'Code {hscode} not found in HS nomenclature'

        return result

    def get_all_6digit_codes(self) -> list[dict]:
        """Return all 6-digit HS codes with descriptions and sections."""
        return [
            {'hscode': code, 'description': info['description'], 'section': info['section']}
            for code, info in self.codes.items()
            if info['level'] == 6
        ]


# --- HTS Extensions ---
# HTS (Harmonized Tariff Schedule) adds country-specific digits (7-10) after the 6-digit HS code.
# This is a simplified reference for major trading partners.

def _load_us_hts_extensions() -> dict:
    """Read the US HTS lookup JSON and reshape it for the API.

    Returns:
        Mapping of each top-level JSON key (the 6-digit HS code) to a list
        of dicts with ``hts``, ``description``, ``general_duty``,
        ``special_duty`` and ``unit``. The three duty/unit fields default
        to "" when absent. An empty dict is returned when the lookup file
        does not exist.
    """
    if not US_HTS_LOOKUP_PATH.exists():
        return {}

    with open(US_HTS_LOOKUP_PATH, "r", encoding="utf-8") as handle:
        lookup = json.load(handle)

    def to_api_record(record):
        # Rename "hts_code" -> "hts"; optional fields fall back to "".
        return {
            "hts": record["hts_code"],
            "description": record["description"],
            "general_duty": record.get("general_duty", ""),
            "special_duty": record.get("special_duty", ""),
            "unit": record.get("unit", ""),
        }

    return {
        hs6: [to_api_record(record) for record in records]
        for hs6, records in lookup.items()
    }


# Module-level memo for the US HTS table; None means "not loaded yet".
_us_hts_cache = None


def _get_us_hts_extensions() -> dict:
    """Return the US HTS extension table, loading it on first use.

    The result of the first load is kept for the life of the process, even
    when it is an empty dict.
    """
    global _us_hts_cache
    if _us_hts_cache is not None:
        return _us_hts_cache
    _us_hts_cache = _load_us_hts_extensions()
    return _us_hts_cache


# Registry of country-specific tariff schedules, keyed by country code.
# Each entry holds:
#   "name"       - display name of the tariff schedule
#   "digits"     - total number of digits in that schedule's codes
#   "format"     - display pattern for a full code
#   "extensions" - mapping of 6-digit HS code -> list of
#                  {"hts": <full code>, "description": <text>} dicts
# Only the US table comes from a data file; the other tables are small
# hand-written samples.
HTS_EXTENSIONS = {
    "US": {
        "name": "United States HTS",
        "digits": 10,
        "format": "XXXX.XX.XXXX",
        # Extensions loaded lazily from us_hts_lookup.json
        "extensions": None,  # Sentinel — resolved in get_hts_extensions()
    },
    "EU": {
        "name": "EU Combined Nomenclature (CN)",
        "digits": 8,
        "format": "XXXX.XX.XX",
        "extensions": {
            "851712": [
                {"hts": "85171200", "description": "Telephones for cellular networks; smartphones"},
            ],
            "847130": [
                {"hts": "84713000", "description": "Portable digital automatic data-processing machines, ≤ 10 kg"},
            ],
            "870380": [
                {"hts": "87038000", "description": "Other vehicles, with electric motor for propulsion"},
            ],
        }
    },
    # Key "CN" here is the country code for China, not the EU Combined
    # Nomenclature mentioned in the entry above.
    "CN": {
        "name": "China Customs Tariff",
        "digits": 10,
        "format": "XXXX.XXXX.XX",
        "extensions": {
            "851712": [
                {"hts": "8517120010", "description": "Smartphones, 5G capable"},
                {"hts": "8517120090", "description": "Other mobile phones"},
            ],
            "847130": [
                {"hts": "8471300000", "description": "Portable digital data processing machines"},
            ],
        }
    },
    "JP": {
        "name": "Japan HS Tariff",
        "digits": 9,
        "format": "XXXX.XX.XXX",
        "extensions": {
            "851712": [
                {"hts": "851712000", "description": "Telephones for cellular networks or wireless"},
            ],
            "870380": [
                {"hts": "870380000", "description": "Electric motor vehicles for passenger transport"},
            ],
        }
    },
}


def get_hts_extensions(hs_code: str, country_code: str) -> Optional[dict]:
    """
    Look up country-specific HTS extensions for a 6-digit HS code.

    Args:
        hs_code: 6-digit HS code; dots and spaces are removed before lookup.
        country_code: 2-letter country code (US, EU, CN, JP, etc.),
            matched case-insensitively.

    Returns:
        A dict in every case. For an unsupported country it has
        ``available`` False, a message listing the supported codes and an
        empty ``extensions`` list. Otherwise it has ``available`` True,
        the tariff's name/digits/format, the normalized ``hs_code``, the
        matching ``extensions`` (possibly empty) and a summary ``message``.
    """
    code = hs_code.strip().replace('.', '').replace(' ', '')
    country = country_code.upper().strip()

    if country not in HTS_EXTENSIONS:
        supported = ', '.join(HTS_EXTENSIONS)
        return {
            "available": False,
            "country": country,
            "message": f"HTS extensions not available for {country}. "
                       f"Available: {supported}",
            "extensions": [],
        }

    tariff = HTS_EXTENSIONS[country]
    # The US table lives in a JSON file and is fetched through its lazy
    # loader; every other country carries its table inline.
    table = _get_us_hts_extensions() if country == "US" else tariff["extensions"]
    matches = table.get(code, [])

    if matches:
        message = f"Found {len(matches)} HTS extension(s)"
    else:
        message = (f"No specific extensions found for {code} in {tariff['name']}. "
                   f"The base HS code {code} applies.")

    return {
        "available": True,
        "country": country,
        "tariff_name": tariff["name"],
        "total_digits": tariff["digits"],
        "format": tariff["format"],
        "extensions": matches,
        "hs_code": code,
        "message": message,
    }


def get_available_hts_countries() -> list[dict]:
    """List every tariff schedule registered in HTS_EXTENSIONS.

    Returns:
        One dict per country with ``code``, ``name`` and ``digits``, in
        registry order.
    """
    countries = []
    for country, tariff in HTS_EXTENSIONS.items():
        countries.append({
            "code": country,
            "name": tariff["name"],
            "digits": tariff["digits"],
        })
    return countries


# Process-wide shared dataset; created empty and filled on first access.
_dataset = HSDataset()


def get_dataset() -> HSDataset:
    """Return the shared HSDataset, attempting a load if it is not loaded.

    The instance is returned even when loading fails (for example when the
    data file is missing); the next call then tries to load again.
    """
    if _dataset._loaded:
        return _dataset
    _dataset.load()
    return _dataset