MicroHS / hs_dataset.py
github-actions[bot]
Sync from GitHub 38cd8d69dc858672e22cd1448f7768fef87468b1
79f9b3a
"""
Harmonized System dataset integration.
Loads the official HS code dataset from:
https://github.com/datasets/harmonized-system
Provides:
- Full HS code lookup (2, 4, 6 digit)
- Section/chapter/heading/subheading hierarchy
- HTS extension support (country-specific 7-10 digit codes)
- Search by description
"""
import csv
import json
import os
import re
from pathlib import Path
from typing import Optional
# Directory that contains this module; the bundled data files live beneath it.
PROJECT_DIR = Path(__file__).parent

# CSV export of the HS nomenclature (datasets/harmonized-system layout).
HS_DATA_PATH = PROJECT_DIR.joinpath(
    "data", "harmonized-system", "harmonized-system.csv"
)

# Pre-built JSON lookup table of US HTS lines keyed by 6-digit HS code.
US_HTS_LOOKUP_PATH = PROJECT_DIR.joinpath("data", "hts", "us_hts_lookup.json")
class HSDataset:
    """In-memory index of the Harmonized System (HS) code dataset.

    Rows are read from the CSV at ``HS_DATA_PATH``; each row must provide the
    columns ``hscode``, ``description``, ``section``, ``parent`` and ``level``.
    Every row is stored in ``codes``; rows whose level is 2, 4 or 6 are also
    indexed by description in ``chapters``, ``headings`` and ``subheadings``.
    """

    # Word tokenizer shared by search(): splitting on non-word characters keeps
    # punctuation such as ';' and ',' from sticking to the words of a description.
    _WORD_RE = re.compile(r"\w+")

    def __init__(self):
        self.codes = {}        # hscode -> {section, description, parent, level}
        self.sections = {}     # section number -> section name (load() does not fill this)
        self.chapters = {}     # 2-digit -> description
        self.headings = {}     # 4-digit -> description
        self.subheadings = {}  # 6-digit -> description
        self._loaded = False

    def load(self) -> bool:
        """Load the HS dataset from CSV.

        Returns:
            True when the data is available (freshly loaded or already loaded),
            False when the CSV file does not exist.

        Raises:
            KeyError: if a row lacks one of the expected columns.
            ValueError: if a row's ``level`` is not an integer.
        """
        if self._loaded:
            return True
        if not HS_DATA_PATH.exists():
            print(f"HS dataset not found at {HS_DATA_PATH}")
            return False

        # newline='' lets the csv module handle line endings itself, as the
        # csv documentation requires for files passed to a reader.
        with open(HS_DATA_PATH, 'r', encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                hscode = row['hscode'].strip()
                desc = row['description'].strip()
                level = int(row['level'])
                self.codes[hscode] = {
                    'section': row['section'].strip(),
                    'description': desc,
                    'parent': row['parent'].strip(),
                    'level': level,
                }
                if level == 2:
                    self.chapters[hscode] = desc
                elif level == 4:
                    self.headings[hscode] = desc
                elif level == 6:
                    self.subheadings[hscode] = desc

        self._loaded = True
        print(f"Loaded HS dataset: {len(self.chapters)} chapters, "
              f"{len(self.headings)} headings, {len(self.subheadings)} subheadings")
        return True

    def lookup(self, hscode: str) -> Optional[dict]:
        """Look up an HS code and return its entry with the full hierarchy.

        Args:
            hscode: Code to look up; dots and spaces are ignored.

        Returns:
            A copy of the stored entry extended with ``hierarchy`` (ancestors
            first, the code itself last), ``hscode`` and, depending on the code
            length, ``chapter``/``chapter_code``, ``heading``/``heading_code``
            and ``subheading``. None when the code is unknown.
        """
        hscode = hscode.strip().replace('.', '').replace(' ', '')
        if hscode not in self.codes:
            return None
        entry = self.codes[hscode].copy()

        # Walk the parent links up to the root. `visited` stops the walk if the
        # data contains a self-referencing or cyclic parent chain, which would
        # otherwise loop forever.
        hierarchy = []
        visited = set()
        current = hscode
        while (current and current != 'TOTAL'
               and current in self.codes and current not in visited):
            visited.add(current)
            node = self.codes[current]
            hierarchy.append({
                'code': current,
                'description': node['description'],
                'level': node['level'],
            })
            current = node['parent']
        hierarchy.reverse()  # collected leaf-first; callers expect root-first
        entry['hierarchy'] = hierarchy
        entry['hscode'] = hscode

        # Attach chapter / heading / subheading descriptions by code prefix.
        if len(hscode) >= 2:
            ch = hscode[:2]
            entry['chapter'] = self.chapters.get(ch, '')
            entry['chapter_code'] = ch
        if len(hscode) >= 4:
            hd = hscode[:4]
            entry['heading'] = self.headings.get(hd, '')
            entry['heading_code'] = hd
        if len(hscode) == 6:
            entry['subheading'] = self.subheadings.get(hscode, '')
        return entry

    def search(self, query: str, max_results: int = 20) -> list[dict]:
        """Search 6-digit HS codes by description text.

        The score is the fraction of query words found in the description,
        plus 1.0 when the whole query appears as a substring. Words are
        compared case-insensitively with punctuation removed, so ``horses``
        matches ``Horses; live``.

        Args:
            query: Free-text query.
            max_results: Maximum number of results to return.

        Returns:
            Result dicts (``hscode``, ``description``, ``section``, ``score``)
            ordered by descending score; empty when the query has no words.
        """
        query_lower = query.lower().strip()
        query_words = set(self._WORD_RE.findall(query_lower))
        if not query_words:
            return []

        results = []
        for hscode, info in self.codes.items():
            if info['level'] != 6:
                continue
            desc_lower = info['description'].lower()
            overlap = query_words & set(self._WORD_RE.findall(desc_lower))
            if not overlap:
                continue
            score = len(overlap) / len(query_words)
            # Bonus for exact substring match
            if query_lower in desc_lower:
                score += 1.0
            results.append({
                'hscode': hscode,
                'description': info['description'],
                'section': info['section'],
                'score': score,
            })
        # list.sort is stable, so equally scored codes keep dataset order.
        results.sort(key=lambda x: -x['score'])
        return results[:max_results]

    def get_chapter_name(self, chapter_code: str) -> str:
        """Get chapter description from 2-digit code ('Unknown' if absent).

        A single-digit code is left-padded with a zero before the lookup.
        """
        return self.chapters.get(chapter_code.zfill(2), 'Unknown')

    def validate_hs_code(self, hscode: str) -> dict:
        """Validate an HS code and return info about its validity.

        Args:
            hscode: Code to check; dots and spaces are ignored.

        Returns:
            Dict with ``valid``, ``code`` (normalized), ``level``,
            ``description`` and ``message``. ``message`` is always filled in:
            it reports a format error, a valid match, or which ancestor
            (heading or chapter), if any, exists for an unknown code.
        """
        hscode = hscode.strip().replace('.', '').replace(' ', '')
        result = {
            'valid': False,
            'code': hscode,
            'level': None,
            'description': None,
            'message': '',
        }
        if not re.fullmatch(r'\d{2,6}', hscode):
            result['message'] = 'HS code must be 2-6 digits'
            return result

        if hscode in self.codes:
            info = self.codes[hscode]
            result['valid'] = True
            result['level'] = info['level']
            result['description'] = info['description']
            result['message'] = f'Valid {info["level"]}-digit HS code'
            return result

        # Unknown code: report the closest existing ancestor. This runs for
        # every code length so the message is never left empty.
        heading = hscode[:4]
        chapter = hscode[:2]
        if len(hscode) > 4 and heading in self.codes:
            result['message'] = f'Heading {heading} exists but subheading {hscode} not found'
        elif len(hscode) > 2 and chapter in self.codes:
            result['message'] = f'Chapter {chapter} exists but code {hscode} not found'
        else:
            result['message'] = f'Code {hscode} not found in HS nomenclature'
        return result

    def get_all_6digit_codes(self) -> list[dict]:
        """Return all 6-digit HS codes with descriptions and sections."""
        return [
            {'hscode': code, 'description': info['description'], 'section': info['section']}
            for code, info in self.codes.items()
            if info['level'] == 6
        ]
# --- HTS Extensions ---
# HTS (Harmonized Tariff Schedule) adds country-specific digits (7-10) after the 6-digit HS code.
# This is a simplified reference for major trading partners.
def _load_us_hts_extensions() -> dict:
    """Read the pre-built US HTS lookup JSON and reshape it for the API.

    Returns:
        Mapping of 6-digit HS code to a list of extension dicts with the keys
        ``hts``, ``description``, ``general_duty``, ``special_duty`` and
        ``unit``. The three duty/unit fields default to an empty string when
        absent from the file. An empty dict is returned when the lookup file
        does not exist.
    """
    if not US_HTS_LOOKUP_PATH.exists():
        return {}

    with open(US_HTS_LOOKUP_PATH, "r", encoding="utf-8") as handle:
        lookup = json.load(handle)

    # Rename build_hts_lookup fields to the names exposed by the API.
    converted = {}
    for hs6, lines in lookup.items():
        rows = []
        for line in lines:
            rows.append({
                "hts": line["hts_code"],
                "description": line["description"],
                "general_duty": line.get("general_duty", ""),
                "special_duty": line.get("special_duty", ""),
                "unit": line.get("unit", ""),
            })
        converted[hs6] = rows
    return converted
# Memoized US HTS table; stays None until the first call below fills it.
_us_hts_cache = None


def _get_us_hts_extensions() -> dict:
    """Return the US HTS extension table, loading it on first use.

    The result of ``_load_us_hts_extensions()`` is stored in the module-level
    ``_us_hts_cache`` and reused by later calls. An empty result is cached as
    well, because the check is against None rather than truthiness.
    """
    global _us_hts_cache
    if _us_hts_cache is not None:
        return _us_hts_cache
    _us_hts_cache = _load_us_hts_extensions()
    return _us_hts_cache
# Per-country tariff schedules, keyed by country code. Each schedule carries a
# display name, the total digit count, a display format and an "extensions"
# table mapping a 6-digit HS code to its country-specific lines.
HTS_EXTENSIONS = {
    "US": {
        "name": "United States HTS",
        "digits": 10,
        "format": "XXXX.XX.XXXX",
        # Sentinel: the US table is read lazily from us_hts_lookup.json and
        # substituted in get_hts_extensions().
        "extensions": None,
    },
    "EU": {
        "name": "EU Combined Nomenclature (CN)",
        "digits": 8,
        "format": "XXXX.XX.XX",
        "extensions": {
            "851712": [
                {
                    "hts": "85171200",
                    "description": "Telephones for cellular networks; smartphones",
                },
            ],
            "847130": [
                {
                    "hts": "84713000",
                    "description": "Portable digital automatic data-processing machines, ≤ 10 kg",
                },
            ],
            "870380": [
                {
                    "hts": "87038000",
                    "description": "Other vehicles, with electric motor for propulsion",
                },
            ],
        },
    },
    "CN": {
        "name": "China Customs Tariff",
        "digits": 10,
        "format": "XXXX.XXXX.XX",
        "extensions": {
            "851712": [
                {
                    "hts": "8517120010",
                    "description": "Smartphones, 5G capable",
                },
                {
                    "hts": "8517120090",
                    "description": "Other mobile phones",
                },
            ],
            "847130": [
                {
                    "hts": "8471300000",
                    "description": "Portable digital data processing machines",
                },
            ],
        },
    },
    "JP": {
        "name": "Japan HS Tariff",
        "digits": 9,
        "format": "XXXX.XX.XXX",
        "extensions": {
            "851712": [
                {
                    "hts": "851712000",
                    "description": "Telephones for cellular networks or wireless",
                },
            ],
            "870380": [
                {
                    "hts": "870380000",
                    "description": "Electric motor vehicles for passenger transport",
                },
            ],
        },
    },
}
def get_hts_extensions(hs_code: str, country_code: str) -> Optional[dict]:
    """
    Look up the country-specific (HTS) lines registered under an HS code.

    Args:
        hs_code: 6-digit HS code; dots and spaces are removed before lookup.
        country_code: Country key, matched case-insensitively against
            HTS_EXTENSIONS (US, EU, CN, JP).

    Returns:
        A dict in every case. For an unknown country: ``available`` False,
        ``country``, a ``message`` listing the supported keys and an empty
        ``extensions`` list. Otherwise: ``available`` True plus
        ``tariff_name``, ``total_digits``, ``format``, ``extensions``,
        ``hs_code`` and a ``message`` summarizing the match.
    """
    code = hs_code.strip().replace('.', '').replace(' ', '')
    country = country_code.upper().strip()

    if country not in HTS_EXTENSIONS:
        supported = ', '.join(HTS_EXTENSIONS.keys())
        return {
            "available": False,
            "country": country,
            "message": f"HTS extensions not available for {country}. "
                       f"Available: {supported}",
            "extensions": [],
        }

    schedule = HTS_EXTENSIONS[country]

    # The US entry stores None as a placeholder; its table comes from the
    # lazily loaded JSON lookup instead.
    if country == "US":
        table = _get_us_hts_extensions()
    else:
        table = schedule["extensions"]
    matches = table.get(code, [])

    if matches:
        summary = f"Found {len(matches)} HTS extension(s)"
    else:
        summary = (f"No specific extensions found for {code} in {schedule['name']}. "
                   f"The base HS code {code} applies.")

    return {
        "available": True,
        "country": country,
        "tariff_name": schedule["name"],
        "total_digits": schedule["digits"],
        "format": schedule["format"],
        "extensions": matches,
        "hs_code": code,
        "message": summary,
    }
def get_available_hts_countries() -> list[dict]:
    """List every tariff schedule defined in HTS_EXTENSIONS.

    Returns:
        One dict per schedule, in table order, holding its key (``code``),
        display ``name`` and total ``digits``.
    """
    countries = []
    for key, schedule in HTS_EXTENSIONS.items():
        countries.append({
            "code": key,
            "name": schedule["name"],
            "digits": schedule["digits"],
        })
    return countries
# Shared module-level instance; created empty and filled on first access.
_dataset = HSDataset()


def get_dataset() -> HSDataset:
    """Return the shared HSDataset, triggering a load if it has not loaded yet.

    The instance is returned even when loading fails, in which case it stays
    unloaded and the next call tries again.
    """
    if _dataset._loaded:
        return _dataset
    _dataset.load()
    return _dataset