Spaces:
Sleeping
Sleeping
File size: 11,584 Bytes
79f9b3a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 | """
Harmonized System dataset integration.
Loads the official HS code dataset from:
https://github.com/datasets/harmonized-system
Provides:
- Full HS code lookup (2, 4, 6 digit)
- Section/chapter/heading/subheading hierarchy
- HTS extension support (country-specific 7-10 digit codes)
- Search by description
"""
import csv
import json
import os
import re
from pathlib import Path
from typing import Optional
# Directory that contains this module; the bundled data files are resolved
# relative to it.
PROJECT_DIR = Path(__file__).parent
# CSV file holding the Harmonized System nomenclature.
HS_DATA_PATH = PROJECT_DIR.joinpath(
    "data", "harmonized-system", "harmonized-system.csv"
)
# JSON lookup table with the US HTS extensions.
US_HTS_LOOKUP_PATH = PROJECT_DIR.joinpath("data", "hts", "us_hts_lookup.json")
class HSDataset:
    """In-memory view of the Harmonized System (HS) code dataset.

    The data is read from the CSV file at ``HS_DATA_PATH`` by :meth:`load`.
    Each row must provide the columns ``hscode``, ``description``,
    ``section``, ``parent`` and ``level``.
    """

    # Word tokenizer shared by search(): punctuation is dropped so that
    # "Horses; live" yields the words "horses" and "live".
    _WORD_RE = re.compile(r'\w+')

    def __init__(self):
        self.codes = {}        # hscode -> {section, description, parent, level}
        self.sections = {}     # section number -> section name (not filled by load())
        self.chapters = {}     # 2-digit -> description
        self.headings = {}     # 4-digit -> description
        self.subheadings = {}  # 6-digit -> description
        self._loaded = False

    def load(self) -> bool:
        """Load the HS dataset from CSV.

        Returns:
            True when the dataset is (or already was) loaded, False when the
            CSV file does not exist.

        Raises:
            KeyError: if a row lacks one of the expected columns.
            ValueError: if a row's ``level`` is not an integer.
        """
        if self._loaded:
            return True
        if not HS_DATA_PATH.exists():
            print(f"HS dataset not found at {HS_DATA_PATH}")
            return False
        # newline='' is what the csv module asks for, so that newlines
        # embedded in quoted fields are parsed correctly.
        with open(HS_DATA_PATH, 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                hscode = row['hscode'].strip()
                desc = row['description'].strip()
                section = row['section'].strip()
                parent = row['parent'].strip()
                level = int(row['level'])
                self.codes[hscode] = {
                    'section': section,
                    'description': desc,
                    'parent': parent,
                    'level': level,
                }
                if level == 2:
                    self.chapters[hscode] = desc
                elif level == 4:
                    self.headings[hscode] = desc
                elif level == 6:
                    self.subheadings[hscode] = desc
        self._loaded = True
        print(f"Loaded HS dataset: {len(self.chapters)} chapters, "
              f"{len(self.headings)} headings, {len(self.subheadings)} subheadings")
        return True

    def lookup(self, hscode: str) -> Optional[dict]:
        """Look up an HS code and return it together with its hierarchy.

        Args:
            hscode: HS code; dots and spaces are ignored ("0101.21" works).

        Returns:
            A copy of the stored entry extended with ``hscode``,
            ``hierarchy`` (ancestors first, the code itself last) and the
            chapter/heading/subheading descriptions that apply to the code
            length, or None when the code is not in the dataset.
        """
        hscode = hscode.strip().replace('.', '').replace(' ', '')
        if hscode not in self.codes:
            return None
        entry = self.codes[hscode].copy()

        # Walk up the parent links. `visited` stops the walk if the data
        # contains a parent cycle, which would otherwise loop forever.
        hierarchy = []
        visited = set()
        current = hscode
        while (current and current in self.codes and current != 'TOTAL'
               and current not in visited):
            visited.add(current)
            node = self.codes[current]
            hierarchy.append({
                'code': current,
                'description': node['description'],
                'level': node['level'],
            })
            current = node['parent']
        hierarchy.reverse()  # collected leaf-first, reported root-first
        entry['hierarchy'] = hierarchy
        entry['hscode'] = hscode

        # Chapter / heading / subheading descriptions derived from the prefix.
        if len(hscode) >= 2:
            ch = hscode[:2]
            entry['chapter'] = self.chapters.get(ch, '')
            entry['chapter_code'] = ch
        if len(hscode) >= 4:
            hd = hscode[:4]
            entry['heading'] = self.headings.get(hd, '')
            entry['heading_code'] = hd
        if len(hscode) == 6:
            entry['subheading'] = self.subheadings.get(hscode, '')
        return entry

    def search(self, query: str, max_results: int = 20) -> list[dict]:
        """Search 6-digit HS codes by description text.

        Words are compared case-insensitively with punctuation removed. The
        score is the fraction of query words found in the description, plus
        1.0 when the whole query occurs as a substring of the description.

        Args:
            query: free-text search string.
            max_results: maximum number of hits to return; values <= 0
                yield an empty list.

        Returns:
            Hits (``hscode``, ``description``, ``section``, ``score``)
            ordered by descending score.
        """
        if max_results <= 0:
            return []
        query_lower = query.lower()
        query_words = set(self._WORD_RE.findall(query_lower))
        if not query_words:
            return []

        results = []
        for hscode, info in self.codes.items():
            if info['level'] != 6:
                continue
            desc_lower = info['description'].lower()
            overlap = query_words.intersection(self._WORD_RE.findall(desc_lower))
            if not overlap:
                continue
            score = len(overlap) / len(query_words)
            # Bonus for exact substring match
            if query_lower in desc_lower:
                score += 1.0
            results.append({
                'hscode': hscode,
                'description': info['description'],
                'section': info['section'],
                'score': score,
            })
        # list.sort is stable, so equal scores keep dataset order.
        results.sort(key=lambda hit: -hit['score'])
        return results[:max_results]

    def get_chapter_name(self, chapter_code: str) -> str:
        """Get chapter description from a 2-digit code ('1' is treated as '01').

        Returns 'Unknown' when the chapter is not in the dataset.
        """
        return self.chapters.get(chapter_code.zfill(2), 'Unknown')

    def validate_hs_code(self, hscode: str) -> dict:
        """Validate an HS code and return info about its validity.

        Args:
            hscode: HS code; dots and spaces are ignored.

        Returns:
            Dict with ``valid``, ``code`` (normalized), ``level``,
            ``description`` and a ``message`` that is always non-empty.
        """
        hscode = hscode.strip().replace('.', '').replace(' ', '')
        result = {
            'valid': False,
            'code': hscode,
            'level': None,
            'description': None,
            'message': '',
        }
        if not re.match(r'^\d{2,6}$', hscode):
            result['message'] = 'HS code must be 2-6 digits'
            return result

        if hscode in self.codes:
            info = self.codes[hscode]
            result['valid'] = True
            result['level'] = info['level']
            result['description'] = info['description']
            result['message'] = f'Valid {info["level"]}-digit HS code'
            return result

        # Unknown code: report the closest existing ancestor, if any. The
        # length checks keep a code from being reported as its own ancestor.
        heading = hscode[:4]
        chapter = hscode[:2]
        if len(hscode) > 4 and heading in self.codes:
            result['message'] = f'Heading {heading} exists but subheading {hscode} not found'
        elif len(hscode) > 2 and chapter in self.codes:
            result['message'] = f'Chapter {chapter} exists but code {hscode} not found'
        else:
            result['message'] = f'Code {hscode} not found in HS nomenclature'
        return result

    def get_all_6digit_codes(self) -> list[dict]:
        """Return all 6-digit HS codes with their description and section."""
        return [
            {'hscode': code, 'description': info['description'], 'section': info['section']}
            for code, info in self.codes.items()
            if info['level'] == 6
        ]
# --- HTS Extensions ---
# HTS (Harmonized Tariff Schedule) adds country-specific digits (7-10) after the 6-digit HS code.
# This is a simplified reference for major trading partners.
def _load_us_hts_extensions() -> dict:
    """Read the pre-built US HTS lookup JSON and reshape it for the API.

    Returns:
        Dict keyed like the JSON file (the code calls the keys ``hs6``),
        each value being a list of dicts with the keys ``hts``,
        ``description``, ``general_duty``, ``special_duty`` and ``unit``.
        An empty dict is returned when the lookup file does not exist.
    """
    if not US_HTS_LOOKUP_PATH.exists():
        return {}

    with open(US_HTS_LOOKUP_PATH, "r", encoding="utf-8") as handle:
        lookup = json.load(handle)

    def to_api_entry(record: dict) -> dict:
        # "hts_code" and "description" are required in each record; the
        # duty and unit fields default to an empty string when absent.
        return {
            "hts": record["hts_code"],
            "description": record["description"],
            "general_duty": record.get("general_duty", ""),
            "special_duty": record.get("special_duty", ""),
            "unit": record.get("unit", ""),
        }

    # Convert from the build_hts_lookup format to the API format.
    return {
        hs6: [to_api_entry(record) for record in records]
        for hs6, records in lookup.items()
    }
# Module-level memo for the US HTS table; None means "not loaded yet".
_us_hts_cache = None


def _get_us_hts_extensions() -> dict:
    """Return the US HTS extension table, loading it on the first call only."""
    global _us_hts_cache
    if _us_hts_cache is not None:
        return _us_hts_cache
    _us_hts_cache = _load_us_hts_extensions()
    return _us_hts_cache
# Per-country tariff schedules layered on top of the 6-digit HS code.
# Each entry gives the schedule name, the total number of digits, the display
# format, and a mapping of 6-digit HS code -> list of country-specific lines.
HTS_EXTENSIONS = {
    "US": {
        "name": "United States HTS",
        "digits": 10,
        "format": "XXXX.XX.XXXX",
        # Placeholder only: get_hts_extensions() fetches the US table lazily
        # from us_hts_lookup.json instead of reading this value.
        "extensions": None,
    },
    "EU": {
        "name": "EU Combined Nomenclature (CN)",
        "digits": 8,
        "format": "XXXX.XX.XX",
        "extensions": {
            "851712": [{"hts": "85171200", "description": "Telephones for cellular networks; smartphones"}],
            "847130": [{"hts": "84713000", "description": "Portable digital automatic data-processing machines, ≤ 10 kg"}],
            "870380": [{"hts": "87038000", "description": "Other vehicles, with electric motor for propulsion"}],
        },
    },
    "CN": {
        "name": "China Customs Tariff",
        "digits": 10,
        "format": "XXXX.XXXX.XX",
        "extensions": {
            "851712": [
                {"hts": "8517120010", "description": "Smartphones, 5G capable"},
                {"hts": "8517120090", "description": "Other mobile phones"},
            ],
            "847130": [{"hts": "8471300000", "description": "Portable digital data processing machines"}],
        },
    },
    "JP": {
        "name": "Japan HS Tariff",
        "digits": 9,
        "format": "XXXX.XX.XXX",
        "extensions": {
            "851712": [{"hts": "851712000", "description": "Telephones for cellular networks or wireless"}],
            "870380": [{"hts": "870380000", "description": "Electric motor vehicles for passenger transport"}],
        },
    },
}
def get_hts_extensions(hs_code: str, country_code: str) -> Optional[dict]:
    """
    Return the country-specific (HTS) extensions known for an HS code.

    Args:
        hs_code: HS code; dots and spaces are removed before the lookup.
        country_code: Country key of HTS_EXTENSIONS (US, EU, CN, JP);
            matched case-insensitively.

    Returns:
        A dict in every case. For an unsupported country it carries
        ``available: False``, a message listing the supported countries and
        an empty ``extensions`` list. Otherwise it carries
        ``available: True``, the tariff metadata, the normalized ``hs_code``
        and the matching ``extensions`` (possibly empty).
    """
    hs_code = hs_code.strip().replace('.', '').replace(' ', '')
    country_code = country_code.upper().strip()

    tariff = HTS_EXTENSIONS.get(country_code)
    if tariff is None:
        unavailable = (
            f"HTS extensions not available for {country_code}. "
            f"Available: {', '.join(HTS_EXTENSIONS.keys())}"
        )
        return {
            "available": False,
            "country": country_code,
            "message": unavailable,
            "extensions": [],
        }

    # The US table is lazy-loaded from JSON; every other country keeps its
    # table inline in HTS_EXTENSIONS.
    table = _get_us_hts_extensions() if country_code == "US" else tariff["extensions"]
    extensions = table.get(hs_code, [])

    if extensions:
        message = f"Found {len(extensions)} HTS extension(s)"
    else:
        message = (
            f"No specific extensions found for {hs_code} in {tariff['name']}. "
            f"The base HS code {hs_code} applies."
        )

    return {
        "available": True,
        "country": country_code,
        "tariff_name": tariff["name"],
        "total_digits": tariff["digits"],
        "format": tariff["format"],
        "extensions": extensions,
        "hs_code": hs_code,
        "message": message,
    }
def get_available_hts_countries() -> list[dict]:
    """List every country in HTS_EXTENSIONS with its code, name and digit count."""
    countries = []
    for code, info in HTS_EXTENSIONS.items():
        summary = {"code": code, "name": info["name"], "digits": info["digits"]}
        countries.append(summary)
    return countries
# Shared module-wide dataset; it is filled on first access via get_dataset().
_dataset = HSDataset()


def get_dataset() -> HSDataset:
    """Return the shared HSDataset, triggering a load if it is not loaded yet."""
    dataset = _dataset
    if dataset._loaded:
        return dataset
    dataset.load()
    return dataset
|