Spaces:
Sleeping
Sleeping
| """ | |
| Harmonized System dataset integration. | |
| Loads the official HS code dataset from: | |
| https://github.com/datasets/harmonized-system | |
| Provides: | |
| - Full HS code lookup (2, 4, 6 digit) | |
| - Section/chapter/heading/subheading hierarchy | |
| - HTS extension support (country-specific 7-10 digit codes) | |
| - Search by description | |
| """ | |
| import csv | |
| import json | |
| import os | |
| import re | |
| from pathlib import Path | |
| from typing import Optional | |
# Directory containing this module; the bundled data files are addressed
# relative to it.
PROJECT_DIR = Path(__file__).parent

# CSV file read by HSDataset.load().
HS_DATA_PATH = PROJECT_DIR.joinpath(
    "data", "harmonized-system", "harmonized-system.csv"
)

# JSON lookup table read by _load_us_hts_extensions().
US_HTS_LOOKUP_PATH = PROJECT_DIR.joinpath("data", "hts", "us_hts_lookup.json")
class HSDataset:
    """Harmonized System code dataset.

    Holds the HS nomenclature in memory once :meth:`load` has parsed the CSV
    at ``HS_DATA_PATH``. Codes are keyed by their string form with dots and
    spaces removed, and are additionally indexed by the CSV ``level`` column
    (2 -> chapters, 4 -> headings, 6 -> subheadings).
    """

    # Word tokenizer used by search(). Splitting on whitespace alone leaves
    # punctuation glued to words ("horses;"), which hides obvious matches.
    _WORD_RE = re.compile(r"\w+")

    def __init__(self):
        self.codes = {}        # hscode -> {section, description, parent, level}
        self.sections = {}     # section number -> section name (not filled by load())
        self.chapters = {}     # 2-digit -> description
        self.headings = {}     # 4-digit -> description
        self.subheadings = {}  # 6-digit -> description
        self._loaded = False

    @staticmethod
    def _normalize(hscode: str) -> str:
        """Return *hscode* stripped, with dots and spaces removed."""
        return hscode.strip().replace('.', '').replace(' ', '')

    def load(self) -> bool:
        """Load the HS dataset from the CSV at ``HS_DATA_PATH``.

        Each row must provide the ``hscode``, ``description``, ``section``,
        ``parent`` and ``level`` columns. Loading happens at most once per
        instance; later calls return immediately.

        Returns:
            True if the data is loaded (now or by an earlier call), False if
            the CSV file does not exist.

        Raises:
            KeyError: if a required column is missing from a row.
            ValueError: if a ``level`` value is not an integer.
        """
        if self._loaded:
            return True
        if not HS_DATA_PATH.exists():
            print(f"HS dataset not found at {HS_DATA_PATH}")
            return False

        by_level = {2: self.chapters, 4: self.headings, 6: self.subheadings}
        # newline='' is what the csv module expects: it lets the reader handle
        # line endings itself, including newlines inside quoted fields.
        with open(HS_DATA_PATH, 'r', encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                hscode = row['hscode'].strip()
                desc = row['description'].strip()
                level = int(row['level'])
                self.codes[hscode] = {
                    'section': row['section'].strip(),
                    'description': desc,
                    'parent': row['parent'].strip(),
                    'level': level,
                }
                index = by_level.get(level)
                if index is not None:
                    index[hscode] = desc

        self._loaded = True
        print(f"Loaded HS dataset: {len(self.chapters)} chapters, "
              f"{len(self.headings)} headings, {len(self.subheadings)} subheadings")
        return True

    def lookup(self, hscode: str) -> Optional[dict]:
        """Look up an HS code and return it with its full hierarchy.

        Args:
            hscode: HS code; dots and spaces are ignored.

        Returns:
            A copy of the stored entry extended with ``hierarchy`` (list of
            ``{code, description, level}`` from the outermost ancestor down to
            the code itself), ``hscode``, and, depending on the code length,
            ``chapter``/``chapter_code``, ``heading``/``heading_code`` and
            ``subheading``. None if the code is not in the dataset.
        """
        hscode = self._normalize(hscode)
        if hscode not in self.codes:
            return None
        entry = self.codes[hscode].copy()

        # Walk parent links upward. `visited` guards against a malformed
        # dataset whose parent links form a cycle (e.g. a row that names
        # itself as parent), which would otherwise never terminate.
        chain = []
        visited = set()
        current = hscode
        while (current and current in self.codes and current != 'TOTAL'
               and current not in visited):
            visited.add(current)
            node = self.codes[current]
            chain.append({
                'code': current,
                'description': node['description'],
                'level': node['level'],
            })
            current = node['parent']
        chain.reverse()  # collected leaf-first; callers get root-first

        entry['hierarchy'] = chain
        entry['hscode'] = hscode

        # Chapter / heading / subheading descriptions derived from the prefix.
        if len(hscode) >= 2:
            ch = hscode[:2]
            entry['chapter'] = self.chapters.get(ch, '')
            entry['chapter_code'] = ch
        if len(hscode) >= 4:
            hd = hscode[:4]
            entry['heading'] = self.headings.get(hd, '')
            entry['heading_code'] = hd
        if len(hscode) == 6:
            entry['subheading'] = self.subheadings.get(hscode, '')
        return entry

    def search(self, query: str, max_results: int = 20) -> list[dict]:
        """Search 6-digit HS codes by description text.

        The score is the fraction of distinct query words that occur as words
        in the description, plus 1.0 when the whole (stripped, lower-cased)
        query is a substring of the description. Words are runs of
        alphanumeric characters, so punctuation does not block a match.

        Args:
            query: free-text query.
            max_results: maximum number of hits; values <= 0 yield no hits.

        Returns:
            Up to ``max_results`` dicts with ``hscode``, ``description``,
            ``section`` and ``score``, best score first. Empty when the query
            contains no words.
        """
        query_lower = query.strip().lower()
        query_words = set(self._WORD_RE.findall(query_lower))
        if not query_words or max_results <= 0:
            return []

        results = []
        for hscode, info in self.codes.items():
            if info['level'] != 6:
                continue
            desc_lower = info['description'].lower()
            overlap = query_words.intersection(self._WORD_RE.findall(desc_lower))
            if not overlap:
                continue
            score = len(overlap) / len(query_words)
            # Bonus for exact substring match
            if query_lower in desc_lower:
                score += 1.0
            results.append({
                'hscode': hscode,
                'description': info['description'],
                'section': info['section'],
                'score': score,
            })
        results.sort(key=lambda x: -x['score'])
        return results[:max_results]

    def get_chapter_name(self, chapter_code: str) -> str:
        """Get the chapter description for a chapter code.

        The code is left-padded with zeros to two characters; 'Unknown' is
        returned when no such chapter is loaded.
        """
        return self.chapters.get(chapter_code.zfill(2), 'Unknown')

    def validate_hs_code(self, hscode: str) -> dict:
        """Validate an HS code and describe the outcome.

        Args:
            hscode: HS code; dots and spaces are ignored.

        Returns:
            Dict with ``valid`` (bool), ``code`` (normalized input), ``level``
            and ``description`` (None unless valid) and a non-empty
            ``message`` explaining the result.
        """
        hscode = self._normalize(hscode)
        result = {
            'valid': False,
            'code': hscode,
            'level': None,
            'description': None,
            'message': '',
        }
        if not re.fullmatch(r'\d{2,6}', hscode):
            result['message'] = 'HS code must be 2-6 digits'
            return result

        info = self.codes.get(hscode)
        if info is not None:
            result['valid'] = True
            result['level'] = info['level']
            result['description'] = info['description']
            result['message'] = f'Valid {info["level"]}-digit HS code'
            return result

        # Not found: report the longest existing prefix so every outcome
        # carries a message, not only 6-digit inputs.
        heading = hscode[:4]
        chapter = hscode[:2]
        if len(hscode) > 4 and heading in self.codes:
            result['message'] = f'Heading {heading} exists but subheading {hscode} not found'
        elif len(hscode) > 2 and chapter in self.codes:
            result['message'] = f'Chapter {chapter} exists but code {hscode} not found'
        else:
            result['message'] = f'Code {hscode} not found in HS nomenclature'
        return result

    def get_all_6digit_codes(self) -> list[dict]:
        """Return all level-6 HS codes with their description and section."""
        return [
            {'hscode': code, 'description': info['description'], 'section': info['section']}
            for code, info in self.codes.items()
            if info['level'] == 6
        ]
| # --- HTS Extensions --- | |
| # HTS (Harmonized Tariff Schedule) adds country-specific digits (7-10) after the 6-digit HS code. | |
| # This is a simplified reference for major trading partners. | |
def _load_us_hts_extensions() -> dict:
    """Read the US HTS lookup JSON and reshape it for API consumers.

    The file at ``US_HTS_LOOKUP_PATH`` maps a 6-digit HS code to a list of
    records. Each record must have ``hts_code`` and ``description``;
    ``general_duty``, ``special_duty`` and ``unit`` default to ``""``.

    Returns:
        Dict mapping each HS code to a list of
        ``{hts, description, general_duty, special_duty, unit}`` dicts, or an
        empty dict when the lookup file does not exist.
    """
    if not US_HTS_LOOKUP_PATH.exists():
        return {}

    lookup = json.loads(US_HTS_LOOKUP_PATH.read_text(encoding="utf-8"))

    def to_api(record: dict) -> dict:
        # Rename build_hts_lookup's "hts_code" to the API's "hts" key.
        return {
            "hts": record["hts_code"],
            "description": record["description"],
            "general_duty": record.get("general_duty", ""),
            "special_duty": record.get("special_duty", ""),
            "unit": record.get("unit", ""),
        }

    return {
        hs6: [to_api(record) for record in records]
        for hs6, records in lookup.items()
    }
# Module-level cache for the US HTS table; None until the first request.
_us_hts_cache = None


def _get_us_hts_extensions() -> dict:
    """Return the US HTS extension table, loading it on first use.

    The result of ``_load_us_hts_extensions()`` is stored in the module-level
    ``_us_hts_cache`` and reused by later calls.
    """
    global _us_hts_cache
    if _us_hts_cache is not None:
        return _us_hts_cache
    _us_hts_cache = _load_us_hts_extensions()
    return _us_hts_cache
# Per-country tariff schedules layered on top of the 6-digit HS code.
# Each entry gives the schedule name, the total number of digits, a display
# format, and a mapping from 6-digit HS code to known national extensions.
HTS_EXTENSIONS = {
    "US": {
        "name": "United States HTS",
        "digits": 10,
        "format": "XXXX.XX.XXXX",
        # None is a placeholder: get_hts_extensions() fetches the US table
        # through _get_us_hts_extensions() instead of reading this value.
        "extensions": None,
    },
    "EU": {
        "name": "EU Combined Nomenclature (CN)",
        "digits": 8,
        "format": "XXXX.XX.XX",
        "extensions": {
            "851712": [
                {
                    "hts": "85171200",
                    "description": "Telephones for cellular networks; smartphones",
                },
            ],
            "847130": [
                {
                    "hts": "84713000",
                    "description": "Portable digital automatic data-processing machines, ≤ 10 kg",
                },
            ],
            "870380": [
                {
                    "hts": "87038000",
                    "description": "Other vehicles, with electric motor for propulsion",
                },
            ],
        },
    },
    "CN": {
        "name": "China Customs Tariff",
        "digits": 10,
        "format": "XXXX.XXXX.XX",
        "extensions": {
            "851712": [
                {
                    "hts": "8517120010",
                    "description": "Smartphones, 5G capable",
                },
                {
                    "hts": "8517120090",
                    "description": "Other mobile phones",
                },
            ],
            "847130": [
                {
                    "hts": "8471300000",
                    "description": "Portable digital data processing machines",
                },
            ],
        },
    },
    "JP": {
        "name": "Japan HS Tariff",
        "digits": 9,
        "format": "XXXX.XX.XXX",
        "extensions": {
            "851712": [
                {
                    "hts": "851712000",
                    "description": "Telephones for cellular networks or wireless",
                },
            ],
            "870380": [
                {
                    "hts": "870380000",
                    "description": "Electric motor vehicles for passenger transport",
                },
            ],
        },
    },
}
def get_hts_extensions(hs_code: str, country_code: str) -> Optional[dict]:
    """
    Get HTS (country-specific) extensions for a 6-digit HS code.

    Args:
        hs_code: 6-digit HS code; dots and spaces are ignored.
        country_code: 2-letter country code (US, EU, CN, JP, etc.),
            case-insensitive.

    Returns:
        A dict in every case. For a country without an entry in
        HTS_EXTENSIONS it holds ``available=False``, the country, a message
        listing the supported countries and an empty ``extensions`` list.
        Otherwise it holds ``available=True``, the tariff name, total digits,
        display format, the matching ``extensions`` list (possibly empty),
        the normalized ``hs_code`` and a summary ``message``.
    """
    code = hs_code.strip().replace('.', '').replace(' ', '')
    country = country_code.upper().strip()

    tariff = HTS_EXTENSIONS.get(country)
    if tariff is None:
        supported = ', '.join(HTS_EXTENSIONS.keys())
        return {
            "available": False,
            "country": country,
            "message": f"HTS extensions not available for {country}. Available: {supported}",
            "extensions": [],
        }

    # The US table lives in a JSON file and is fetched through the cache
    # helper; every other country keeps its table inline.
    if country == "US":
        table = _get_us_hts_extensions()
    else:
        table = tariff["extensions"]
    matches = table.get(code, [])

    if matches:
        summary = f"Found {len(matches)} HTS extension(s)"
    else:
        summary = (
            f"No specific extensions found for {code} in {tariff['name']}. "
            f"The base HS code {code} applies."
        )

    return {
        "available": True,
        "country": country,
        "tariff_name": tariff["name"],
        "total_digits": tariff["digits"],
        "format": tariff["format"],
        "extensions": matches,
        "hs_code": code,
        "message": summary,
    }
def get_available_hts_countries() -> list[dict]:
    """List the tariff schedules defined in HTS_EXTENSIONS.

    Returns:
        One ``{code, name, digits}`` dict per entry, in the mapping's order.
    """
    countries = []
    for country, tariff in HTS_EXTENSIONS.items():
        countries.append({
            "code": country,
            "name": tariff["name"],
            "digits": tariff["digits"],
        })
    return countries
# Shared module-level instance handed out by get_dataset().
_dataset = HSDataset()


def get_dataset() -> HSDataset:
    """Return the shared HSDataset, triggering a load on first use.

    The instance is returned even when ``load()`` reports failure (for
    example when the CSV file is missing); in that case the next call tries
    to load again.
    """
    if _dataset._loaded:
        return _dataset
    _dataset.load()
    return _dataset