Spaces:

Mead0w1ark
/

MicroHS

Sleeping

MicroHS / hs_dataset.py

github-actions[bot]

Sync from GitHub 38cd8d69dc858672e22cd1448f7768fef87468b1

79f9b3a 4 days ago

11.6 kB

	"""
	Harmonized System dataset integration.

	Loads the official HS code dataset from:
	https://github.com/datasets/harmonized-system

	Provides:
	- Full HS code lookup (2, 4, 6 digit)
	- Section/chapter/heading/subheading hierarchy
	- HTS extension support (country-specific 7-10 digit codes)
	- Search by description
	"""

	import csv
	import json
	import os
	import re
	from pathlib import Path
	from typing import Optional

	# Directory that contains this module; the bundled data files live below it.
	PROJECT_DIR = Path(__file__).parents[0]
	# CSV export of the Harmonized System nomenclature, read by HSDataset.load().
	HS_DATA_PATH = PROJECT_DIR.joinpath("data", "harmonized-system", "harmonized-system.csv")
	# Pre-built JSON lookup of US HTS tariff lines, read lazily on first US query.
	US_HTS_LOOKUP_PATH = PROJECT_DIR.joinpath("data", "hts", "us_hts_lookup.json")


	class HSDataset:
	    """In-memory index of the Harmonized System (HS) nomenclature.

	    Rows are read from the CSV at ``HS_DATA_PATH`` by :meth:`load`. The query
	    methods operate on whatever is currently held in ``self.codes`` and do
	    not trigger loading themselves.
	    """

	    def __init__(self):
	        # hscode -> {'section', 'description', 'parent', 'level'}
	        self.codes = {}
	        # Reserved for section number -> section name; load() does not fill it.
	        self.sections = {}
	        self.chapters = {}      # level-2 code -> description
	        self.headings = {}      # level-4 code -> description
	        self.subheadings = {}   # level-6 code -> description
	        self._loaded = False

	    @staticmethod
	    def _normalize(hscode: str) -> str:
	        """Trim surrounding whitespace and drop dots and spaces from a code."""
	        return hscode.strip().replace('.', '').replace(' ', '')

	    @staticmethod
	    def _tokenize(text: str) -> set:
	        """Return the lower-cased words of *text*, ignoring punctuation."""
	        return set(re.findall(r'\w+', text.lower()))

	    def load(self) -> bool:
	        """Load the HS dataset from CSV.

	        Returns:
	            True when the data is (or already was) loaded, False when the
	            CSV file does not exist.

	        Raises:
	            KeyError: if a row lacks one of the expected columns.
	            ValueError: if a row's ``level`` is not an integer.
	        """
	        if self._loaded:
	            return True

	        if not HS_DATA_PATH.exists():
	            print(f"HS dataset not found at {HS_DATA_PATH}")
	            return False

	        by_level = {2: self.chapters, 4: self.headings, 6: self.subheadings}

	        # newline='' is what the csv module asks for so that it can handle
	        # line endings (including ones embedded in quoted fields) itself.
	        with open(HS_DATA_PATH, 'r', encoding='utf-8', newline='') as f:
	            for row in csv.DictReader(f):
	                hscode = row['hscode'].strip()
	                desc = row['description'].strip()
	                level = int(row['level'])

	                self.codes[hscode] = {
	                    'section': row['section'].strip(),
	                    'description': desc,
	                    'parent': row['parent'].strip(),
	                    'level': level,
	                }

	                bucket = by_level.get(level)
	                if bucket is not None:
	                    bucket[hscode] = desc

	        self._loaded = True
	        print(f"Loaded HS dataset: {len(self.chapters)} chapters, "
	              f"{len(self.headings)} headings, {len(self.subheadings)} subheadings")
	        return True

	    def lookup(self, hscode: str) -> Optional[dict]:
	        """Look up an HS code and return it together with its hierarchy.

	        Args:
	            hscode: Code to look up; dots and spaces are ignored.

	        Returns:
	            A copy of the stored entry extended with ``hierarchy`` (list of
	            ancestors from the top down, ending with the code itself),
	            ``hscode`` and, depending on the code length, ``chapter`` /
	            ``chapter_code`` / ``heading`` / ``heading_code`` /
	            ``subheading``. None when the code is unknown.
	        """
	        hscode = self._normalize(hscode)

	        if hscode not in self.codes:
	            return None

	        entry = self.codes[hscode].copy()

	        # Walk the parent links upwards. 'TOTAL' marks the root; `seen`
	        # stops the walk if malformed data contains a parent cycle.
	        hierarchy = []
	        seen = set()
	        current = hscode
	        while (current and current != 'TOTAL'
	               and current in self.codes and current not in seen):
	            seen.add(current)
	            node = self.codes[current]
	            hierarchy.append({
	                'code': current,
	                'description': node['description'],
	                'level': node['level'],
	            })
	            current = node['parent']
	        hierarchy.reverse()  # collected bottom-up, reported top-down

	        entry['hierarchy'] = hierarchy
	        entry['hscode'] = hscode

	        # Chapter / heading descriptions are derived from the code prefix.
	        if len(hscode) >= 2:
	            ch = hscode[:2]
	            entry['chapter'] = self.chapters.get(ch, '')
	            entry['chapter_code'] = ch
	        if len(hscode) >= 4:
	            hd = hscode[:4]
	            entry['heading'] = self.headings.get(hd, '')
	            entry['heading_code'] = hd
	        if len(hscode) == 6:
	            entry['subheading'] = self.subheadings.get(hscode, '')

	        return entry

	    def search(self, query: str, max_results: int = 20) -> list[dict]:
	        """Search level-6 HS codes by description text.

	        Words are compared case-insensitively with punctuation ignored, so
	        the query ``horses`` matches the description ``Horses; live``.

	        Args:
	            query: Free-text query.
	            max_results: Maximum number of hits; values <= 0 yield no hits.

	        Returns:
	            Hits sorted by descending score. The score is the fraction of
	            query words found in the description, plus 1.0 when the whole
	            query appears verbatim in the description.
	        """
	        query_lower = query.lower()
	        query_words = self._tokenize(query_lower)
	        if not query_words or max_results <= 0:
	            return []
	        phrase = query_lower.strip()

	        results = []
	        for hscode, info in self.codes.items():
	            if info['level'] != 6:
	                continue

	            desc_lower = info['description'].lower()
	            overlap = query_words & self._tokenize(desc_lower)
	            if not overlap:
	                continue

	            score = len(overlap) / len(query_words)
	            # Bonus for exact substring match
	            if phrase in desc_lower:
	                score += 1.0

	            results.append({
	                'hscode': hscode,
	                'description': info['description'],
	                'section': info['section'],
	                'score': score,
	            })

	        # Stable sort: equally scored hits keep their dataset order.
	        results.sort(key=lambda hit: hit['score'], reverse=True)
	        return results[:max_results]

	    def get_chapter_name(self, chapter_code: str) -> str:
	        """Return the description of a chapter code, or 'Unknown'.

	        The code is left-padded with zeros to two characters first.
	        """
	        return self.chapters.get(chapter_code.zfill(2), 'Unknown')

	    def validate_hs_code(self, hscode: str) -> dict:
	        """Validate an HS code and describe the outcome.

	        Args:
	            hscode: Code to check; dots and spaces are ignored.

	        Returns:
	            Dict with ``valid``, ``code`` (normalised), ``level``,
	            ``description`` and a non-empty human-readable ``message``.
	        """
	        hscode = self._normalize(hscode)

	        result = {
	            'valid': False,
	            'code': hscode,
	            'level': None,
	            'description': None,
	            'message': '',
	        }

	        if not re.fullmatch(r'\d{2,6}', hscode):
	            result['message'] = 'HS code must be 2-6 digits'
	            return result

	        info = self.codes.get(hscode)
	        if info is not None:
	            result['valid'] = True
	            result['level'] = info['level']
	            result['description'] = info['description']
	            result['message'] = f'Valid {info["level"]}-digit HS code'
	            return result

	        # Unknown code: report the closest existing ancestor so that every
	        # rejection carries an explanation, whatever the code length.
	        heading = hscode[:4]
	        chapter = hscode[:2]
	        if len(hscode) == 6 and heading in self.codes:
	            result['message'] = f'Heading {heading} exists but subheading {hscode} not found'
	        elif len(hscode) > 2 and chapter in self.codes:
	            result['message'] = f'Chapter {chapter} exists but code {hscode} not found'
	        else:
	            result['message'] = f'Code {hscode} not found in HS nomenclature'

	        return result

	    def get_all_6digit_codes(self) -> list[dict]:
	        """Return every level-6 HS code with its description and section."""
	        return [
	            {'hscode': code, 'description': info['description'], 'section': info['section']}
	            for code, info in self.codes.items()
	            if info['level'] == 6
	        ]


	# --- HTS Extensions ---
	# HTS (Harmonized Tariff Schedule) adds country-specific digits (7-10) after the 6-digit HS code.
	# This is a simplified reference for major trading partners.

	def _load_us_hts_extensions() -> dict:
	    """Read the pre-built US HTS lookup JSON and reshape it for the API.

	    Returns:
	        A dict with one entry per top-level key of the JSON file, each
	        mapped to a list of records with the keys ``hts``, ``description``,
	        ``general_duty``, ``special_duty`` and ``unit``. An empty dict is
	        returned when the lookup file does not exist.
	    """
	    if not US_HTS_LOOKUP_PATH.exists():
	        return {}

	    with open(US_HTS_LOOKUP_PATH, "r", encoding="utf-8") as handle:
	        lookup = json.load(handle)

	    def to_api_record(item):
	        # "hts_code" and "description" are mandatory in the source record;
	        # the duty and unit fields fall back to an empty string.
	        record = {"hts": item["hts_code"], "description": item["description"]}
	        for field in ("general_duty", "special_duty", "unit"):
	            record[field] = item.get(field, "")
	        return record

	    # Convert from build_hts_lookup format to API format
	    return {
	        hs6: [to_api_record(item) for item in items]
	        for hs6, items in lookup.items()
	    }


	# Module-level memo for the US HTS table; None means "not loaded yet".
	_us_hts_cache = None


	def _get_us_hts_extensions() -> dict:
	    """Return the US HTS extension table, loading it on first use.

	    The result of the first load is kept in ``_us_hts_cache`` and reused by
	    every later call, including when that first load produced an empty dict.
	    """
	    global _us_hts_cache
	    if _us_hts_cache is not None:
	        return _us_hts_cache
	    _us_hts_cache = _load_us_hts_extensions()
	    return _us_hts_cache


	# Per-country tariff schedules, keyed by country code.
	# Each entry holds:
	#   "name"       - display name of the national schedule
	#   "digits"     - total digit count of a full national code
	#   "format"     - dotted display mask for a full national code
	#   "extensions" - mapping of 6-digit HS code -> list of
	#                  {"hts": <national code>, "description": <text>} records
	# The EU, CN and JP tables below are small hand-written samples covering
	# only a few HS codes; the US table is not stored here (see the US entry).
	HTS_EXTENSIONS = {
	"US": {
	"name": "United States HTS",
	"digits": 10,
	"format": "XXXX.XX.XXXX",
	# Extensions loaded lazily from us_hts_lookup.json
	"extensions": None, # Sentinel — resolved in get_hts_extensions()
	},
	"EU": {
	"name": "EU Combined Nomenclature (CN)",
	"digits": 8,
	"format": "XXXX.XX.XX",
	"extensions": {
	"851712": [
	{"hts": "85171200", "description": "Telephones for cellular networks; smartphones"},
	],
	"847130": [
	{"hts": "84713000", "description": "Portable digital automatic data-processing machines, ≤ 10 kg"},
	],
	"870380": [
	{"hts": "87038000", "description": "Other vehicles, with electric motor for propulsion"},
	],
	}
	},
	# Note: this "CN" key is the country code for China, unrelated to the
	# EU "Combined Nomenclature (CN)" named in the entry above.
	"CN": {
	"name": "China Customs Tariff",
	"digits": 10,
	"format": "XXXX.XXXX.XX",
	"extensions": {
	"851712": [
	{"hts": "8517120010", "description": "Smartphones, 5G capable"},
	{"hts": "8517120090", "description": "Other mobile phones"},
	],
	"847130": [
	{"hts": "8471300000", "description": "Portable digital data processing machines"},
	],
	}
	},
	"JP": {
	"name": "Japan HS Tariff",
	"digits": 9,
	"format": "XXXX.XX.XXX",
	"extensions": {
	"851712": [
	{"hts": "851712000", "description": "Telephones for cellular networks or wireless"},
	],
	"870380": [
	{"hts": "870380000", "description": "Electric motor vehicles for passenger transport"},
	],
	}
	},
	}


	def get_hts_extensions(hs_code: str, country_code: str) -> Optional[dict]:
	    """
	    Look up country-specific tariff-line extensions for an HS code.

	    Args:
	        hs_code: HS code to look up (the extension tables are keyed by
	            6-digit codes); dots and spaces are removed first.
	        country_code: Key into HTS_EXTENSIONS (US, EU, CN, JP); whitespace
	            is trimmed and the value is upper-cased before matching.

	    Returns:
	        A dict in every case. For an unsupported country it carries
	        "available": False, a message listing the supported codes and an
	        empty "extensions" list. Otherwise it carries "available": True,
	        the schedule's name, digit count and format, the matching
	        "extensions" list (possibly empty) and a summary "message".
	    """
	    code = hs_code.strip().translate(str.maketrans('', '', '. '))
	    country = country_code.upper().strip()

	    tariff = HTS_EXTENSIONS.get(country)
	    if tariff is None:
	        supported = ', '.join(HTS_EXTENSIONS.keys())
	        return {
	            "available": False,
	            "country": country,
	            "message": f"HTS extensions not available for {country}. "
	                       f"Available: {supported}",
	            "extensions": [],
	        }

	    # The US table lives in a JSON file and is loaded on demand; the other
	    # schedules keep their table inline in HTS_EXTENSIONS.
	    if country == "US":
	        table = _get_us_hts_extensions()
	    else:
	        table = tariff["extensions"]
	    matches = table.get(code, [])

	    if matches:
	        summary = f"Found {len(matches)} HTS extension(s)"
	    else:
	        summary = (
	            f"No specific extensions found for {code} in {tariff['name']}. "
	            f"The base HS code {code} applies."
	        )

	    return {
	        "available": True,
	        "country": country,
	        "tariff_name": tariff["name"],
	        "total_digits": tariff["digits"],
	        "format": tariff["format"],
	        "extensions": matches,
	        "hs_code": code,
	        "message": summary,
	    }


	def get_available_hts_countries() -> list[dict]:
	    """List every schedule in HTS_EXTENSIONS as code / name / digits records."""
	    countries = []
	    for code, info in HTS_EXTENSIONS.items():
	        countries.append({
	            "code": code,
	            "name": info["name"],
	            "digits": info["digits"],
	        })
	    return countries


	# Module-wide shared instance; its data is read lazily by get_dataset().
	_dataset = HSDataset()


	def get_dataset() -> HSDataset:
	    """Return the shared HSDataset, attempting a load if it is not loaded yet.

	    The instance is returned even when loading fails (load() returns False
	    if the CSV is missing), so callers may receive an empty dataset.
	    """
	    if _dataset._loaded:
	        return _dataset
	    _dataset.load()
	    return _dataset