Spaces:

crazycrazypete
/

Masters-four-Tab-OpenAI

Running

Masters-four-Tab-OpenAI / backend /scripts /normalize_router_pricing_sources.py

Pete Dunn

Improve KB synthesis and rapid router canary flow

ed025e5 3 months ago

37 kB

	#!/usr/bin/env python3
	from __future__ import annotations

	import csv
	import os
	import re
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple

	import openpyxl
	import pdfplumber

	# Repository root: the third ancestor (parents[2]) of this script's resolved path.
	REPO_ROOT = Path(__file__).resolve().parents[2]
	# Directory that _iter_source_dirs() always lists first when collecting pricing sources.
	SOURCES_DIR = REPO_ROOT / "backend" / "app" / "knowledgebase" / "data" / "pricing_sources"
	# Destination directory for every normalized CSV and the summary text file.
	OUT_DIR = REPO_ROOT / "backend" / "app" / "knowledgebase" / "data" / "normalized"
	# Environment variable naming extra source directories, separated by "," or ";".
	EXTRA_SOURCES_ENV = "ROUTER_PRICING_EXTRA_SOURCE_DIRS"
	# Directories next to the repository that are scanned as well, but only if they exist.
	DEFAULT_EXTERNAL_SOURCE_DIRS: Tuple[Path, ...] = (
	REPO_ROOT.parent / "RAG SKU and PRice List",
	)


	def _display_path(path: Optional[Path]) -> str:
	if path is None:
	return ""
	try:
	return str(path.resolve().relative_to(REPO_ROOT))
	except Exception:
	try:
	return str(path.relative_to(REPO_ROOT))
	except Exception:
	return str(path)

	# Upper-case family names that _parse_parsec_family() looks for as substrings
	# of an upper-cased description. The list is scanned in order, so the first
	# entry found in the text wins.
	FAMILY_HINTS = [
	"AKITA",
	"CHINOOK",
	"HUSKY",
	"ALBATROSS",
	"WHIPPET",
	"AIREDALE",
	"BEAGLE",
	"BOXER",
	"DALMATIAN",
	"GREYHOUND",
	"IRISH SETTER",
	"LABRADOR",
	"ROTTWEILER",
	"DOBERMAN",
	"BORDER COLLIE",
	"GREAT DANE",
	"ST. BERNARD",
	]

	# Static manufacturer/model/title records that _verizon_gateway_web_rows()
	# turns into CatalogRow entries. They carry no price: the generated rows use
	# the placeholder MSRP text set in that function.
	VERIZON_GATEWAY_WEB_ROWS: Tuple[Dict[str, str], ...] = (
	{
	"manufacturer": "Verizon (OEM Arcadyan)",
	"model": "XC46BE",
	"title": "Verizon Business Internet Gateway XC46BE (Dragon)",
	},
	{
	"manufacturer": "Verizon (OEM Askey)",
	"model": "FSNO21VA",
	"title": "Verizon Internet Gateway Business FSNO21VA",
	},
	{
	"manufacturer": "Verizon (OEM Askey)",
	"model": "ASK-NCM1100E",
	"title": "Verizon Business Internet Gateway ASK-NCM1100E (Crown)",
	},
	{
	"manufacturer": "Verizon (OEM Askey)",
	"model": "ASK-NCQ1338E",
	"title": "Verizon Internet Gateway for Business ASK-NCQ1338E",
	},
	{
	"manufacturer": "Verizon (OEM Arris)",
	"model": "NVG558",
	"title": "Arris NVG558 LTE Router",
	},
	{
	"manufacturer": "Verizon (OEM Askey)",
	"model": "ASK-NCM1100",
	"title": "Verizon Home Internet Router ASK-NCM1100",
	},
	{
	"manufacturer": "Verizon (OEM Askey)",
	"model": "ASK-NCQ1338",
	"title": "Verizon Internet Gateway ASK-NCQ1338",
	},
	{
	"manufacturer": "Verizon (OEM Askey)",
	"model": "ASK-NCQ1338FA",
	"title": "Verizon Internet Gateway ASK-NCQ1338FA",
	},
	{
	"manufacturer": "Verizon (OEM Arcadyan)",
	"model": "ARC-XCI55AX",
	"title": "Verizon Internet Gateway ARC-XCI55AX",
	},
	{
	"manufacturer": "Verizon (OEM Wistron NeWeb)",
	"model": "WNC-CR200A",
	"title": "Verizon Internet Gateway WNC-CR200A",
	},
	)

	# Canonical model keys: _model_candidates() looks up each compacted token
	# (upper-case letters and digits only) here and substitutes the mapped value.
	# NOTE(review): lookups use _compact() output, which strips "-", so the
	# "CR202-LITE" key looks unreachable -- confirm before relying on it.
	MODEL_ALIAS = {
	"RX50": "RX55",
	"RX60": "XR60",
	"CR202LITE": "CR202",
	"CR202-LITE": "CR202",
	"MAXBR1PRO": "MAXBR1PRO",
	"MAXBR1PRO5G": "MAXBR1PRO5G",
	"BR1PRO": "MAXBR1PRO",
	"BR1PRO5G": "MAXBR1PRO5G",
	"BR1MINI": "BR1MINI5G",
	"MAXBR1MINI": "BR1MINI5G",
	"BR1MINI5G": "BR1MINI5G",
	"FW2000": "FW2000E",
	"MG51E": "MG51",
	}

	# Logical column name -> header spellings accepted as an exact match by
	# _header_index_map() (both sides are compared after _clean_header()).
	# Some spellings ("item name", "wan") appear under more than one logical
	# column, so a single sheet column can be mapped to several keys.
	HEADER_ALIASES = {
	"sku": ("sku", "partnumber", "part number", "item name", "part #", "parsec part #", "parsec accessories part #"),
	"title": ("product", "model", "item name", "short description", "product model", "variant"),
	"description": ("description", "long description", "desc (bom & accessory content)", "desc", "special offers & notes", "notes", "remark"),
	"msrp": ("msrp", "msrp / retail price", "list price", "list ($ usd)", "retail price", "map"),
	"term": ("term", "registration", "subscription duration included"),
	"wifi": ("wifi", "wi-fi", "wifi standard", "wi-fi standard", "wifi capability", "wireless"),
	"ethernet": ("ethernet ports", "ethernet port", "lan", "lan ports and speed", "wan", "wan ports and speed"),
	"serial": ("serial", "serial port", "serial ports"),
	"battery": ("battery",),
	"modem": ("cellular networks", "wan", "modem", "modem type", "cat/cellular", "cat"),
	"poe": ("poe",),
	}

	@dataclass
	class CatalogRow:
	    """One normalized catalog entry, written out as a CSV row by the writers."""

	    # Provenance of the record.
	    source_file: str
	    source_sheet: str
	    source_row: int

	    # Identity of the product.
	    manufacturer: str
	    item_type: str
	    title: str
	    model: str
	    model_key: str
	    sku: str

	    # Pricing: display text plus the parsed number (None when unparsed).
	    msrp: str
	    msrp_numeric: Optional[float]
	    term: str

	    # Free-text hardware attributes copied or derived from the source row.
	    wifi: str
	    ethernet_ports: str
	    serial_ports: str
	    battery: str
	    ruggedization: str
	    modem_type: str
	    primary_use_case: str
	    device_type: str
	    poe: str

	    # Description text and a label describing where the data came from.
	    notes: str
	    sources_label: str


	def _norm(text: Any) -> str:
	return re.sub(r"\s+", " ", str(text or "").strip())


	def _compact(text: Any) -> str:
	return re.sub(r"[^A-Z0-9]", "", _norm(text).upper())


	def _file_key(path_or_name: Any) -> str:
	name = Path(str(path_or_name or "")).name
	return re.sub(r"[^a-z0-9]+", "", name.lower())


	def _safe_mtime(path: Path) -> float:
	try:
	return float(path.stat().st_mtime)
	except Exception:
	return 0.0


	def _iter_source_dirs() -> List[Path]:
	    """List the directories to scan for pricing sources, without duplicates.

	    SOURCES_DIR always comes first (even if missing). Then come the default
	    external directories and the entries of the EXTRA_SOURCES_ENV variable
	    (split on "," or ";"), each kept only when it is an existing directory.
	    Duplicates are detected on the resolved path; the first occurrence wins.
	    """
	    candidates: List[Path] = [SOURCES_DIR]
	    candidates.extend(d for d in DEFAULT_EXTERNAL_SOURCE_DIRS if d.is_dir())

	    env_value = str(os.getenv(EXTRA_SOURCES_ENV, "") or "").strip()
	    for chunk in re.split(r"[,;]", env_value):
	        chunk = chunk.strip()
	        if not chunk:
	            continue
	        extra = Path(chunk).expanduser()
	        if extra.is_dir():
	            candidates.append(extra)

	    # A dict keeps insertion order, so setdefault() retains the first
	    # directory seen for every identity.
	    unique: Dict[str, Path] = {}
	    for directory in candidates:
	        identity = str(directory.resolve()) if directory.exists() else str(directory)
	        unique.setdefault(identity, directory)
	    return list(unique.values())


	def _iter_source_files(extensions: Sequence[str]) -> List[Path]:
	    """Collect source files from every source directory, newest copy per name.

	    Args:
	        extensions: File suffixes to accept (for example ".xlsx"), compared
	            case-insensitively. An empty sequence accepts every file.

	    Returns:
	        One path per distinct _file_key() of the file name, sorted by
	        lower-cased name and then by full path. When the same key occurs in
	        several directories, the file with the latest modification time is
	        kept.
	    """
	    wanted = {str(x).strip().lower() for x in extensions if str(x).strip()}
	    newest: Dict[str, Path] = {}
	    for src_dir in _iter_source_dirs():
	        # SOURCES_DIR is listed even when it does not exist; iterdir() on a
	        # missing directory would raise instead of yielding "no sources".
	        if not src_dir.is_dir():
	            continue
	        for path in sorted(src_dir.iterdir()):
	            # "~$" files are temporary lock files left by office applications.
	            if (not path.is_file()) or path.name.startswith("~$"):
	                continue
	            if wanted and path.suffix.lower() not in wanted:
	                continue
	            key = _file_key(path.name)
	            prev = newest.get(key)
	            if prev is None or _safe_mtime(path) > _safe_mtime(prev):
	                newest[key] = path
	    return sorted(newest.values(), key=lambda p: (p.name.lower(), str(p)))


	def _is_parsec_source_name(path: Path) -> bool:
	    """Tell whether the file name looks like a Parsec workbook (.xlsx)."""
	    name_key = _file_key(path.name)
	    if not name_key.endswith("xlsx"):
	        return False
	    return "parsec" in name_key


	def _is_peplink_overlay_source_name(path: Path) -> bool:
	    """Tell whether the file name looks like the Peplink replacement-device workbook.

	    The key must end in "xlsx", contain both "peplink" and "device", and
	    contain "replacement" (the misspelling "repalcement" is accepted too).
	    """
	    name_key = _file_key(path.name)
	    required_ok = (
	        name_key.endswith("xlsx")
	        and "peplink" in name_key
	        and "device" in name_key
	    )
	    return required_ok and any(
	        word in name_key for word in ("replacement", "repalcement")
	    )


	def _find_parsec_source_xlsx() -> Optional[Path]:
	    """Pick the preferred Parsec workbook, or None when there is none.

	    Files whose key contains "pricing" are preferred; ties go to the most
	    recently modified file, then to the alphabetically first name.
	    """

	    def preference(p: Path) -> Tuple[bool, float, str]:
	        # False sorts before True, so names containing "pricing" come first.
	        return ("pricing" not in _file_key(p.name), -_safe_mtime(p), p.name.lower())

	    matches = [p for p in _iter_source_files((".xlsx",)) if _is_parsec_source_name(p)]
	    return min(matches, key=preference, default=None)


	def _find_peplink_overlay_source_xlsx() -> Optional[Path]:
	    """Pick the preferred Peplink replacement workbook, or None when there is none.

	    Files whose key contains "replacementdevices" are preferred; ties go to
	    the most recently modified file, then to the alphabetically first name.
	    """

	    def preference(p: Path) -> Tuple[bool, float, str]:
	        # False sorts before True, so the exact "replacementdevices" names win.
	        return (
	            "replacementdevices" not in _file_key(p.name),
	            -_safe_mtime(p),
	            p.name.lower(),
	        )

	    matches = [
	        p for p in _iter_source_files((".xlsx",)) if _is_peplink_overlay_source_name(p)
	    ]
	    return min(matches, key=preference, default=None)


	def _iter_router_pricing_xlsx_sources() -> List[Path]:
	    """Return the .xlsx sources that are neither Parsec nor Peplink-overlay workbooks."""
	    return [
	        workbook
	        for workbook in _iter_source_files((".xlsx",))
	        if not (
	            _is_parsec_source_name(workbook)
	            or _is_peplink_overlay_source_name(workbook)
	        )
	    ]


	def _find_atel_pdf() -> Optional[Path]:
	    """Pick the newest ATEL pricing/wholesale PDF, or None when there is none.

	    A PDF qualifies when its file key contains "atel" together with
	    "pricing" or "wholesale". Ties on modification time go to the
	    alphabetically first name.
	    """

	    def qualifies(p: Path) -> bool:
	        name_key = _file_key(p.name)
	        return "atel" in name_key and ("pricing" in name_key or "wholesale" in name_key)

	    matches = [p for p in _iter_source_files((".pdf",)) if qualifies(p)]
	    return min(matches, key=lambda p: (-_safe_mtime(p), p.name.lower()), default=None)


	def _parse_money(value: Any) -> Tuple[str, Optional[float]]:
	raw = _norm(value)
	if not raw:
	return "", None
	m = re.search(r"(-?\d{1,3}(?:,\d{3})*(?:\.\d+)?\|-?\d+(?:\.\d+)?)", raw.replace("$", ""))
	if not m:
	return raw, None
	try:
	num = float(m.group(1).replace(",", ""))
	except Exception:
	return raw, None
	return f"${num:,.2f}", num


	def _extract_term(text: str) -> str:
	low = text.lower()
	m = re.search(r"\b([135])\s*[- ]?year\b", low)
	if m:
	return f"{m.group(1)}YR"
	m = re.search(r"\b([135])\s*yr\b", low)
	if m:
	return f"{m.group(1)}YR"
	return ""


	def _model_candidates(text: str) -> List[str]:
	    """Extract candidate model keys from free text, in order of discovery.

	    Two passes are made over the normalized text:

	    1. Known multi-word Peplink phrases ("MAX BR1 Pro 5G", ...), which the
	       generic pattern cannot see because "BR1" carries a single digit.
	    2. Generic tokens: 1-8 letters, an optional "-" or space, 2-4 digits and
	       an optional alphanumeric/hyphen tail.

	    Every hit is compacted to upper-case alphanumerics and mapped through
	    MODEL_ALIAS. Duplicates are dropped, keeping the first occurrence.

	    Args:
	        text: Title/description/SKU text to scan.

	    Returns:
	        Distinct model keys; an empty list when nothing is recognized.
	    """
	    raw = _norm(text)
	    if not raw:
	        return []
	    out: List[str] = []
	    seen: set[str] = set()

	    # Separators are optional (\s*): the previous patterns demanded exactly
	    # one whitespace character between "br" and "1", so the usual "BR1"
	    # spelling was never recognized.
	    phrase_patterns = [
	        r"\bmax\s*br\s*1\s*pro\s*5g\b",
	        r"\bmax\s*br\s*1\s*mini\s*5g\b",
	        r"\bmax\s*br\s*1\s*pro\b",
	        r"\bmax\s*br\s*1\s*mini\b",
	    ]
	    for patt in phrase_patterns:
	        for m in re.finditer(patt, raw, flags=re.IGNORECASE):
	            tok = _compact(m.group(0))
	            tok = MODEL_ALIAS.get(tok, tok)
	            if tok and tok not in seen:
	                seen.add(tok)
	                out.append(tok)

	    for m in re.finditer(r"\b[A-Za-z]{1,8}[- ]?\d{2,4}[A-Za-z0-9\-]*\b", raw):
	        tok = _compact(m.group(0))
	        if not tok:
	            continue
	        if tok.isdigit() or len(tok) < 4:
	            continue
	        # A model key needs at least one letter and at least one digit.
	        if (not any(c.isalpha() for c in tok)) or (not any(c.isdigit() for c in tok)):
	            continue
	        tok = MODEL_ALIAS.get(tok, tok)
	        if tok in {"SKU", "MSRP", "MODEL"}:
	            continue
	        if tok not in seen:
	            seen.add(tok)
	            out.append(tok)
	    return out


	def _manufacturer_from_filename(name: str) -> str:
	low = name.lower()
	if "semtech" in low or "sierra" in low:
	return "Semtech"
	if "ericsson" in low or "cradlepoint" in low:
	return "Ericsson Cradlepoint"
	if "inseego" in low:
	return "Inseego"
	if "inhand" in low:
	return "InHand Networks"
	if "peplink" in low:
	return "Peplink"
	if "parsec" in low:
	return "Parsec"
	if "atel" in low:
	return "ATEL"
	return "Unknown"


	def _clean_header(text: Any) -> str:
	    """Reduce a header cell to lower-case letters and digits for comparison."""
	    compacted = _compact(text)
	    return compacted.lower()


	def _header_index_map(header_row: Sequence[Any]) -> Dict[str, int]:
	    """Map logical column names to positions in a header row.

	    For every key of HEADER_ALIASES the leftmost non-empty header that
	    exactly equals one of the key's aliases (after _clean_header) is used.
	    Failing that, the leftmost header satisfying the key's looser substring
	    rule is used. Keys with no match are left out of the result; "term" has
	    no substring rule.
	    """
	    fuzzy_rules = {
	        "msrp": lambda h: any(t in h for t in ("msrp", "listprice", "retailprice", "listusd")),
	        "sku": lambda h: "sku" in h or "partnumber" in h or h.startswith("part"),
	        "title": lambda h: any(t in h for t in ("product", "model", "itemname", "shortdescription")),
	        "description": lambda h: any(
	            t in h for t in ("description", "longdescription", "desc", "notes", "remark")
	        ),
	        "ethernet": lambda h: "ethernet" in h,
	        "serial": lambda h: "serial" in h,
	        "wifi": lambda h: "wifi" in h,
	        "battery": lambda h: "battery" in h,
	        "modem": lambda h: h == "wan" or any(t in h for t in ("cellular", "modem", "cat")),
	        "poe": lambda h: "poe" in h,
	    }

	    cleaned = [_clean_header(cell) for cell in header_row]
	    # Empty headers can never match, so drop them once up front.
	    populated = [(pos, text) for pos, text in enumerate(cleaned) if text]

	    result: Dict[str, int] = {}
	    for field, aliases in HEADER_ALIASES.items():
	        exact_names = {_clean_header(alias) for alias in aliases}
	        pos = next((p for p, text in populated if text in exact_names), None)
	        if pos is None:
	            rule = fuzzy_rules.get(field)
	            if rule is not None:
	                pos = next((p for p, text in populated if rule(text)), None)
	        if pos is not None:
	            result[field] = pos
	    return result


	def _detect_header_row(rows: Sequence[Sequence[Any]]) -> Optional[int]:
	    """Locate the header row within the first 35 rows of a sheet.

	    A row qualifies when its joined, lower-cased text mentions at least two
	    of the three keyword groups (price, part identifier, product text). The
	    index of the first qualifying row is returned, or None.
	    """
	    keyword_groups = (
	        ("msrp", "list price", "retail price"),
	        ("sku", "part number", "partnumber"),
	        ("description", "product", "model"),
	    )
	    for position, cells in enumerate(rows[:35]):
	        texts = (_norm(cell) for cell in cells)
	        joined = " ".join(t.lower() for t in texts if t)
	        if not joined:
	            continue
	        groups_hit = sum(
	            any(word in joined for word in group) for group in keyword_groups
	        )
	        if groups_hit >= 2:
	            return position
	    return None


	def _rowval(row: Sequence[Any], idx_map: Dict[str, int], key: str) -> str:
	    """Return the normalized cell for logical column *key*, or "" if unavailable.

	    "Unavailable" means the column is not in *idx_map* or the row is too
	    short to contain it.
	    """
	    position = idx_map.get(key)
	    has_cell = position is not None and position < len(row)
	    return _norm(row[position]) if has_cell else ""


	def _infer_device_type(blob_low: str) -> str:
	if any(t in blob_low for t in ("license", "subscription", "support", "renewal", "service", "primecare", "incontrol", "airlink complete")):
	return "Service/License"
	if any(t in blob_low for t in ("antenna", "mount", "cable", "adapter", "accessory", "bracket")):
	return "Accessory"
	if any(t in blob_low for t in ("gateway", "router", "fwa", "cpe", "modem")):
	return "Router"
	return "Unknown"


	def _extract_ruggedization(blob_low: str) -> str:
	hits = []
	for term in ("ip67", "ip66", "ip65", "ip64", "ip54", "mil-std", "vibration", "ignition sensing", "metal housing", "aluminum"):
	if term in blob_low:
	hits.append(term)
	return ", ".join(dict.fromkeys(hits))


	def _extract_poe(blob_low: str, explicit: str) -> str:
	if explicit:
	return explicit
	if "poe" in blob_low:
	return "yes"
	return ""


	def _xlsx_catalog_rows(path: Path) -> List[CatalogRow]:
	    """Collect catalog rows from every sheet of one manufacturer pricing workbook."""
	    manu = _manufacturer_from_filename(path.name)
	    found: List[CatalogRow] = []
	    wb = openpyxl.load_workbook(path, data_only=True, read_only=True)
	    try:
	        for ws in wb.worksheets:
	            # The scanned area is capped at 4000 rows by 80 columns.
	            matrix: List[List[Any]] = [
	                list(r)
	                for r in ws.iter_rows(
	                    min_row=1,
	                    max_row=min(4000, ws.max_row or 0),
	                    min_col=1,
	                    max_col=min(80, ws.max_column or 0),
	                    values_only=True,
	                )
	            ]
	            if not matrix:
	                continue
	            hdr_idx = _detect_header_row(matrix)
	            if hdr_idx is None:
	                continue
	            idx_map = _header_index_map(matrix[hdr_idx])
	            # A usable sheet needs a price column plus at least one column
	            # that can identify the product.
	            if ("msrp" not in idx_map) or (
	                "sku" not in idx_map and "title" not in idx_map and "description" not in idx_map
	            ):
	                continue
	            for i in range(hdr_idx + 1, len(matrix)):
	                row = matrix[i]
	                if not row:
	                    continue
	                sku = _rowval(row, idx_map, "sku")
	                title = _rowval(row, idx_map, "title")
	                desc = _rowval(row, idx_map, "description")
	                msrp_raw = _rowval(row, idx_map, "msrp")
	                msrp, msrp_n = _parse_money(msrp_raw)
	                if not msrp and msrp_n is None:
	                    continue
	                blob = " ".join(x for x in (title, desc, sku) if x)
	                blob_low = blob.lower()
	                if not blob:
	                    continue
	                device_type = _infer_device_type(blob_low)
	                if device_type == "Service/License":
	                    # Service/license lines are not router model catalog rows.
	                    continue
	                model_candidates = _model_candidates(blob)
	                if not model_candidates:
	                    continue
	                model_key = model_candidates[0]
	                found.append(
	                    CatalogRow(
	                        source_file=path.name,
	                        source_sheet=ws.title,
	                        source_row=i + 1,  # 1-based row number in the sheet
	                        manufacturer=manu,
	                        # Everything not classified as a router is an accessory.
	                        item_type="Router" if device_type == "Router" else "Accessory",
	                        title=title or model_key,
	                        model=model_key,
	                        model_key=model_key,
	                        sku=sku,
	                        msrp=msrp or _norm(msrp_raw),
	                        msrp_numeric=msrp_n,
	                        term=_rowval(row, idx_map, "term") or _extract_term(blob),
	                        wifi=_rowval(row, idx_map, "wifi"),
	                        ethernet_ports=_rowval(row, idx_map, "ethernet"),
	                        serial_ports=_rowval(row, idx_map, "serial"),
	                        battery=_rowval(row, idx_map, "battery"),
	                        ruggedization=_extract_ruggedization(blob_low),
	                        modem_type=_rowval(row, idx_map, "modem"),
	                        primary_use_case="",
	                        device_type=device_type,
	                        poe=_extract_poe(blob_low, _rowval(row, idx_map, "poe")),
	                        notes=desc,
	                        sources_label="manufacturer pricing sheet",
	                    )
	                )
	    finally:
	        # Read-only workbooks keep the file open until closed explicitly.
	        wb.close()
	    return found


	def _atel_catalog_rows(atel: Path) -> List[CatalogRow]:
	    """Collect catalog rows from the ATEL PDF (MSRP only; wholesale fields are skipped)."""

	    def col(head: Sequence[str], name: str) -> int:
	        # Index of the first header cell containing *name*, or -1.
	        for i, h in enumerate(head):
	            if name in h:
	                return i
	        return -1

	    found: List[CatalogRow] = []
	    with pdfplumber.open(str(atel)) as pdf:
	        for pidx, page in enumerate(pdf.pages, start=1):
	            for table in page.extract_tables() or []:
	                if not table or len(table) < 2:
	                    continue
	                # The second table row is read as the header; data starts
	                # at the third row.
	                head = [_norm(x).lower() for x in table[1]]
	                model_i = col(head, "model")
	                sku_i = col(head, "sku")
	                msrp_i = col(head, "msrp")
	                desc_i = col(head, "desc")
	                if model_i < 0 or msrp_i < 0:
	                    continue
	                for ridx, r in enumerate(table[2:], start=3):
	                    model_raw = _norm(r[model_i] if model_i < len(r) else "")
	                    sku = _norm(r[sku_i] if 0 <= sku_i < len(r) else "")
	                    desc = _norm(r[desc_i] if 0 <= desc_i < len(r) else "")
	                    msrp_raw = _norm(r[msrp_i] if msrp_i < len(r) else "")
	                    if not model_raw or not msrp_raw:
	                        continue
	                    msrp, msrp_n = _parse_money(msrp_raw)
	                    model_cands = _model_candidates(f"{model_raw} {desc}")
	                    model_key = model_cands[0] if model_cands else _compact(model_raw)
	                    if not model_key or model_key.isdigit() or len(model_key) < 4:
	                        continue
	                    found.append(
	                        CatalogRow(
	                            source_file=atel.name,
	                            source_sheet=f"page_{pidx}",
	                            source_row=ridx,
	                            manufacturer="ATEL",
	                            item_type="Router",
	                            title=model_raw,
	                            model=model_key,
	                            model_key=model_key,
	                            sku=sku,
	                            msrp=msrp or msrp_raw,
	                            msrp_numeric=msrp_n,
	                            term=_extract_term(f"{model_raw} {desc}"),
	                            wifi="",
	                            ethernet_ports="",
	                            serial_ports="",
	                            battery="",
	                            ruggedization=_extract_ruggedization(desc.lower()),
	                            modem_type="",
	                            primary_use_case="",
	                            device_type="Router",
	                            poe=_extract_poe(desc.lower(), ""),
	                            notes=desc,
	                            sources_label="manufacturer pricing sheet (MSRP only)",
	                        )
	                    )
	    return found


	def _dedupe_catalog_rows(rows: Iterable[CatalogRow]) -> List[CatalogRow]:
	    """Drop rows repeating an earlier (manufacturer, model, SKU, term, MSRP) combination."""
	    deduped: List[CatalogRow] = []
	    seen: set[Tuple[str, str, str, str, str]] = set()
	    for row in rows:
	        fp = (
	            _compact(row.manufacturer),
	            _compact(row.model_key),
	            _compact(row.sku),
	            _compact(row.term),
	            _compact(row.msrp),
	        )
	        if fp in seen:
	            continue
	        seen.add(fp)
	        deduped.append(row)
	    return deduped


	def _build_catalog_rows() -> List[CatalogRow]:
	    """Build the de-duplicated catalog from the pricing workbooks and the ATEL PDF.

	    Returns:
	        Rows from every router pricing workbook (in source-file order)
	        followed by rows from the ATEL PDF when one is found, with later
	        duplicates removed.
	    """
	    rows_out: List[CatalogRow] = []
	    for path in _iter_router_pricing_xlsx_sources():
	        rows_out.extend(_xlsx_catalog_rows(path))

	    atel = _find_atel_pdf()
	    if atel and atel.exists():
	        rows_out.extend(_atel_catalog_rows(atel))

	    return _dedupe_catalog_rows(rows_out)


	def _verizon_gateway_web_rows() -> List[CatalogRow]:
	    """
	    Build catalog rows for the Verizon gateway models in VERIZON_GATEWAY_WEB_ROWS.

	    The rows exist so the model tokens can be resolved; they are labelled as
	    web-sourced and carry a placeholder MSRP instead of a price. Entries
	    whose model compacts to an empty key are skipped.
	    """
	    web_note = (
	        "Web-sourced (not from internal docs): model token from Verizon support "
	        "gateway equipment pages; verify exact hardware revision before quoting."
	    )
	    built: List[CatalogRow] = []
	    for position, entry in enumerate(VERIZON_GATEWAY_WEB_ROWS, start=1):
	        model_text = _norm(entry.get("model", ""))
	        key = _compact(model_text)
	        if not key:
	            continue
	        built.append(
	            CatalogRow(
	                source_file="verizon_support_gateway_models_web.csv",
	                source_sheet="verizon_support_web",
	                source_row=position,
	                manufacturer=_norm(entry.get("manufacturer", "")) or "Verizon",
	                item_type="Router",
	                title=_norm(entry.get("title", "")) or model_text,
	                model=key,
	                model_key=key,
	                sku=model_text,
	                msrp="Unknown, ask Masters",
	                msrp_numeric=None,
	                term="",
	                wifi="",
	                ethernet_ports="",
	                serial_ports="",
	                battery="",
	                ruggedization="",
	                modem_type="5G gateway (web-sourced model index)",
	                primary_use_case="Verizon gateway model token coverage",
	                device_type="Router",
	                poe="",
	                notes=web_note,
	                sources_label="Web-sourced (not from internal docs): Verizon support equipment listing",
	            )
	        )
	    return built


	def _write_router_catalog(rows: Sequence[CatalogRow]) -> Path:
	    """Write *rows* to router_pricing_catalog_normalized.csv and return its path.

	    The output directory is created when missing. Columns follow the
	    CatalogRow field order; msrp_numeric is written with two decimals, or
	    left empty when it is not a float.
	    """
	    OUT_DIR.mkdir(parents=True, exist_ok=True)
	    target = OUT_DIR / "router_pricing_catalog_normalized.csv"
	    columns = [
	        "source_file", "source_sheet", "source_row", "manufacturer",
	        "item_type", "title", "model", "model_key", "sku", "msrp",
	        "msrp_numeric", "term", "wifi", "ethernet_ports", "serial_ports",
	        "battery", "ruggedization", "modem_type", "primary_use_case",
	        "device_type", "poe", "notes", "sources_label",
	    ]
	    with target.open("w", encoding="utf-8", newline="") as handle:
	        writer = csv.DictWriter(handle, fieldnames=columns)
	        writer.writeheader()
	        for entry in rows:
	            # Every column is named after the CatalogRow attribute it holds.
	            record = {column: getattr(entry, column) for column in columns}
	            number = entry.msrp_numeric
	            record["msrp_numeric"] = f"{number:.2f}" if isinstance(number, float) else ""
	            writer.writerow(record)
	    return target


	def _write_variant_options(rows: Sequence[CatalogRow]) -> Path:
	    """Write the per-model variant list to router_variant_options_normalized.csv.

	    Rows are grouped by (manufacturer, model_key). Within a group they are
	    ordered by term ("1YR" first, then no term, then anything else), then by
	    numeric MSRP (rows without one last), then by SKU. Exactly one row per
	    group -- the first in that order -- is flagged default_option="yes".

	    Args:
	        rows: Catalog rows to group and write.

	    Returns:
	        Path of the written CSV file.
	    """
	    # Do not rely on the caller having created the output directory.
	    OUT_DIR.mkdir(parents=True, exist_ok=True)
	    out = OUT_DIR / "router_variant_options_normalized.csv"
	    fields = [
	        "manufacturer",
	        "model_key",
	        "model",
	        "default_option",
	        "sku",
	        "title",
	        "term",
	        "msrp",
	        "msrp_numeric",
	        "wifi",
	        "ethernet_ports",
	        "serial_ports",
	        "poe",
	        "battery",
	        "modem_type",
	        "source_file",
	        "source_sheet",
	        "source_row",
	    ]
	    groups: Dict[Tuple[str, str], List[CatalogRow]] = {}
	    for r in rows:
	        groups.setdefault((r.manufacturer, r.model_key), []).append(r)

	    def rank_row(r: CatalogRow) -> Tuple[int, float, str]:
	        term = _norm(r.term).upper()
	        if term == "1YR":
	            term_rank = 0
	        elif not term:
	            term_rank = 1
	        else:
	            term_rank = 2
	        # Rows without a parsed price sort after every priced row.
	        msrp = float(r.msrp_numeric) if isinstance(r.msrp_numeric, float) else 999999.0
	        return (term_rank, msrp, _norm(r.sku).upper())

	    with out.open("w", encoding="utf-8", newline="") as f:
	        w = csv.DictWriter(f, fieldnames=fields)
	        w.writeheader()
	        for (manu, key), items in sorted(groups.items()):
	            items_sorted = sorted(items, key=rank_row)
	            # The default is chosen by position, not by SKU equality: blank
	            # or repeated SKUs would otherwise flag several rows as default.
	            for position, r in enumerate(items_sorted):
	                w.writerow(
	                    {
	                        "manufacturer": manu,
	                        "model_key": key,
	                        "model": r.model,
	                        "default_option": "yes" if position == 0 else "no",
	                        "sku": r.sku,
	                        "title": r.title,
	                        "term": r.term,
	                        "msrp": r.msrp,
	                        "msrp_numeric": f"{r.msrp_numeric:.2f}" if isinstance(r.msrp_numeric, float) else "",
	                        "wifi": r.wifi,
	                        "ethernet_ports": r.ethernet_ports,
	                        "serial_ports": r.serial_ports,
	                        "poe": r.poe,
	                        "battery": r.battery,
	                        "modem_type": r.modem_type,
	                        "source_file": r.source_file,
	                        "source_sheet": r.source_sheet,
	                        "source_row": r.source_row,
	                    }
	                )
	    return out


	def _parse_parsec_family(description: str) -> str:
	    """Derive a title-cased family name from a Parsec description.

	    The first FAMILY_HINTS entry found in the upper-cased text is used.
	    Otherwise the words following "PRO" are used, and "" is returned when
	    neither is found.
	    """
	    shouted = _norm(description).upper()
	    known = next((family for family in FAMILY_HINTS if family in shouted), None)
	    if known is not None:
	        return known.title()
	    after_pro = re.search(r"PRO\s+([A-Z][A-Z ]{2,30})", shouted)
	    return _norm(after_pro.group(1)).title() if after_pro else ""


	def _write_parsec_pricing() -> Path:
	    """Normalize the Parsec workbook into parsec_pricing_normalized.csv.

	    Every sheet with a detectable header row and both a SKU and an MSRP
	    column is read (capped at 2500 rows by 20 columns). Rows lacking a part
	    number or a parseable price are skipped. A header-only CSV is written
	    when no Parsec workbook is found.

	    Returns:
	        Path of the written CSV file.
	    """
	    src = _find_parsec_source_xlsx()
	    # Do not rely on the caller having created the output directory.
	    OUT_DIR.mkdir(parents=True, exist_ok=True)
	    out = OUT_DIR / "parsec_pricing_normalized.csv"
	    fields = [
	        "source_file",
	        "source_sheet",
	        "source_row",
	        "manufacturer",
	        "category",
	        "family",
	        "part_number",
	        "title",
	        "description",
	        "msrp",
	        "msrp_numeric",
	        "connector_summary",
	        "fit_profile",
	    ]
	    # NOTE(review): the "\s" patterns require exactly one whitespace character
	    # ("2 LTE", "RP SMA"); confirm whether "2LTE"/"RP-SMA" should match too.
	    connector_patterns = (r"\b\d\sLTE\b", r"\b\d\sWiFi\b", r"\b\d\sGNSS\b", r"SMA", r"RP\sSMA")
	    rows: List[Dict[str, Any]] = []
	    if src and src.exists():
	        wb = openpyxl.load_workbook(src, data_only=True, read_only=True)
	        try:
	            for ws in wb.worksheets:
	                matrix = [
	                    list(r)
	                    for r in ws.iter_rows(
	                        min_row=1,
	                        max_row=min(2500, ws.max_row or 0),
	                        min_col=1,
	                        max_col=min(20, ws.max_column or 0),
	                        values_only=True,
	                    )
	                ]
	                if not matrix:
	                    continue
	                hdr_idx = _detect_header_row(matrix)
	                if hdr_idx is None:
	                    continue
	                idx_map = _header_index_map(matrix[hdr_idx])
	                if "sku" not in idx_map or "msrp" not in idx_map:
	                    continue
	                for i in range(hdr_idx + 1, len(matrix)):
	                    row = matrix[i]
	                    part = _rowval(row, idx_map, "sku")
	                    desc = _rowval(row, idx_map, "description") or _rowval(row, idx_map, "title")
	                    msrp_raw = _rowval(row, idx_map, "msrp")
	                    if not part or not msrp_raw:
	                        continue
	                    msrp, msrp_n = _parse_money(msrp_raw)
	                    if msrp_n is None:
	                        continue
	                    # First matching keyword decides the fit profile.
	                    fit = ""
	                    low_desc = desc.lower()
	                    if "vehicle" in low_desc or "mobile" in low_desc:
	                        fit = "vehicle"
	                    elif "indoor" in low_desc:
	                        fit = "indoor"
	                    elif "directional" in low_desc:
	                        fit = "directional"
	                    elif "outdoor" in low_desc or "pole" in low_desc or "wall" in low_desc:
	                        fit = "outdoor fixed"
	                    connector_hits = (
	                        re.search(patt, desc, flags=re.IGNORECASE) for patt in connector_patterns
	                    )
	                    connector_summary = "; ".join(_norm(m.group(0)) for m in connector_hits if m)
	                    rows.append(
	                        {
	                            "source_file": src.name,
	                            "source_sheet": ws.title,
	                            "source_row": i + 1,  # 1-based row number in the sheet
	                            "manufacturer": "Parsec",
	                            "category": ws.title,
	                            "family": _parse_parsec_family(desc or part),
	                            "part_number": part,
	                            "title": part,
	                            "description": desc,
	                            "msrp": msrp,
	                            "msrp_numeric": f"{msrp_n:.2f}",
	                            "connector_summary": connector_summary,
	                            "fit_profile": fit,
	                        }
	                    )
	        finally:
	            # Read-only workbooks keep the file open until closed explicitly.
	            wb.close()
	    with out.open("w", encoding="utf-8", newline="") as f:
	        w = csv.DictWriter(f, fieldnames=fields)
	        w.writeheader()
	        w.writerows(rows)
	    return out


	def _write_peplink_overlay() -> Path:
	    """Normalize the Peplink replacement list into peplink_replacement_overlay.csv.

	    Only the first worksheet is read (capped at 1000 rows by 20 columns) and
	    its first row is taken as the header. Rows missing either the old or
	    the replacement item name are skipped. Every emitted row carries the
	    fixed EOS/EOL years and note set below. A header-only CSV is written
	    when no source workbook is found.

	    Returns:
	        Path of the written CSV file.
	    """
	    src = _find_peplink_overlay_source_xlsx()
	    # Do not rely on the caller having created the output directory.
	    OUT_DIR.mkdir(parents=True, exist_ok=True)
	    out = OUT_DIR / "peplink_replacement_overlay.csv"
	    fields = [
	        "source_file",
	        "source_sheet",
	        "source_row",
	        "old_item_type",
	        "old_item_name",
	        "old_description",
	        "old_model_key",
	        "old_sku",
	        "new_item_type",
	        "new_item_name",
	        "new_description",
	        "new_model_key",
	        "new_sku",
	        "eos_year",
	        "eol_year",
	        "notes",
	    ]
	    rows: List[Dict[str, Any]] = []
	    if src and src.exists():
	        wb = openpyxl.load_workbook(src, data_only=True, read_only=True)
	        try:
	            ws = wb.worksheets[0]
	            sheet_title = ws.title
	            matrix = [
	                list(r)
	                for r in ws.iter_rows(
	                    min_row=1,
	                    max_row=min(1000, ws.max_row or 0),
	                    min_col=1,
	                    max_col=min(20, ws.max_column or 0),
	                    values_only=True,
	                )
	            ]
	        finally:
	            # Read-only workbooks keep the file open until closed explicitly.
	            wb.close()
	        if matrix:
	            hdr = [_norm(x).lower() for x in matrix[0]]

	            def ix(name: str, *, allow_alternative: bool) -> int:
	                # First header containing *name*. Old-item lookups must skip
	                # the "alternative ..." columns, whose headers contain the
	                # old-item names as substrings.
	                for i, h in enumerate(hdr):
	                    if name in h and (allow_alternative or "alternative" not in h):
	                        return i
	                return -1

	            def cell(r: Sequence[Any], i: int) -> str:
	                # Normalized cell text, or "" when the column is absent.
	                return _norm(r[i]) if 0 <= i < len(r) else ""

	            old_item_t = ix("item type", allow_alternative=False)
	            old_item = ix("item name", allow_alternative=False)
	            old_desc = ix("description", allow_alternative=False)
	            new_item_t = ix("alternative item type", allow_alternative=True)
	            new_item = ix("alternative item name", allow_alternative=True)
	            new_desc = ix("alternative description", allow_alternative=True)
	            for ridx, r in enumerate(matrix[1:], start=2):
	                old_name = cell(r, old_item)
	                new_name = cell(r, new_item)
	                if not old_name or not new_name:
	                    continue
	                old_d = cell(r, old_desc)
	                new_d = cell(r, new_desc)
	                old_model = (_model_candidates(f"{old_name} {old_d}") or [_compact(old_name)])[0]
	                new_model = (_model_candidates(f"{new_name} {new_d}") or [_compact(new_name)])[0]
	                # The SKU is the first token in the description that differs
	                # from the model key.
	                old_sku = next((m for m in _model_candidates(old_d) if m != old_model), "")
	                new_sku = next((m for m in _model_candidates(new_d) if m != new_model), "")
	                rows.append(
	                    {
	                        "source_file": src.name,
	                        "source_sheet": sheet_title,
	                        "source_row": ridx,
	                        "old_item_type": cell(r, old_item_t),
	                        "old_item_name": old_name,
	                        "old_description": old_d,
	                        "old_model_key": old_model,
	                        "old_sku": old_sku,
	                        "new_item_type": cell(r, new_item_t),
	                        "new_item_name": new_name,
	                        "new_description": new_d,
	                        "new_model_key": new_model,
	                        "new_sku": new_sku,
	                        "eos_year": "2024",
	                        "eol_year": "2025",
	                        "notes": "Policy default overlay from Peplink replacement list (EOS=2024, EOL=2025); retain stricter existing lifecycle dates where present.",
	                    }
	                )
	    with out.open("w", encoding="utf-8", newline="") as f:
	        w = csv.DictWriter(f, fieldnames=fields)
	        w.writeheader()
	        w.writerows(rows)
	    return out


	def main() -> None:
	    """Regenerate every normalized CSV plus the summary file, then print their paths."""
	    OUT_DIR.mkdir(parents=True, exist_ok=True)

	    # Only rows with both a model key and an MSRP text are written out.
	    catalog_rows = [
	        row
	        for row in _build_catalog_rows() + _verizon_gateway_web_rows()
	        if row.model_key and row.msrp
	    ]

	    catalog_path = _write_router_catalog(catalog_rows)
	    variants_path = _write_variant_options(catalog_rows)
	    parsec_path = _write_parsec_pricing()
	    peplink_overlay_path = _write_peplink_overlay()

	    source_dirs = _iter_source_dirs()
	    parsec_source = _find_parsec_source_xlsx()
	    peplink_overlay_source = _find_peplink_overlay_source_xlsx()
	    atel_source = _find_atel_pdf()

	    distinct_models = {row.model_key for row in catalog_rows}
	    verizon_models = {
	        row.model_key
	        for row in catalog_rows
	        if "verizon" in _norm(row.sources_label).lower()
	    }

	    summary = OUT_DIR / "pricing_normalization_summary.txt"
	    entries = [
	        ("source_dirs", ";".join(_display_path(d) for d in source_dirs)),
	        ("parsec_source", _display_path(parsec_source)),
	        ("peplink_overlay_source", _display_path(peplink_overlay_source)),
	        ("atel_source", _display_path(atel_source)),
	        ("catalog_rows", len(catalog_rows)),
	        ("catalog_models", len(distinct_models)),
	        ("verizon_gateway_models", len(verizon_models)),
	        ("catalog_path", _display_path(catalog_path)),
	        ("variants_path", _display_path(variants_path)),
	        ("parsec_path", _display_path(parsec_path)),
	        ("peplink_overlay_path", _display_path(peplink_overlay_path)),
	    ]
	    with summary.open("w", encoding="utf-8") as handle:
	        handle.write("".join(f"{label}={value}\n" for label, value in entries))

	    for written in (catalog_path, variants_path, parsec_path, peplink_overlay_path):
	        print(f"Wrote {written}")
	    print(f"Summary: {summary}")


	# Run the normalization only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()