File size: 2,733 Bytes
5a3b322 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
from __future__ import annotations
import argparse
import csv
import json
import sys
import urllib.request
from pathlib import Path
from typing import List
def read_labels(path: str) -> List[str]:
    """Load label URLs from a ``.json``, ``.txt``, or ``.csv`` file.

    A ``.json`` file may be either a plain list of labels or an object with
    an ``"unmatched_labels"`` key. ``.txt``/``.csv`` files are read one
    label per line, with blank lines skipped.

    Args:
        path: Path to the labels file.

    Returns:
        The list of label strings.

    Raises:
        ValueError: If the file extension is unsupported, or a JSON file
            does not have one of the two recognized structures.
    """
    p = Path(path)
    if p.suffix == ".json":
        data = json.loads(p.read_text())
        if isinstance(data, dict) and "unmatched_labels" in data:
            return list(data["unmatched_labels"])
        if isinstance(data, list):
            return data
        # Fix: previously a structurally-unrecognized JSON file fell through
        # to the generic "Unsupported input format" error below, misreporting
        # a content problem as a file-extension problem.
        raise ValueError(
            f"JSON labels file must be a list or contain 'unmatched_labels': {path}"
        )
    if p.suffix in {".txt", ".csv"}:
        with p.open() as f:
            return [line.strip() for line in f if line.strip()]
    raise ValueError(f"Unsupported input format for labels: {path}")
def classify_html(status: int, html: str) -> str:
    """Classify a fetched page from its HTTP status code and HTML body.

    Returns one of: "NOT_FOUND", "ACCESS_BLOCKED", "DETAIL_PAGE_VALID",
    or "NOT_CATALOG_ITEM".
    """
    # Status codes take priority over body heuristics.
    if status == 404:
        return "NOT_FOUND"
    if status in (401, 403):
        return "ACCESS_BLOCKED"

    body = html.lower()

    # Soft-404 heuristics: some sites return 200 with an error page body.
    not_found_markers = ("page not found", "error occurred", "404")
    for marker in not_found_markers:
        if marker in body:
            return "NOT_FOUND"

    # Signals that this is a genuine catalog detail page.
    detail_markers = ("test type", "assessment length", "description", "catalogue__circle")
    for marker in detail_markers:
        if marker in body:
            return "DETAIL_PAGE_VALID"

    return "NOT_CATALOG_ITEM"
def probe_url(url: str, timeout: int = 10) -> dict:
    """Fetch *url* and return a result row.

    The row has keys: url, final_url, status, classification, error.
    Any fetch failure is reported as classification "ERROR" with the
    exception text in "error" rather than raised, so a batch run can
    continue past individual bad URLs.
    """
    # A browser-like User-Agent avoids trivial bot blocking.
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="ignore")
            code = response.getcode()
            landed = response.geturl()  # final URL after any redirects
    except Exception as exc:  # pragma: no cover - network variability
        return {
            "url": url,
            "final_url": None,
            "status": None,
            "classification": "ERROR",
            "error": str(exc),
        }
    return {
        "url": url,
        "final_url": landed,
        "status": code,
        "classification": classify_html(code, body),
        "error": None,
    }
def main():
    """CLI entry point: read label URLs, probe each one, write a CSV report."""
    parser = argparse.ArgumentParser(description="Probe unmatched label URLs and classify them.")
    parser.add_argument("--labels", required=True, help="Path to labels input (json with unmatched_labels, txt, or csv)")
    parser.add_argument("--output", required=True, help="CSV output path")
    args = parser.parse_args()

    results = [probe_url(label) for label in read_labels(args.labels)]

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    columns = ["url", "final_url", "status", "classification", "error"]
    with out_path.open("w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(results)
    print(f"Wrote probe results for {len(results)} labels to {args.output}")


if __name__ == "__main__":
    sys.exit(main())
|