"""Standalone single-image inference for CLIPSeg.""" import argparse from pathlib import Path import numpy as np import torch import yaml from PIL import Image from src.model.clipseg_wrapper import load_model_and_processor from src.train import get_device PROJECT_ROOT = Path(__file__).resolve().parents[1] def predict(image_path: str, prompt: str, config_path: str | None = None, output_path: str | None = None): config_path = config_path or str(PROJECT_ROOT / "configs" / "train_config.yaml") with open(config_path) as f: config = yaml.safe_load(f) device = get_device() model, processor = load_model_and_processor(config["model"]["name"], config["model"]["freeze_backbone"]) ckpt = PROJECT_ROOT / "outputs" / "checkpoints" / "best_model.pt" model.load_state_dict(torch.load(ckpt, map_location="cpu", weights_only=True)) model = model.to(device).eval() image = Image.open(image_path).convert("RGB") orig_w, orig_h = image.size inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True) inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): logits = model(**inputs).logits pred = (torch.sigmoid(logits[0]) > config["evaluation"]["threshold"]).cpu().numpy().astype(np.uint8) mask = Image.fromarray(pred * 255, mode="L").resize((orig_w, orig_h), Image.NEAREST) if output_path is None: stem = Path(image_path).stem slug = prompt.replace(" ", "_") output_path = str(PROJECT_ROOT / "outputs" / "masks" / f"{stem}__{slug}.png") Path(output_path).parent.mkdir(parents=True, exist_ok=True) mask.save(output_path) print(f"Saved mask to {output_path}") return mask if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("image", help="Path to input image") parser.add_argument("prompt", help="Text prompt, e.g. 'segment crack'") parser.add_argument("--output", help="Output mask path") args = parser.parse_args() predict(args.image, args.prompt, output_path=args.output)