| """Standalone single-image inference for CLIPSeg.""" |
|
|
| import argparse |
| from pathlib import Path |
|
|
| import numpy as np |
| import torch |
| import yaml |
| from PIL import Image |
|
|
| from src.model.clipseg_wrapper import load_model_and_processor |
| from src.train import get_device |
|
|
# Directory two levels above this file (the parent of the directory that
# contains it); the default config, checkpoint and mask paths are built from it.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
|
|
|
def predict(
    image_path: str,
    prompt: str,
    config_path: str | None = None,
    output_path: str | None = None,
    checkpoint_path: str | None = None,
):
    """Segment a single image for a text prompt and save the binary mask.

    Args:
        image_path: Path of the image to segment.
        prompt: Text prompt handed to the processor together with the image.
        config_path: YAML config to read. Defaults to
            ``configs/train_config.yaml`` under ``PROJECT_ROOT``.
        output_path: Destination of the mask PNG. Defaults to
            ``outputs/masks/<image stem>__<prompt slug>.png`` under
            ``PROJECT_ROOT``.
        checkpoint_path: State-dict file to load into the model. Defaults to
            ``outputs/checkpoints/best_model.pt`` under ``PROJECT_ROOT``.

    Returns:
        The mask as a PIL ``L`` image (pixel values 0 or 255) resized to the
        input image's width and height.
    """
    config_path = config_path or str(PROJECT_ROOT / "configs" / "train_config.yaml")
    # Explicit encoding so the config parses the same regardless of locale.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    device = get_device()
    model, processor = load_model_and_processor(
        config["model"]["name"], config["model"]["freeze_backbone"]
    )
    if checkpoint_path:
        ckpt = Path(checkpoint_path)
    else:
        ckpt = PROJECT_ROOT / "outputs" / "checkpoints" / "best_model.pt"
    # Deserialize the weights on CPU, then move the whole model to the device.
    model.load_state_dict(torch.load(ckpt, map_location="cpu", weights_only=True))
    model = model.to(device).eval()

    # Context manager releases the file handle once the pixels are converted.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    orig_w, orig_h = image.size

    inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # If the model hands back a bare 2-D map (no batch axis) for a single
    # image, indexing [0] would select one row instead of the whole map.
    # NOTE(review): confirm which shape the wrapped model actually returns.
    logit_map = logits if logits.dim() == 2 else logits[0]

    threshold = config["evaluation"]["threshold"]
    pred = (torch.sigmoid(logit_map) > threshold).cpu().numpy().astype(np.uint8)
    # Nearest-neighbour keeps the upscaled mask strictly binary (0 / 255).
    mask = Image.fromarray(pred * 255, mode="L").resize((orig_w, orig_h), Image.NEAREST)

    if output_path is None:
        stem = Path(image_path).stem
        # Keep only filename-safe characters: a prompt containing "/" or
        # similar must not change the output directory or break the path.
        slug = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in prompt)
        output_path = str(PROJECT_ROOT / "outputs" / "masks" / f"{stem}__{slug}.png")

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    mask.save(output_path)
    print(f"Saved mask to {output_path}")
    return mask
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: segment one image for one prompt.
    parser = argparse.ArgumentParser(
        description="Run CLIPSeg on a single image and save the predicted mask."
    )
    parser.add_argument("image", help="Path to input image")
    parser.add_argument("prompt", help="Text prompt, e.g. 'segment crack'")
    parser.add_argument("--output", help="Output mask path")
    # predict() already accepts a config path; expose it so a non-default
    # config can be chosen without editing the script. Omitted -> None ->
    # predict() falls back to its built-in default.
    parser.add_argument("--config", help="Path to YAML config (default: project train config)")
    args = parser.parse_args()
    predict(args.image, args.prompt, config_path=args.config, output_path=args.output)
|
|