Spaces:

Kesherat
/

blade-inspection-demo

Sleeping

File size: 34,116 Bytes

"""
GPTOSSWrapper - Simple integration wrapper for OpenAI or Hugging Face Inference API.

Usage:
    from gptoss_wrapper import GPTOSSWrapper
    w = GPTOSSWrapper(model="gpt-oss-120")
    text = w.generate(prompt)

Behavior:
- Provider selection (priority):
    1) If OPENAI_API_KEY is set -> use OpenAI Chat Completions (v1/chat/completions)
    2) Else if HUGGINGFACE_API_TOKEN or HF_API_TOKEN is set -> use Hugging Face Inference API
    3) Else -> generate() will raise a RuntimeError describing missing credentials.

Note for Spaces:
- Add the secret in your Space settings (Settings → Secrets & variables → Add secret):
    - For OpenAI: key name = OPENAI_API_KEY, value = <your_openai_api_key>
    - For Hugging Face: key name = HUGGINGFACE_API_TOKEN (or HF_API_TOKEN), value = <your_hf_token>

This file intentionally uses only the requests stdlib-friendly HTTP approach to avoid depending on extra SDKs.
"""
import os
import time
import requests
import base64
import torch
from PIL import Image
from typing import Optional


class GPTOSSWrapper:
    """
    Lightweight wrapper that can call either OpenAI or Hugging Face inference endpoints.

    Constructor:
        GPTOSSWrapper(model="gpt-oss-120", provider="auto")

    - model: model name to request (for OpenAI it must be an available model for your account;
             for Hugging Face it should be a model id hosted on HF).
    - provider: "auto" (default) | "openai" | "hf"
    """

    def __init__(self, model: str = "gpt-oss-120", provider: str = "auto"):
        # Allow overriding the model via env var MODEL_ID (useful in Spaces)
        env_model = os.getenv("MODEL_ID")
        if env_model:
            self.model = env_model
        else:
            self.model = model

        self.request_timeout = 30
        self.openai_key = os.getenv("OPENAI_API_KEY")
        # Accept multiple HF token environment variable names for compatibility:
        # HUGGINGFACE_API_TOKEN, HF_API_TOKEN, or HF_TOKEN (used by some HF examples)
        self.hf_token = (
            os.getenv("HUGGINGFACE_API_TOKEN")
            or os.getenv("HF_API_TOKEN")
            or os.getenv("HF_TOKEN")
        )
        self.provider = provider.lower() if provider else "auto"

        # If we have an HF token and the user didn't explicitly set a MODEL_ID,
        # prefer the HF router and use a sensible default router model id.
        if self.hf_token and not env_model and model == "gpt-oss-120":
            # Default router model id; you can override via MODEL_ID env var in the Space
            self.model = "openai/gpt-oss-120b:fireworks-ai"

        if self.provider == "auto":
            if self.openai_key:
                self.provider = "openai"
            elif self.hf_token:
                self.provider = "hf"
            else:
                self.provider = "none"

    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Generate a textual response for the given prompt.

        Returns:
            A string with the generated text.

        Raises:
            RuntimeError if no credentials are found or the remote call fails.
        """
        if self.provider == "openai":
            return self._generate_openai(prompt, max_tokens=max_tokens, temperature=temperature)
        elif self.provider == "hf":
            return self._generate_hf(prompt, max_tokens=max_tokens, temperature=temperature)
        else:
            raise RuntimeError(
                "No API key configured for GPT wrapper. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN in the environment."
            )

    def analyze_image(self, image_path: str, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Analyze an image using vision models (OpenAI GPT-4 Vision or Hugging Face Qwen2-VL).

        Args:
            image_path: Path to the image file
            prompt: Text prompt for analysis
            max_tokens: Maximum tokens in response
            temperature: Temperature for generation

        Returns:
            Analysis text from vision model

        Raises:
            RuntimeError if no vision model is available or if the call fails
        """
        if self.provider == "openai":
            return self._analyze_image_openai(image_path, prompt, max_tokens, temperature)
        elif self.provider == "hf":
            return self._analyze_image_hf(image_path, prompt, max_tokens, temperature)
        else:
            raise RuntimeError("Image analysis requires either OpenAI API key or Hugging Face token. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN.")

    def detect_objects_owlv2(self, image_path: str, text_queries: list, threshold: float = 0.1) -> dict:
        """
        Detect objects in image using OWL-V2 or Grounding DINO zero-shot detection with text queries.
        Runs on HF GPU when available.

        Args:
            image_path: Path to the image file
            text_queries: List of text descriptions to search for (e.g., ["crack", "erosion", "dirt"])
            threshold: Confidence threshold for detections

        Returns:
            Dictionary with detections: {"detections": [{"label": str, "confidence": float, "bbox": [x1,y1,x2,y2]}, ...]}

        Raises:
            RuntimeError if models not available or detection fails
        """
        print(f"Starting zero-shot detection with {len(text_queries)} queries")

        # Try Grounding DINO first (usually better for zero-shot), then OWL-V2 as fallback
        try:
            print("Attempting Grounding DINO detection...")
            return self._detect_grounding_dino(image_path, text_queries, threshold)
        except Exception as e:
            print(f"Grounding DINO failed: {e}")
            print("Falling back to OWL-V2...")
            try:
                return self._detect_owlv2_local(image_path, text_queries, threshold)
            except Exception as e2:
                print(f"OWL-V2 also failed: {e2}")
                # Return empty detections instead of failing completely
                print("Both models failed, returning empty detections")
                return {"detections": []}

    def _generate_openai(self, prompt: str, max_tokens: int, temperature: float) -> str:
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")

        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }

        # Build a simple chat conversation with a single system + user message
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": max_tokens,
            "temperature": float(temperature),
            "n": 1,
        }

        try:
            r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
            r.raise_for_status()
            data = r.json()
            # OpenAI API returns a list of choices
            choices = data.get("choices", [])
            if not choices:
                raise RuntimeError(f"OpenAI returned empty choices: {data}")
            # Extract the assistant message
            msg = choices[0].get("message", {}).get("content")
            if msg is None:
                # Some deployments return text in 'text' or in other fields; fallback to stringifying response
                return str(data)
            return msg.strip()
        except Exception as e:
            # Surface a clear error for the calling code to handle (the app catches exceptions)
            raise RuntimeError(f"OpenAI API call failed: {e}")

    def _generate_hf(self, prompt: str, max_tokens: int, temperature: float) -> str:
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN (or HF_API_TOKEN / HF_TOKEN) not set in environment.")

        # Prefer the HF router automatically when an HF token is present unless explicitly disabled.
        use_router = False
        # If HF token exists, default to using the router (unless HF_USE_ROUTER is set to a falsey value).
        if self.hf_token:
            hf_use_router_val = os.getenv("HF_USE_ROUTER", "").lower()
            if hf_use_router_val in ("0", "false", "no"):
                use_router = False
            else:
                use_router = True
        # Explicit enable via HF_USE_ROUTER env var
        if os.getenv("HF_USE_ROUTER", "").lower() in ("1", "true", "yes"):
            use_router = True
        # Also enable router if model id looks like an OpenAI-style id
        if "openai/" in (self.model or "") or ":" in (self.model or ""):
            use_router = True

        try:
            if use_router:
                # Router (OpenAI-compatible) endpoint: accepts chat/completions style payloads
                url = "https://router.huggingface.co/v1/chat/completions"
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}
                payload = {
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."},
                        {"role": "user", "content": prompt},
                    ],
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                    "n": 1,
                }
                r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
                r.raise_for_status()
                data = r.json()
                # Try to extract OpenAI-style response
                choices = data.get("choices", [])
                if choices and isinstance(choices, list):
                    first = choices[0]
                    # OpenAI-compatible router usually returns message under 'message'
                    msg = first.get("message", {}).get("content") if isinstance(first, dict) else None
                    # Some router variants may return text under 'text' or 'content'
                    if not msg:
                        msg = first.get("text") or first.get("content")
                    if msg:
                        return msg.strip()
                # Fallback stringify
                return str(data)

            else:
                # Standard Hugging Face inference API
                url = f"https://api-inference.huggingface.co/models/{self.model}"
                headers = {"Authorization": f"Bearer {self.hf_token}"}
                payload = {
                    "inputs": prompt,
                    "parameters": {
                        "max_new_tokens": max_tokens,
                        "temperature": float(temperature),
                    },
                    "options": {"wait_for_model": True},
                }
                r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
                r.raise_for_status()
                data = r.json()
                # Hugging Face inference may return a list of generated outputs or a dict
                if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict) and "generated_text" in data[0]:
                    return data[0]["generated_text"].strip()
                elif isinstance(data, dict) and "generated_text" in data:
                    return data["generated_text"].strip()
                elif isinstance(data, dict) and "error" in data:
                    raise RuntimeError(f"Hugging Face error: {data['error']}")
                else:
                    # Some text-generation endpoints return a plain string or different struct; try to stringify
                    return str(data)
        except Exception as e:
            raise RuntimeError(f"Hugging Face API call failed: {e}")

    def _analyze_image_openai(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using OpenAI GPT-4 Vision API.
        """
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")

        # Encode image to base64
        try:
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")

        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }

        # Use GPT-4 Vision model
        vision_model = "gpt-4-vision-preview"

        # Build payload for vision API
        payload = {
            "model": vision_model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish."
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": max_tokens,
            "temperature": float(temperature),
        }

        try:
            r = requests.post(url, headers=headers, json=payload, timeout=60)  # Longer timeout for vision
            r.raise_for_status()
            data = r.json()

            choices = data.get("choices", [])
            if not choices:
                raise RuntimeError(f"OpenAI Vision returned empty choices: {data}")

            msg = choices[0].get("message", {}).get("content")
            if msg is None:
                return str(data)
            return msg.strip()

        except Exception as e:
            raise RuntimeError(f"OpenAI Vision API call failed: {e}")

    def _analyze_image_hf(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using Hugging Face vision models (like Qwen2-VL).
        """
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN not set in environment.")

        # Encode image to base64
        try:
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")

        # Use Qwen2-VL model for vision analysis
        vision_model = os.getenv("VISION_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct")

        # Check if we should use the router
        use_router = False
        if self.hf_token:
            hf_use_router_val = os.getenv("HF_USE_ROUTER", "").lower()
            if hf_use_router_val not in ("0", "false", "no"):
                use_router = True

        try:
            if use_router:
                # Router endpoint for vision models
                url = "https://router.huggingface.co/v1/chat/completions"
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}

                payload = {
                    "model": vision_model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish."
                        },
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": prompt
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/jpeg;base64,{base64_image}"
                                    }
                                }
                            ]
                        }
                    ],
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                }

                r = requests.post(url, headers=headers, json=payload, timeout=120)
                r.raise_for_status()
                data = r.json()

                choices = data.get("choices", [])
                if choices and isinstance(choices, list):
                    first = choices[0]
                    msg = first.get("message", {}).get("content") if isinstance(first, dict) else None
                    if not msg:
                        msg = first.get("text") or first.get("content")
                    if msg:
                        return msg.strip()
                return str(data)

            else:
                # Direct Hugging Face Inference API for vision models
                url = f"https://api-inference.huggingface.co/models/{vision_model}"
                headers = {"Authorization": f"Bearer {self.hf_token}"}

                # For vision models, we need to send both text and image
                payload = {
                    "inputs": {
                        "text": prompt,
                        "image": base64_image
                    },
                    "parameters": {
                        "max_new_tokens": max_tokens,
                        "temperature": float(temperature),
                    },
                    "options": {"wait_for_model": True},
                }

                r = requests.post(url, headers=headers, json=payload, timeout=120)
                r.raise_for_status()
                data = r.json()

                # Handle different response formats
                if isinstance(data, list) and len(data) > 0:
                    if isinstance(data[0], dict):
                        if "generated_text" in data[0]:
                            return data[0]["generated_text"].strip()
                        elif "text" in data[0]:
                            return data[0]["text"].strip()
                elif isinstance(data, dict):
                    if "generated_text" in data:
                        return data["generated_text"].strip()
                    elif "text" in data:
                        return data["text"].strip()
                    elif "error" in data:
                        raise RuntimeError(f"Hugging Face error: {data['error']}")

                return str(data)

        except Exception as e:
            raise RuntimeError(f"Hugging Face Vision API call failed: {e}")

    def _detect_grounding_dino(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using Grounding DINO. Try HF API first, then local model.
        """
        # Try HF API first (more reliable)
        if self.hf_token:
            try:
                return self._detect_grounding_dino_api(image_path, text_queries, threshold)
            except Exception as e:
                print(f"Grounding DINO API failed: {e}")
                print("Falling back to local model...")

        # Fallback to local model
        try:
            from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

            # Load Grounding DINO model (will use HF GPU)
            model_id = "IDEA-Research/grounding-dino-base"
            device = "cuda" if torch.cuda.is_available() else "cpu"

            print(f"Loading Grounding DINO on device: {device}")
            processor = AutoProcessor.from_pretrained(model_id)
            model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

            # Load image
            image = Image.open(image_path)

            # Prepare text queries (VERY important: lowercase + end with dot)
            text = ". ".join([query.lower() for query in text_queries]) + "."
            print(f"Grounding DINO text query: {text}")

            # Process inputs
            inputs = processor(images=image, text=text, return_tensors="pt").to(device)

            # Run inference
            with torch.no_grad():
                outputs = model(**inputs)

            # Post-process results (detectar sintaxis automáticamente)
            try:
                # Intentar sintaxis nueva (transformers >= 4.44)
                results = processor.post_process_grounded_object_detection(
                    outputs,
                    inputs.input_ids,
                    box_threshold=threshold,
                    text_threshold=0.3,
                    target_sizes=[image.size[::-1]]
                )
            except TypeError as e:
                if "box_threshold" in str(e):
                    # Fallback a sintaxis antigua (transformers < 4.44)
                    print("Using legacy post_process_grounded_object_detection syntax")
                    results = processor.post_process_grounded_object_detection(
                        outputs,
                        inputs.input_ids,
                        threshold=threshold,
                        target_sizes=[image.size[::-1]]
                    )
                else:
                    raise e

            # Convert to our format
            detections = []
            if results and len(results) > 0:
                result = results[0]
                boxes = result.get("boxes", [])
                scores = result.get("scores", [])
                labels = result.get("labels", [])

                print(f"Grounding DINO found {len(boxes)} detections")

                for i, (box, score, label_info) in enumerate(zip(boxes, scores, labels)):
                    try:
                        # Convert score to float safely
                        score_val = float(score.item() if hasattr(score, 'item') else score)

                        if score_val >= threshold:
                            # Convert box coordinates safely
                            if hasattr(box, 'tolist'):
                                x1, y1, x2, y2 = box.tolist()
                            else:
                                x1, y1, x2, y2 = box

                            # Handle label safely
                            if isinstance(label_info, (int, float)):
                                label_idx = int(label_info)
                                label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown"
                            else:
                                label = str(label_info)

                            detections.append({
                                "label": label,
                                "confidence": score_val,
                                "bbox": [int(x1), int(y1), int(x2), int(y2)]
                            })
                    except Exception as e:
                        print(f"Error processing detection {i}: {e}")
                        continue

            return {"detections": detections}

        except Exception as e:
            raise RuntimeError(f"Grounding DINO detection failed: {e}")

    def _detect_grounding_dino_api(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using Grounding DINO via HF Inference API.
        """
        if not self.hf_token:
            raise RuntimeError("HF token required for Grounding DINO API")

        try:
            import base64

            # Encode image to base64
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode('utf-8')

            # Prepare text queries (VERY important: lowercase + end with dot)
            text = ". ".join([query.lower() for query in text_queries]) + "."
            print(f"Grounding DINO API text query: {text}")

            # Use Grounding DINO model via API
            model_id = "IDEA-Research/grounding-dino-base"
            url = f"https://api-inference.huggingface.co/models/{model_id}"
            headers = {"Authorization": f"Bearer {self.hf_token}"}

            # Prepare payload for Grounding DINO API
            payload = {
                "inputs": {
                    "image": base64_image,
                    "text": text
                },
                "parameters": {
                    "threshold": threshold
                }
            }

            response = requests.post(url, headers=headers, json=payload, timeout=30)

            if response.status_code == 200:
                data = response.json()

                # Convert API response to our format
                detections = []
                if isinstance(data, list):
                    for detection in data:
                        if detection.get("score", 0) >= threshold:
                            box = detection.get("box", {})
                            detections.append({
                                "label": detection.get("label", "unknown"),
                                "confidence": float(detection.get("score", 0)),
                                "bbox": [
                                    int(box.get("xmin", 0)),
                                    int(box.get("ymin", 0)),
                                    int(box.get("xmax", 0)),
                                    int(box.get("ymax", 0))
                                ]
                            })

                print(f"Grounding DINO API found {len(detections)} detections")
                return {"detections": detections}
            else:
                raise RuntimeError(f"API call failed with status {response.status_code}: {response.text}")

        except Exception as e:
            raise RuntimeError(f"Grounding DINO API detection failed: {e}")

    def _detect_owlv2_local(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using OWL-V2 running on HF GPU.
        """
        try:
            from transformers import Owlv2Processor, Owlv2ForObjectDetection

            # Load OWL-V2 model (will use HF GPU)
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Loading OWL-V2 on device: {device}")

            processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-ensemble")
            model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble").to(device)

            # Load image
            image = Image.open(image_path)

            # Prepare text queries (format: [["query1", "query2", ...]])
            texts = [text_queries]
            print(f"OWL-V2 text queries: {texts}")

            # Process inputs
            inputs = processor(text=texts, images=image, return_tensors="pt").to(device)

            # Run inference
            with torch.no_grad():
                outputs = model(**inputs)

            # Target image sizes for rescaling
            target_sizes = torch.Tensor([image.size[::-1]])

            # Post-process results
            results = processor.post_process_object_detection(
                outputs=outputs,
                target_sizes=target_sizes,
                threshold=threshold
            )

            # Convert to our format
            detections = []
            if results and len(results) > 0:
                result = results[0]
                boxes = result.get("boxes", [])
                scores = result.get("scores", [])
                labels = result.get("labels", [])

                print(f"OWL-V2 found {len(boxes)} detections")

                for box, score, label_idx in zip(boxes, scores, labels):
                    if score >= threshold:
                        x1, y1, x2, y2 = box.tolist()
                        label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown"

                        detections.append({
                            "label": label,
                            "confidence": float(score),
                            "bbox": [int(x1), int(y1), int(x2), int(y2)]
                        })

            return {"detections": detections}

        except Exception as e:
            raise RuntimeError(f"OWL-V2 detection failed: {e}")



    def _detect_owlv2_local(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using OWL-V2 locally.
        """
        try:
            from transformers import Owlv2Processor, Owlv2ForObjectDetection

            # Load OWL-V2 model
            processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-ensemble")
            model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble")

            # Load image
            image = Image.open(image_path)

            # Prepare text queries (format: [["query1", "query2", ...]])
            texts = [text_queries]

            # Process inputs
            inputs = processor(text=texts, images=image, return_tensors="pt")

            # Run inference
            with torch.no_grad():
                outputs = model(**inputs)

            # Target image sizes for rescaling
            target_sizes = torch.Tensor([image.size[::-1]])

            # Post-process results
            results = processor.post_process_object_detection(
                outputs=outputs,
                target_sizes=target_sizes,
                threshold=threshold
            )

            # Convert to our format
            detections = []
            if results and len(results) > 0:
                result = results[0]
                boxes = result.get("boxes", [])
                scores = result.get("scores", [])
                labels = result.get("labels", [])

                for box, score, label_idx in zip(boxes, scores, labels):
                    if score >= threshold:
                        x1, y1, x2, y2 = box.tolist()
                        label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown"

                        detections.append({
                            "label": label,
                            "confidence": float(score),
                            "bbox": [int(x1), int(y1), int(x2), int(y2)]
                        })

            return {"detections": detections}

        except Exception as e:
            raise RuntimeError(f"OWL-V2 detection failed: {e}")

    def _detect_owlv2_hf(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using OWL-V2 via Hugging Face Inference API.
        """
        try:
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")

        # DETR model endpoint (object detection)
        detr_model = os.getenv("DETR_MODEL_ID", "facebook/detr-resnet-101")
        url = f"https://api-inference.huggingface.co/models/{detr_model}"
        headers = {"Authorization": f"Bearer {self.hf_token}"}

        # Prepare payload for DETR
        # OWL-V2 expects image as binary data and text queries as parameters
        payload = {
            "parameters": {
                "candidate_labels": text_queries,
                "threshold": threshold
            },
            "options": {"wait_for_model": True}
        }

        try:
            # Send image as binary data with parameters
            files = {"inputs": image_data}
            data = {"parameters": str(payload["parameters"]).replace("'", '"')}

            r = requests.post(url, headers=headers, files=files, data=data, timeout=120)
            r.raise_for_status()
            response_data = r.json()

            # Parse OWL-V2 response format
            detections = []
            if isinstance(response_data, list):
                for detection in response_data:
                    if isinstance(detection, dict):
                        # Extract detection info
                        label = detection.get("label", "unknown")
                        confidence = detection.get("score", 0.0)
                        bbox = detection.get("box", {})

                        # Convert bbox format if needed
                        if bbox:
                            x1 = bbox.get("xmin", 0)
                            y1 = bbox.get("ymin", 0)
                            x2 = bbox.get("xmax", 0)
                            y2 = bbox.get("ymax", 0)

                            detections.append({
                                "label": label,
                                "confidence": confidence,
                                "bbox": [x1, y1, x2, y2]
                            })

            return {"detections": detections}

        except Exception as e:
            raise RuntimeError(f"OWL-V2 detection failed: {e}")


# Backwards-compatible factory in case caller expects a function or attribute
def GPTOSSWrapperFactory(model: Optional[str] = None, provider: Optional[str] = None):
    return GPTOSSWrapper(model=model or "gpt-oss-120", provider=provider or "auto")