| """ | |
| GPTOSSWrapper - Simple integration wrapper for OpenAI or Hugging Face Inference API. | |
| Usage: | |
| from gptoss_wrapper import GPTOSSWrapper | |
| w = GPTOSSWrapper(model="gpt-oss-120") | |
| text = w.generate(prompt) | |
| Behavior: | |
| - Provider selection (priority): | |
| 1) If OPENAI_API_KEY is set -> use OpenAI Chat Completions (v1/chat/completions) | |
| 2) Else if HUGGINGFACE_API_TOKEN or HF_API_TOKEN is set -> use Hugging Face Inference API | |
| 3) Else -> generate() will raise a RuntimeError describing missing credentials. | |
| Note for Spaces: | |
| - Add the secret in your Space settings (Settings → Secrets & variables → Add secret): | |
| - For OpenAI: key name = OPENAI_API_KEY, value = <your_openai_api_key> | |
| - For Hugging Face: key name = HUGGINGFACE_API_TOKEN (or HF_API_TOKEN), value = <your_hf_token> | |
| This file intentionally uses only the requests stdlib-friendly HTTP approach to avoid depending on extra SDKs. | |
| """ | |
import os
import json
import base64

import requests
import torch
from PIL import Image
from typing import Optional
class GPTOSSWrapper:
    """
    Lightweight wrapper that can call either OpenAI or Hugging Face inference endpoints.

    Constructor:
        GPTOSSWrapper(model="gpt-oss-120", provider="auto")
        - model: model name to request (for OpenAI it must be a model available to your account;
          for Hugging Face it should be a model id hosted on HF).
        - provider: "auto" (default) | "openai" | "hf"
    """
    def __init__(self, model: str = "gpt-oss-120", provider: str = "auto"):
        # Allow overriding the model via the MODEL_ID env var (useful in Spaces)
        env_model = os.getenv("MODEL_ID")
        self.model = env_model if env_model else model
        self.request_timeout = 30
        self.openai_key = os.getenv("OPENAI_API_KEY")
        # Accept multiple HF token environment variable names for compatibility:
        # HUGGINGFACE_API_TOKEN, HF_API_TOKEN, or HF_TOKEN (used by some HF examples)
        self.hf_token = (
            os.getenv("HUGGINGFACE_API_TOKEN")
            or os.getenv("HF_API_TOKEN")
            or os.getenv("HF_TOKEN")
        )
        self.provider = provider.lower() if provider else "auto"
        # If we have an HF token and the user didn't explicitly set a MODEL_ID,
        # prefer the HF router and use a sensible default router model id.
        if self.hf_token and not env_model and model == "gpt-oss-120":
            # Default router model id; override via the MODEL_ID env var in the Space
            self.model = "openai/gpt-oss-120b:fireworks-ai"
        if self.provider == "auto":
            if self.openai_key:
                self.provider = "openai"
            elif self.hf_token:
                self.provider = "hf"
            else:
                self.provider = "none"
    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Generate a textual response for the given prompt.

        Returns:
            A string with the generated text.

        Raises:
            RuntimeError if no credentials are found or the remote call fails.
        """
        if self.provider == "openai":
            return self._generate_openai(prompt, max_tokens=max_tokens, temperature=temperature)
        elif self.provider == "hf":
            return self._generate_hf(prompt, max_tokens=max_tokens, temperature=temperature)
        else:
            raise RuntimeError(
                "No API key configured for GPT wrapper. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN in the environment."
            )
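    # Example call (a sketch; the prompt is a placeholder, and the surrounding app is
    # expected to catch the RuntimeError raised when no credentials are configured):
    #
    #     w = GPTOSSWrapper()
    #     try:
    #         answer = w.generate("Summarize the blade inspection findings.", max_tokens=256)
    #     except RuntimeError as err:
    #         answer = f"LLM unavailable: {err}"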
    def analyze_image(self, image_path: str, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Analyze an image using vision models (OpenAI GPT-4 Vision or Hugging Face Qwen2-VL).

        Args:
            image_path: Path to the image file
            prompt: Text prompt for analysis
            max_tokens: Maximum tokens in response
            temperature: Temperature for generation

        Returns:
            Analysis text from the vision model

        Raises:
            RuntimeError if no vision model is available or if the call fails
        """
        if self.provider == "openai":
            return self._analyze_image_openai(image_path, prompt, max_tokens, temperature)
        elif self.provider == "hf":
            return self._analyze_image_hf(image_path, prompt, max_tokens, temperature)
        else:
            raise RuntimeError(
                "Image analysis requires either an OpenAI API key or a Hugging Face token. "
                "Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN."
            )
    def detect_objects_owlv2(self, image_path: str, text_queries: list, threshold: float = 0.1) -> dict:
        """
        Detect objects in an image using OWL-V2 or Grounding DINO zero-shot detection with text queries.
        Runs on the HF GPU when available.

        Args:
            image_path: Path to the image file
            text_queries: List of text descriptions to search for (e.g., ["crack", "erosion", "dirt"])
            threshold: Confidence threshold for detections

        Returns:
            Dictionary with detections: {"detections": [{"label": str, "confidence": float, "bbox": [x1, y1, x2, y2]}, ...]}

        Raises:
            RuntimeError if the models are unavailable or detection fails
        """
        print(f"Starting zero-shot detection with {len(text_queries)} queries")
        # Try Grounding DINO first (usually better for zero-shot), then OWL-V2 as a fallback
        try:
            print("Attempting Grounding DINO detection...")
            return self._detect_grounding_dino(image_path, text_queries, threshold)
        except Exception as e:
            print(f"Grounding DINO failed: {e}")
            print("Falling back to OWL-V2...")
            try:
                return self._detect_owlv2_local(image_path, text_queries, threshold)
            except Exception as e2:
                print(f"OWL-V2 also failed: {e2}")
                # Return empty detections instead of failing completely
                print("Both models failed, returning empty detections")
                return {"detections": []}
    def _generate_openai(self, prompt: str, max_tokens: int, temperature: float) -> str:
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }
        # Build a simple chat conversation with a single system + user message
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": max_tokens,
            "temperature": float(temperature),
            "n": 1,
        }
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
            r.raise_for_status()
            data = r.json()
            # The OpenAI API returns a list of choices
            choices = data.get("choices", [])
            if not choices:
                raise RuntimeError(f"OpenAI returned empty choices: {data}")
            # Extract the assistant message
            msg = choices[0].get("message", {}).get("content")
            if msg is None:
                # Some deployments return text in 'text' or other fields; fall back to stringifying the response
                return str(data)
            return msg.strip()
        except Exception as e:
            # Surface a clear error for the calling code to handle (the app catches exceptions)
            raise RuntimeError(f"OpenAI API call failed: {e}")
    def _generate_hf(self, prompt: str, max_tokens: int, temperature: float) -> str:
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN (or HF_API_TOKEN / HF_TOKEN) not set in environment.")
        # Prefer the HF router (OpenAI-compatible) by default; HF_USE_ROUTER=0/false/no disables it.
        use_router = os.getenv("HF_USE_ROUTER", "").lower() not in ("0", "false", "no")
        # Always use the router if the model id looks like a router-style id (e.g. "openai/...:provider")
        if "openai/" in (self.model or "") or ":" in (self.model or ""):
            use_router = True
        try:
            if use_router:
                # Router (OpenAI-compatible) endpoint: accepts chat/completions-style payloads
                url = "https://router.huggingface.co/v1/chat/completions"
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}
                payload = {
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."},
                        {"role": "user", "content": prompt},
                    ],
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                    "n": 1,
                }
                r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
                r.raise_for_status()
                data = r.json()
                # Try to extract an OpenAI-style response
                choices = data.get("choices", [])
                if choices and isinstance(choices, list):
                    first = choices[0]
                    # The OpenAI-compatible router usually returns the message under 'message'
                    msg = first.get("message", {}).get("content") if isinstance(first, dict) else None
                    # Some router variants may return text under 'text' or 'content'
                    if not msg and isinstance(first, dict):
                        msg = first.get("text") or first.get("content")
                    if msg:
                        return msg.strip()
                # Fallback: stringify
                return str(data)
            else:
                # Standard Hugging Face Inference API
                url = f"https://api-inference.huggingface.co/models/{self.model}"
                headers = {"Authorization": f"Bearer {self.hf_token}"}
                payload = {
                    "inputs": prompt,
                    "parameters": {
                        "max_new_tokens": max_tokens,
                        "temperature": float(temperature),
                    },
                    "options": {"wait_for_model": True},
                }
                r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
                r.raise_for_status()
                data = r.json()
                # The Hugging Face Inference API may return a list of generated outputs or a dict
                if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict) and "generated_text" in data[0]:
                    return data[0]["generated_text"].strip()
                elif isinstance(data, dict) and "generated_text" in data:
                    return data["generated_text"].strip()
                elif isinstance(data, dict) and "error" in data:
                    raise RuntimeError(f"Hugging Face error: {data['error']}")
                else:
                    # Some text-generation endpoints return a plain string or a different structure; stringify
                    return str(data)
        except Exception as e:
            raise RuntimeError(f"Hugging Face API call failed: {e}")
    def _analyze_image_openai(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using the OpenAI GPT-4 Vision API.
        """
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")
        # Encode the image to base64
        try:
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }
        # Use a GPT-4 Vision model. NOTE: "gpt-4-vision-preview" has since been deprecated by
        # OpenAI; newer multimodal models (e.g. gpt-4o) accept the same image_url content format.
        vision_model = "gpt-4-vision-preview"
        # Build the payload for the vision API
        payload = {
            "model": vision_model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish.",
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high",
                            },
                        },
                    ],
                },
            ],
            "max_tokens": max_tokens,
            "temperature": float(temperature),
        }
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=60)  # Longer timeout for vision
            r.raise_for_status()
            data = r.json()
            choices = data.get("choices", [])
            if not choices:
                raise RuntimeError(f"OpenAI Vision returned empty choices: {data}")
            msg = choices[0].get("message", {}).get("content")
            if msg is None:
                return str(data)
            return msg.strip()
        except Exception as e:
            raise RuntimeError(f"OpenAI Vision API call failed: {e}")
    def _analyze_image_hf(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using Hugging Face vision models (like Qwen2-VL).
        """
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN not set in environment.")
        # Encode the image to base64
        try:
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")
        # Use a Qwen2-VL model for vision analysis
        vision_model = os.getenv("VISION_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct")
        # Decide whether to use the router (same default-on logic as _generate_hf)
        use_router = os.getenv("HF_USE_ROUTER", "").lower() not in ("0", "false", "no")
        try:
            if use_router:
                # Router endpoint for vision models
                url = "https://router.huggingface.co/v1/chat/completions"
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}
                payload = {
                    "model": vision_model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish.",
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        },
                    ],
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                }
                r = requests.post(url, headers=headers, json=payload, timeout=120)
                r.raise_for_status()
                data = r.json()
                choices = data.get("choices", [])
                if choices and isinstance(choices, list):
                    first = choices[0]
                    msg = first.get("message", {}).get("content") if isinstance(first, dict) else None
                    if not msg and isinstance(first, dict):
                        msg = first.get("text") or first.get("content")
                    if msg:
                        return msg.strip()
                return str(data)
            else:
                # Direct Hugging Face Inference API for vision models
                url = f"https://api-inference.huggingface.co/models/{vision_model}"
                headers = {"Authorization": f"Bearer {self.hf_token}"}
                # For vision models we send both the text and the image
                payload = {
                    "inputs": {
                        "text": prompt,
                        "image": base64_image,
                    },
                    "parameters": {
                        "max_new_tokens": max_tokens,
                        "temperature": float(temperature),
                    },
                    "options": {"wait_for_model": True},
                }
                r = requests.post(url, headers=headers, json=payload, timeout=120)
                r.raise_for_status()
                data = r.json()
                # Handle different response formats
                if isinstance(data, list) and len(data) > 0:
                    if isinstance(data[0], dict):
                        if "generated_text" in data[0]:
                            return data[0]["generated_text"].strip()
                        elif "text" in data[0]:
                            return data[0]["text"].strip()
                elif isinstance(data, dict):
                    if "generated_text" in data:
                        return data["generated_text"].strip()
                    elif "text" in data:
                        return data["text"].strip()
                    elif "error" in data:
                        raise RuntimeError(f"Hugging Face error: {data['error']}")
                return str(data)
        except Exception as e:
            raise RuntimeError(f"Hugging Face Vision API call failed: {e}")
    def _detect_grounding_dino(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using Grounding DINO. Try the HF API first, then a local model.
        """
        # Try the HF API first (more reliable)
        if self.hf_token:
            try:
                return self._detect_grounding_dino_api(image_path, text_queries, threshold)
            except Exception as e:
                print(f"Grounding DINO API failed: {e}")
                print("Falling back to local model...")
        # Fall back to the local model
        try:
            from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

            # Load the Grounding DINO model (will use the HF GPU when available)
            model_id = "IDEA-Research/grounding-dino-base"
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Loading Grounding DINO on device: {device}")
            processor = AutoProcessor.from_pretrained(model_id)
            model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
            # Load the image
            image = Image.open(image_path)
            # Prepare the text queries (VERY important: lowercase + terminated with a period)
            text = ". ".join([query.lower() for query in text_queries]) + "."
            print(f"Grounding DINO text query: {text}")
            # Process inputs
            inputs = processor(images=image, text=text, return_tensors="pt").to(device)
            # Run inference
            with torch.no_grad():
                outputs = model(**inputs)
            # Post-process results (detect which keyword syntax this transformers version uses)
            try:
                # Try the newer syntax (transformers >= 4.44)
                results = processor.post_process_grounded_object_detection(
                    outputs,
                    inputs.input_ids,
                    box_threshold=threshold,
                    text_threshold=0.3,
                    target_sizes=[image.size[::-1]],
                )
            except TypeError as e:
                if "box_threshold" in str(e):
                    # Fall back to the older syntax (transformers < 4.44)
                    print("Using legacy post_process_grounded_object_detection syntax")
                    results = processor.post_process_grounded_object_detection(
                        outputs,
                        inputs.input_ids,
                        threshold=threshold,
                        target_sizes=[image.size[::-1]],
                    )
                else:
                    raise
            # Convert to our format
            detections = []
            if results and len(results) > 0:
                result = results[0]
                boxes = result.get("boxes", [])
                scores = result.get("scores", [])
                labels = result.get("labels", [])
                print(f"Grounding DINO found {len(boxes)} detections")
                for i, (box, score, label_info) in enumerate(zip(boxes, scores, labels)):
                    try:
                        # Convert the score to a float safely
                        score_val = float(score.item() if hasattr(score, "item") else score)
                        if score_val >= threshold:
                            # Convert the box coordinates safely
                            if hasattr(box, "tolist"):
                                x1, y1, x2, y2 = box.tolist()
                            else:
                                x1, y1, x2, y2 = box
                            # Handle the label safely (it may be an index or a string)
                            if isinstance(label_info, (int, float)):
                                label_idx = int(label_info)
                                label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown"
                            else:
                                label = str(label_info)
                            detections.append({
                                "label": label,
                                "confidence": score_val,
                                "bbox": [int(x1), int(y1), int(x2), int(y2)],
                            })
                    except Exception as e:
                        print(f"Error processing detection {i}: {e}")
                        continue
            return {"detections": detections}
        except Exception as e:
            raise RuntimeError(f"Grounding DINO detection failed: {e}")
    def _detect_grounding_dino_api(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using Grounding DINO via the HF Inference API.
        """
        if not self.hf_token:
            raise RuntimeError("HF token required for the Grounding DINO API")
        try:
            # Encode the image to base64
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
            # Prepare the text queries (VERY important: lowercase + terminated with a period)
            text = ". ".join([query.lower() for query in text_queries]) + "."
            print(f"Grounding DINO API text query: {text}")
            # Use the Grounding DINO model via the API
            model_id = "IDEA-Research/grounding-dino-base"
            url = f"https://api-inference.huggingface.co/models/{model_id}"
            headers = {"Authorization": f"Bearer {self.hf_token}"}
            # Prepare the payload for the Grounding DINO API
            payload = {
                "inputs": {
                    "image": base64_image,
                    "text": text,
                },
                "parameters": {
                    "threshold": threshold,
                },
            }
            response = requests.post(url, headers=headers, json=payload, timeout=30)
            if response.status_code == 200:
                data = response.json()
                # Convert the API response to our format
                detections = []
                if isinstance(data, list):
                    for detection in data:
                        if detection.get("score", 0) >= threshold:
                            box = detection.get("box", {})
                            detections.append({
                                "label": detection.get("label", "unknown"),
                                "confidence": float(detection.get("score", 0)),
                                "bbox": [
                                    int(box.get("xmin", 0)),
                                    int(box.get("ymin", 0)),
                                    int(box.get("xmax", 0)),
                                    int(box.get("ymax", 0)),
                                ],
                            })
                print(f"Grounding DINO API found {len(detections)} detections")
                return {"detections": detections}
            else:
                raise RuntimeError(f"API call failed with status {response.status_code}: {response.text}")
        except Exception as e:
            raise RuntimeError(f"Grounding DINO API detection failed: {e}")
    def _detect_owlv2_local(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using OWL-V2 running locally (on the HF GPU when available).
        """
        try:
            from transformers import Owlv2Processor, Owlv2ForObjectDetection

            # Load the OWL-V2 model (will use the HF GPU when available)
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Loading OWL-V2 on device: {device}")
            processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-ensemble")
            model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble").to(device)
            # Load the image
            image = Image.open(image_path)
            # Prepare the text queries (format: [["query1", "query2", ...]])
            texts = [text_queries]
            print(f"OWL-V2 text queries: {texts}")
            # Process inputs
            inputs = processor(text=texts, images=image, return_tensors="pt").to(device)
            # Run inference
            with torch.no_grad():
                outputs = model(**inputs)
            # Target image sizes for rescaling boxes back to pixel coordinates
            target_sizes = torch.Tensor([image.size[::-1]])
            # Post-process results
            results = processor.post_process_object_detection(
                outputs=outputs,
                target_sizes=target_sizes,
                threshold=threshold,
            )
            # Convert to our format
            detections = []
            if results and len(results) > 0:
                result = results[0]
                boxes = result.get("boxes", [])
                scores = result.get("scores", [])
                labels = result.get("labels", [])
                print(f"OWL-V2 found {len(boxes)} detections")
                for box, score, label_idx in zip(boxes, scores, labels):
                    if score >= threshold:
                        x1, y1, x2, y2 = box.tolist()
                        label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown"
                        detections.append({
                            "label": label,
                            "confidence": float(score),
                            "bbox": [int(x1), int(y1), int(x2), int(y2)],
                        })
            return {"detections": detections}
        except Exception as e:
            raise RuntimeError(f"OWL-V2 detection failed: {e}")
    def _detect_owlv2_hf(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects via the Hugging Face Inference API.

        Note: despite the name, this posts to a generic object-detection endpoint
        (DETR_MODEL_ID, default facebook/detr-resnet-101); a plain DETR model detects
        its own fixed label set, and only zero-shot detectors honor candidate_labels.
        """
        try:
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")
        # Object-detection model endpoint
        detr_model = os.getenv("DETR_MODEL_ID", "facebook/detr-resnet-101")
        url = f"https://api-inference.huggingface.co/models/{detr_model}"
        headers = {"Authorization": f"Bearer {self.hf_token}"}
        # Zero-shot detectors expect the image as binary data and the text queries as parameters
        payload = {
            "parameters": {
                "candidate_labels": text_queries,
                "threshold": threshold,
            },
            "options": {"wait_for_model": True},
        }
        try:
            # Send the image as binary data with JSON-encoded parameters
            files = {"inputs": image_data}
            data = {"parameters": json.dumps(payload["parameters"])}
            r = requests.post(url, headers=headers, files=files, data=data, timeout=120)
            r.raise_for_status()
            response_data = r.json()
            # Parse the object-detection response format
            detections = []
            if isinstance(response_data, list):
                for detection in response_data:
                    if isinstance(detection, dict):
                        # Extract the detection info
                        label = detection.get("label", "unknown")
                        confidence = detection.get("score", 0.0)
                        bbox = detection.get("box", {})
                        # Convert the bbox format if needed
                        if bbox:
                            x1 = bbox.get("xmin", 0)
                            y1 = bbox.get("ymin", 0)
                            x2 = bbox.get("xmax", 0)
                            y2 = bbox.get("ymax", 0)
                            detections.append({
                                "label": label,
                                "confidence": confidence,
                                "bbox": [x1, y1, x2, y2],
                            })
            return {"detections": detections}
        except Exception as e:
            raise RuntimeError(f"Object detection via the HF API failed: {e}")
# Backwards-compatible factory in case a caller expects a function or attribute
def GPTOSSWrapperFactory(model: Optional[str] = None, provider: Optional[str] = None):
    return GPTOSSWrapper(model=model or "gpt-oss-120", provider=provider or "auto")
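
# Minimal smoke test (a sketch; run `python gptoss_wrapper.py` with one of the API
# keys exported — the prompt below is a placeholder, not part of the app):
if __name__ == "__main__":
    wrapper = GPTOSSWrapper()
    print(f"Resolved provider: {wrapper.provider}, model: {wrapper.model}")
    try:
        print(wrapper.generate("List three common wind turbine blade defects.", max_tokens=128))
    except RuntimeError as err:
        print(f"Generation unavailable: {err}")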