| """ | |
| GPTOSSWrapper - Simple integration wrapper for OpenAI or Hugging Face Inference API. | |
| Usage: | |
| from gptoss_wrapper import GPTOSSWrapper | |
| w = GPTOSSWrapper(model="gpt-oss-120") | |
| text = w.generate(prompt) | |
| Behavior: | |
| - Provider selection (priority): | |
| 1) If OPENAI_API_KEY is set -> use OpenAI Chat Completions (v1/chat/completions) | |
| 2) Else if HUGGINGFACE_API_TOKEN or HF_API_TOKEN is set -> use Hugging Face Inference API | |
| 3) Else -> generate() will raise a RuntimeError describing missing credentials. | |
| Note for Spaces: | |
| - Add the secret in your Space settings (Settings → Secrets & variables → Add secret): | |
| - For OpenAI: key name = OPENAI_API_KEY, value = <your_openai_api_key> | |
| - For Hugging Face: key name = HUGGINGFACE_API_TOKEN (or HF_API_TOKEN), value = <your_hf_token> | |
| This file intentionally uses only the requests stdlib-friendly HTTP approach to avoid depending on extra SDKs. | |
| """ | |
import os
import json
import base64

import requests
import torch
from PIL import Image
from typing import Optional
class GPTOSSWrapper:
    """
    Lightweight wrapper that can call either OpenAI or Hugging Face inference endpoints.

    Constructor:
        GPTOSSWrapper(model="gpt-oss-120", provider="auto")
        - model: model name to request (for OpenAI it must be a model available to your account;
          for Hugging Face it should be a model id hosted on HF).
        - provider: "auto" (default) | "openai" | "hf"
    """
    def __init__(self, model: str = "gpt-oss-120", provider: str = "auto"):
        # Allow overriding the model via the MODEL_ID env var (useful in Spaces)
        env_model = os.getenv("MODEL_ID")
        self.model = env_model if env_model else model
        self.request_timeout = 30
        self.openai_key = os.getenv("OPENAI_API_KEY")
        # Accept multiple HF token environment variable names for compatibility:
        # HUGGINGFACE_API_TOKEN, HF_API_TOKEN, or HF_TOKEN (used by some HF examples)
        self.hf_token = (
            os.getenv("HUGGINGFACE_API_TOKEN")
            or os.getenv("HF_API_TOKEN")
            or os.getenv("HF_TOKEN")
        )
        self.provider = provider.lower() if provider else "auto"
        # If we have an HF token and the user didn't explicitly set a MODEL_ID,
        # prefer the HF router and use a sensible default router model id.
        if self.hf_token and not env_model and model == "gpt-oss-120":
            # Default router model id; override via the MODEL_ID env var in the Space
            self.model = "openai/gpt-oss-120b:fireworks-ai"
        if self.provider == "auto":
            if self.openai_key:
                self.provider = "openai"
            elif self.hf_token:
                self.provider = "hf"
            else:
                self.provider = "none"
    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Generate a textual response for the given prompt.

        Returns:
            A string with the generated text.

        Raises:
            RuntimeError if no credentials are found or the remote call fails.
        """
        if self.provider == "openai":
            return self._generate_openai(prompt, max_tokens=max_tokens, temperature=temperature)
        elif self.provider == "hf":
            return self._generate_hf(prompt, max_tokens=max_tokens, temperature=temperature)
        else:
            raise RuntimeError(
                "No API key configured for GPT wrapper. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN in the environment."
            )
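    # Example call (a sketch; the prompt is a placeholder, and the surrounding app is
    # expected to catch the RuntimeError raised when no credentials are configured):
    #
    #     w = GPTOSSWrapper()
    #     try:
    #         answer = w.generate("Summarize the blade inspection findings.", max_tokens=256)
    #     except RuntimeError as err:
    #         answer = f"LLM unavailable: {err}"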
    def analyze_image(self, image_path: str, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Analyze an image using vision models (OpenAI GPT-4 Vision or Hugging Face Qwen2-VL).

        Args:
            image_path: Path to the image file
            prompt: Text prompt for analysis
            max_tokens: Maximum tokens in response
            temperature: Temperature for generation

        Returns:
            Analysis text from the vision model

        Raises:
            RuntimeError if no vision model is available or if the call fails
        """
        if self.provider == "openai":
            return self._analyze_image_openai(image_path, prompt, max_tokens, temperature)
        elif self.provider == "hf":
            return self._analyze_image_hf(image_path, prompt, max_tokens, temperature)
        else:
            raise RuntimeError(
                "Image analysis requires either an OpenAI API key or a Hugging Face token. "
                "Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN."
            )
    def detect_objects_owlv2(self, image_path: str, text_queries: list, threshold: float = 0.1) -> dict:
        """
        Detect objects in an image using OWL-V2 or Grounding DINO zero-shot detection with text queries.
        Runs on the HF GPU when available.

        Args:
            image_path: Path to the image file
            text_queries: List of text descriptions to search for (e.g., ["crack", "erosion", "dirt"])
            threshold: Confidence threshold for detections

        Returns:
            Dictionary with detections: {"detections": [{"label": str, "confidence": float, "bbox": [x1, y1, x2, y2]}, ...]}

        Raises:
            RuntimeError if the models are unavailable or detection fails
        """
        print(f"Starting zero-shot detection with {len(text_queries)} queries")
        # Try Grounding DINO first (usually better for zero-shot), then OWL-V2 as a fallback
        try:
            print("Attempting Grounding DINO detection...")
            return self._detect_grounding_dino(image_path, text_queries, threshold)
        except Exception as e:
            print(f"Grounding DINO failed: {e}")
            print("Falling back to OWL-V2...")
            try:
                return self._detect_owlv2_local(image_path, text_queries, threshold)
            except Exception as e2:
                print(f"OWL-V2 also failed: {e2}")
                # Return empty detections instead of failing completely
                print("Both models failed, returning empty detections")
                return {"detections": []}
    def _generate_openai(self, prompt: str, max_tokens: int, temperature: float) -> str:
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }
        # Build a simple chat conversation with a single system + user message
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": max_tokens,
            "temperature": float(temperature),
            "n": 1,
        }
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
            r.raise_for_status()
            data = r.json()
            # The OpenAI API returns a list of choices
            choices = data.get("choices", [])
            if not choices:
                raise RuntimeError(f"OpenAI returned empty choices: {data}")
            # Extract the assistant message
            msg = choices[0].get("message", {}).get("content")
            if msg is None:
                # Some deployments return text in 'text' or other fields; fall back to stringifying the response
                return str(data)
            return msg.strip()
        except Exception as e:
            # Surface a clear error for the calling code to handle (the app catches exceptions)
            raise RuntimeError(f"OpenAI API call failed: {e}")
    def _generate_hf(self, prompt: str, max_tokens: int, temperature: float) -> str:
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN (or HF_API_TOKEN / HF_TOKEN) not set in environment.")
        # Prefer the HF router (OpenAI-compatible) by default; HF_USE_ROUTER=0/false/no disables it.
        use_router = os.getenv("HF_USE_ROUTER", "").lower() not in ("0", "false", "no")
        # Always use the router if the model id looks like a router-style id (e.g. "openai/...:provider")
        if "openai/" in (self.model or "") or ":" in (self.model or ""):
            use_router = True
        try:
            if use_router:
                # Router (OpenAI-compatible) endpoint: accepts chat/completions-style payloads
                url = "https://router.huggingface.co/v1/chat/completions"
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}
                payload = {
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."},
                        {"role": "user", "content": prompt},
                    ],
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                    "n": 1,
                }
                r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
                r.raise_for_status()
                data = r.json()
                # Try to extract an OpenAI-style response
                choices = data.get("choices", [])
                if choices and isinstance(choices, list):
                    first = choices[0]
                    # The OpenAI-compatible router usually returns the message under 'message'
                    msg = first.get("message", {}).get("content") if isinstance(first, dict) else None
                    # Some router variants may return text under 'text' or 'content'
                    if not msg and isinstance(first, dict):
                        msg = first.get("text") or first.get("content")
                    if msg:
                        return msg.strip()
                # Fallback: stringify
                return str(data)
            else:
                # Standard Hugging Face Inference API
                url = f"https://api-inference.huggingface.co/models/{self.model}"
                headers = {"Authorization": f"Bearer {self.hf_token}"}
                payload = {
                    "inputs": prompt,
                    "parameters": {
                        "max_new_tokens": max_tokens,
                        "temperature": float(temperature),
                    },
                    "options": {"wait_for_model": True},
                }
                r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout)
                r.raise_for_status()
                data = r.json()
                # The Hugging Face Inference API may return a list of generated outputs or a dict
                if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict) and "generated_text" in data[0]:
                    return data[0]["generated_text"].strip()
                elif isinstance(data, dict) and "generated_text" in data:
                    return data["generated_text"].strip()
                elif isinstance(data, dict) and "error" in data:
                    raise RuntimeError(f"Hugging Face error: {data['error']}")
                else:
                    # Some text-generation endpoints return a plain string or a different structure; stringify
                    return str(data)
        except Exception as e:
            raise RuntimeError(f"Hugging Face API call failed: {e}")
    def _analyze_image_openai(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using the OpenAI GPT-4 Vision API.
        """
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")
        # Encode the image to base64
        try:
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }
        # Use a GPT-4 Vision model. NOTE: "gpt-4-vision-preview" has since been deprecated by
        # OpenAI; newer multimodal models (e.g. gpt-4o) accept the same image_url content format.
        vision_model = "gpt-4-vision-preview"
        # Build the payload for the vision API
        payload = {
            "model": vision_model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish.",
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high",
                            },
                        },
                    ],
                },
            ],
            "max_tokens": max_tokens,
            "temperature": float(temperature),
        }
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=60)  # Longer timeout for vision
            r.raise_for_status()
            data = r.json()
            choices = data.get("choices", [])
            if not choices:
                raise RuntimeError(f"OpenAI Vision returned empty choices: {data}")
            msg = choices[0].get("message", {}).get("content")
            if msg is None:
                return str(data)
            return msg.strip()
        except Exception as e:
            raise RuntimeError(f"OpenAI Vision API call failed: {e}")
    def _analyze_image_hf(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using Hugging Face vision models (like Qwen2-VL).
        """
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN not set in environment.")
        # Encode the image to base64
        try:
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")
        # Use a Qwen2-VL model for vision analysis
        vision_model = os.getenv("VISION_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct")
        # Decide whether to use the router (same default-on logic as _generate_hf)
        use_router = os.getenv("HF_USE_ROUTER", "").lower() not in ("0", "false", "no")
        try:
            if use_router:
                # Router endpoint for vision models
                url = "https://router.huggingface.co/v1/chat/completions"
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}
                payload = {
                    "model": vision_model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish.",
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        },
                    ],
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                }
                r = requests.post(url, headers=headers, json=payload, timeout=120)
                r.raise_for_status()
                data = r.json()
                choices = data.get("choices", [])
                if choices and isinstance(choices, list):
                    first = choices[0]
                    msg = first.get("message", {}).get("content") if isinstance(first, dict) else None
                    if not msg and isinstance(first, dict):
                        msg = first.get("text") or first.get("content")
                    if msg:
                        return msg.strip()
                return str(data)
            else:
                # Direct Hugging Face Inference API for vision models
                url = f"https://api-inference.huggingface.co/models/{vision_model}"
                headers = {"Authorization": f"Bearer {self.hf_token}"}
                # For vision models we send both the text and the image
                payload = {
                    "inputs": {
                        "text": prompt,
                        "image": base64_image,
                    },
                    "parameters": {
                        "max_new_tokens": max_tokens,
                        "temperature": float(temperature),
                    },
                    "options": {"wait_for_model": True},
                }
                r = requests.post(url, headers=headers, json=payload, timeout=120)
                r.raise_for_status()
                data = r.json()
                # Handle different response formats
                if isinstance(data, list) and len(data) > 0:
                    if isinstance(data[0], dict):
                        if "generated_text" in data[0]:
                            return data[0]["generated_text"].strip()
                        elif "text" in data[0]:
                            return data[0]["text"].strip()
                elif isinstance(data, dict):
                    if "generated_text" in data:
                        return data["generated_text"].strip()
                    elif "text" in data:
                        return data["text"].strip()
                    elif "error" in data:
                        raise RuntimeError(f"Hugging Face error: {data['error']}")
                return str(data)
        except Exception as e:
            raise RuntimeError(f"Hugging Face Vision API call failed: {e}")
    def _detect_grounding_dino(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using Grounding DINO. Try the HF API first, then a local model.
        """
        # Try the HF API first (more reliable)
        if self.hf_token:
            try:
                return self._detect_grounding_dino_api(image_path, text_queries, threshold)
            except Exception as e:
                print(f"Grounding DINO API failed: {e}")
                print("Falling back to local model...")
        # Fall back to the local model
        try:
            from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

            # Load the Grounding DINO model (will use the HF GPU when available)
            model_id = "IDEA-Research/grounding-dino-base"
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Loading Grounding DINO on device: {device}")
            processor = AutoProcessor.from_pretrained(model_id)
            model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
            # Load the image
            image = Image.open(image_path)
            # Prepare the text queries (VERY important: lowercase + terminated with a period)
            text = ". ".join([query.lower() for query in text_queries]) + "."
            print(f"Grounding DINO text query: {text}")
            # Process inputs
            inputs = processor(images=image, text=text, return_tensors="pt").to(device)
            # Run inference
            with torch.no_grad():
                outputs = model(**inputs)
            # Post-process results (detect which keyword syntax this transformers version uses)
            try:
                # Try the newer syntax (transformers >= 4.44)
                results = processor.post_process_grounded_object_detection(
                    outputs,
                    inputs.input_ids,
                    box_threshold=threshold,
                    text_threshold=0.3,
                    target_sizes=[image.size[::-1]],
                )
            except TypeError as e:
                if "box_threshold" in str(e):
                    # Fall back to the older syntax (transformers < 4.44)
                    print("Using legacy post_process_grounded_object_detection syntax")
                    results = processor.post_process_grounded_object_detection(
                        outputs,
                        inputs.input_ids,
                        threshold=threshold,
                        target_sizes=[image.size[::-1]],
                    )
                else:
                    raise
            # Convert to our format
            detections = []
            if results and len(results) > 0:
                result = results[0]
                boxes = result.get("boxes", [])
                scores = result.get("scores", [])
                labels = result.get("labels", [])
                print(f"Grounding DINO found {len(boxes)} detections")
                for i, (box, score, label_info) in enumerate(zip(boxes, scores, labels)):
                    try:
                        # Convert the score to a float safely
                        score_val = float(score.item() if hasattr(score, "item") else score)
                        if score_val >= threshold:
                            # Convert the box coordinates safely
                            if hasattr(box, "tolist"):
                                x1, y1, x2, y2 = box.tolist()
                            else:
                                x1, y1, x2, y2 = box
                            # Handle the label safely (it may be an index or a string)
                            if isinstance(label_info, (int, float)):
                                label_idx = int(label_info)
                                label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown"
                            else:
                                label = str(label_info)
                            detections.append({
                                "label": label,
                                "confidence": score_val,
                                "bbox": [int(x1), int(y1), int(x2), int(y2)],
                            })
                    except Exception as e:
                        print(f"Error processing detection {i}: {e}")
                        continue
            return {"detections": detections}
        except Exception as e:
            raise RuntimeError(f"Grounding DINO detection failed: {e}")
    def _detect_grounding_dino_api(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using Grounding DINO via the HF Inference API.
        """
        if not self.hf_token:
            raise RuntimeError("HF token required for the Grounding DINO API")
        try:
            # Encode the image to base64
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
            # Prepare the text queries (VERY important: lowercase + terminated with a period)
            text = ". ".join([query.lower() for query in text_queries]) + "."
            print(f"Grounding DINO API text query: {text}")
            # Use the Grounding DINO model via the API
            model_id = "IDEA-Research/grounding-dino-base"
            url = f"https://api-inference.huggingface.co/models/{model_id}"
            headers = {"Authorization": f"Bearer {self.hf_token}"}
            # Prepare the payload for the Grounding DINO API
            payload = {
                "inputs": {
                    "image": base64_image,
                    "text": text,
                },
                "parameters": {
                    "threshold": threshold,
                },
            }
            response = requests.post(url, headers=headers, json=payload, timeout=30)
            if response.status_code == 200:
                data = response.json()
                # Convert the API response to our format
                detections = []
                if isinstance(data, list):
                    for detection in data:
                        if detection.get("score", 0) >= threshold:
                            box = detection.get("box", {})
                            detections.append({
                                "label": detection.get("label", "unknown"),
                                "confidence": float(detection.get("score", 0)),
                                "bbox": [
                                    int(box.get("xmin", 0)),
                                    int(box.get("ymin", 0)),
                                    int(box.get("xmax", 0)),
                                    int(box.get("ymax", 0)),
                                ],
                            })
                print(f"Grounding DINO API found {len(detections)} detections")
                return {"detections": detections}
            else:
                raise RuntimeError(f"API call failed with status {response.status_code}: {response.text}")
        except Exception as e:
            raise RuntimeError(f"Grounding DINO API detection failed: {e}")
    def _detect_owlv2_local(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects using OWL-V2 running locally (on the HF GPU when available).
        """
        try:
            from transformers import Owlv2Processor, Owlv2ForObjectDetection

            # Load the OWL-V2 model (will use the HF GPU when available)
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Loading OWL-V2 on device: {device}")
            processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-ensemble")
            model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble").to(device)
            # Load the image
            image = Image.open(image_path)
            # Prepare the text queries (format: [["query1", "query2", ...]])
            texts = [text_queries]
            print(f"OWL-V2 text queries: {texts}")
            # Process inputs
            inputs = processor(text=texts, images=image, return_tensors="pt").to(device)
            # Run inference
            with torch.no_grad():
                outputs = model(**inputs)
            # Target image sizes for rescaling boxes back to pixel coordinates
            target_sizes = torch.Tensor([image.size[::-1]])
            # Post-process results
            results = processor.post_process_object_detection(
                outputs=outputs,
                target_sizes=target_sizes,
                threshold=threshold,
            )
            # Convert to our format
            detections = []
            if results and len(results) > 0:
                result = results[0]
                boxes = result.get("boxes", [])
                scores = result.get("scores", [])
                labels = result.get("labels", [])
                print(f"OWL-V2 found {len(boxes)} detections")
                for box, score, label_idx in zip(boxes, scores, labels):
                    if score >= threshold:
                        x1, y1, x2, y2 = box.tolist()
                        label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown"
                        detections.append({
                            "label": label,
                            "confidence": float(score),
                            "bbox": [int(x1), int(y1), int(x2), int(y2)],
                        })
            return {"detections": detections}
        except Exception as e:
            raise RuntimeError(f"OWL-V2 detection failed: {e}")
    def _detect_owlv2_hf(self, image_path: str, text_queries: list, threshold: float) -> dict:
        """
        Detect objects via the Hugging Face Inference API.

        Note: despite the name, this posts to a generic object-detection endpoint
        (DETR_MODEL_ID, default facebook/detr-resnet-101); a plain DETR model detects
        its own fixed label set, and only zero-shot detectors honor candidate_labels.
        """
        try:
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}")
        # Object-detection model endpoint
        detr_model = os.getenv("DETR_MODEL_ID", "facebook/detr-resnet-101")
        url = f"https://api-inference.huggingface.co/models/{detr_model}"
        headers = {"Authorization": f"Bearer {self.hf_token}"}
        # Zero-shot detectors expect the image as binary data and the text queries as parameters
        payload = {
            "parameters": {
                "candidate_labels": text_queries,
                "threshold": threshold,
            },
            "options": {"wait_for_model": True},
        }
        try:
            # Send the image as binary data with JSON-encoded parameters
            files = {"inputs": image_data}
            data = {"parameters": json.dumps(payload["parameters"])}
            r = requests.post(url, headers=headers, files=files, data=data, timeout=120)
            r.raise_for_status()
            response_data = r.json()
            # Parse the object-detection response format
            detections = []
            if isinstance(response_data, list):
                for detection in response_data:
                    if isinstance(detection, dict):
                        # Extract the detection info
                        label = detection.get("label", "unknown")
                        confidence = detection.get("score", 0.0)
                        bbox = detection.get("box", {})
                        # Convert the bbox format if needed
                        if bbox:
                            x1 = bbox.get("xmin", 0)
                            y1 = bbox.get("ymin", 0)
                            x2 = bbox.get("xmax", 0)
                            y2 = bbox.get("ymax", 0)
                            detections.append({
                                "label": label,
                                "confidence": confidence,
                                "bbox": [x1, y1, x2, y2],
                            })
            return {"detections": detections}
        except Exception as e:
            raise RuntimeError(f"Object detection via the HF API failed: {e}")
# Backwards-compatible factory in case a caller expects a function or attribute
def GPTOSSWrapperFactory(model: Optional[str] = None, provider: Optional[str] = None):
    return GPTOSSWrapper(model=model or "gpt-oss-120", provider=provider or "auto")
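
# Minimal smoke test (a sketch; run `python gptoss_wrapper.py` with one of the API
# keys exported — the prompt below is a placeholder, not part of the app):
if __name__ == "__main__":
    wrapper = GPTOSSWrapper()
    print(f"Resolved provider: {wrapper.provider}, model: {wrapper.model}")
    try:
        print(wrapper.generate("List three common wind turbine blade defects.", max_tokens=128))
    except RuntimeError as err:
        print(f"Generation unavailable: {err}")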