""" GPTOSSWrapper - Simple integration wrapper for OpenAI or Hugging Face Inference API. Usage: from gptoss_wrapper import GPTOSSWrapper w = GPTOSSWrapper(model="gpt-oss-120") text = w.generate(prompt) Behavior: - Provider selection (priority): 1) If OPENAI_API_KEY is set -> use OpenAI Chat Completions (v1/chat/completions) 2) Else if HUGGINGFACE_API_TOKEN or HF_API_TOKEN is set -> use Hugging Face Inference API 3) Else -> generate() will raise a RuntimeError describing missing credentials. Note for Spaces: - Add the secret in your Space settings (Settings → Secrets & variables → Add secret): - For OpenAI: key name = OPENAI_API_KEY, value = - For Hugging Face: key name = HUGGINGFACE_API_TOKEN (or HF_API_TOKEN), value = This file intentionally uses only the requests stdlib-friendly HTTP approach to avoid depending on extra SDKs. """ import os import time import requests import base64 import torch from PIL import Image from typing import Optional class GPTOSSWrapper: """ Lightweight wrapper that can call either OpenAI or Hugging Face inference endpoints. Constructor: GPTOSSWrapper(model="gpt-oss-120", provider="auto") - model: model name to request (for OpenAI it must be an available model for your account; for Hugging Face it should be a model id hosted on HF). - provider: "auto" (default) | "openai" | "hf" """ def __init__(self, model: str = "gpt-oss-120", provider: str = "auto"): # Allow overriding the model via env var MODEL_ID (useful in Spaces) env_model = os.getenv("MODEL_ID") if env_model: self.model = env_model else: self.model = model self.request_timeout = 30 self.openai_key = os.getenv("OPENAI_API_KEY") # Accept multiple HF token environment variable names for compatibility: # HUGGINGFACE_API_TOKEN, HF_API_TOKEN, or HF_TOKEN (used by some HF examples) self.hf_token = ( os.getenv("HUGGINGFACE_API_TOKEN") or os.getenv("HF_API_TOKEN") or os.getenv("HF_TOKEN") ) self.provider = provider.lower() if provider else "auto" # If we have an HF token and the user didn't explicitly set a MODEL_ID, # prefer the HF router and use a sensible default router model id. if self.hf_token and not env_model and model == "gpt-oss-120": # Default router model id; you can override via MODEL_ID env var in the Space self.model = "openai/gpt-oss-120b:fireworks-ai" if self.provider == "auto": if self.openai_key: self.provider = "openai" elif self.hf_token: self.provider = "hf" else: self.provider = "none" def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str: """ Generate a textual response for the given prompt. Returns: A string with the generated text. Raises: RuntimeError if no credentials are found or the remote call fails. """ if self.provider == "openai": return self._generate_openai(prompt, max_tokens=max_tokens, temperature=temperature) elif self.provider == "hf": return self._generate_hf(prompt, max_tokens=max_tokens, temperature=temperature) else: raise RuntimeError( "No API key configured for GPT wrapper. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN in the environment." ) def analyze_image(self, image_path: str, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str: """ Analyze an image using vision models (OpenAI GPT-4 Vision or Hugging Face Qwen2-VL). Args: image_path: Path to the image file prompt: Text prompt for analysis max_tokens: Maximum tokens in response temperature: Temperature for generation Returns: Analysis text from vision model Raises: RuntimeError if no vision model is available or if the call fails """ if self.provider == "openai": return self._analyze_image_openai(image_path, prompt, max_tokens, temperature) elif self.provider == "hf": return self._analyze_image_hf(image_path, prompt, max_tokens, temperature) else: raise RuntimeError("Image analysis requires either OpenAI API key or Hugging Face token. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN.") def detect_objects_owlv2(self, image_path: str, text_queries: list, threshold: float = 0.1) -> dict: """ Detect objects in image using OWL-V2 or Grounding DINO zero-shot detection with text queries. Runs on HF GPU when available. Args: image_path: Path to the image file text_queries: List of text descriptions to search for (e.g., ["crack", "erosion", "dirt"]) threshold: Confidence threshold for detections Returns: Dictionary with detections: {"detections": [{"label": str, "confidence": float, "bbox": [x1,y1,x2,y2]}, ...]} Raises: RuntimeError if models not available or detection fails """ print(f"Starting zero-shot detection with {len(text_queries)} queries") # Try Grounding DINO first (usually better for zero-shot), then OWL-V2 as fallback try: print("Attempting Grounding DINO detection...") return self._detect_grounding_dino(image_path, text_queries, threshold) except Exception as e: print(f"Grounding DINO failed: {e}") print("Falling back to OWL-V2...") try: return self._detect_owlv2_local(image_path, text_queries, threshold) except Exception as e2: print(f"OWL-V2 also failed: {e2}") # Return empty detections instead of failing completely print("Both models failed, returning empty detections") return {"detections": []} def _generate_openai(self, prompt: str, max_tokens: int, temperature: float) -> str: if not self.openai_key: raise RuntimeError("OPENAI_API_KEY not set in environment.") url = "https://api.openai.com/v1/chat/completions" headers = { "Authorization": f"Bearer {self.openai_key}", "Content-Type": "application/json", } # Build a simple chat conversation with a single system + user message payload = { "model": self.model, "messages": [ {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."}, {"role": "user", "content": prompt}, ], "max_tokens": max_tokens, "temperature": float(temperature), "n": 1, } try: r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout) r.raise_for_status() data = r.json() # OpenAI API returns a list of choices choices = data.get("choices", []) if not choices: raise RuntimeError(f"OpenAI returned empty choices: {data}") # Extract the assistant message msg = choices[0].get("message", {}).get("content") if msg is None: # Some deployments return text in 'text' or in other fields; fallback to stringifying response return str(data) return msg.strip() except Exception as e: # Surface a clear error for the calling code to handle (the app catches exceptions) raise RuntimeError(f"OpenAI API call failed: {e}") def _generate_hf(self, prompt: str, max_tokens: int, temperature: float) -> str: if not self.hf_token: raise RuntimeError("HUGGINGFACE_API_TOKEN (or HF_API_TOKEN / HF_TOKEN) not set in environment.") # Prefer the HF router automatically when an HF token is present unless explicitly disabled. use_router = False # If HF token exists, default to using the router (unless HF_USE_ROUTER is set to a falsey value). if self.hf_token: hf_use_router_val = os.getenv("HF_USE_ROUTER", "").lower() if hf_use_router_val in ("0", "false", "no"): use_router = False else: use_router = True # Explicit enable via HF_USE_ROUTER env var if os.getenv("HF_USE_ROUTER", "").lower() in ("1", "true", "yes"): use_router = True # Also enable router if model id looks like an OpenAI-style id if "openai/" in (self.model or "") or ":" in (self.model or ""): use_router = True try: if use_router: # Router (OpenAI-compatible) endpoint: accepts chat/completions style payloads url = "https://router.huggingface.co/v1/chat/completions" headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"} payload = { "model": self.model, "messages": [ {"role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos."}, {"role": "user", "content": prompt}, ], "max_tokens": max_tokens, "temperature": float(temperature), "n": 1, } r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout) r.raise_for_status() data = r.json() # Try to extract OpenAI-style response choices = data.get("choices", []) if choices and isinstance(choices, list): first = choices[0] # OpenAI-compatible router usually returns message under 'message' msg = first.get("message", {}).get("content") if isinstance(first, dict) else None # Some router variants may return text under 'text' or 'content' if not msg: msg = first.get("text") or first.get("content") if msg: return msg.strip() # Fallback stringify return str(data) else: # Standard Hugging Face inference API url = f"https://api-inference.huggingface.co/models/{self.model}" headers = {"Authorization": f"Bearer {self.hf_token}"} payload = { "inputs": prompt, "parameters": { "max_new_tokens": max_tokens, "temperature": float(temperature), }, "options": {"wait_for_model": True}, } r = requests.post(url, headers=headers, json=payload, timeout=self.request_timeout) r.raise_for_status() data = r.json() # Hugging Face inference may return a list of generated outputs or a dict if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict) and "generated_text" in data[0]: return data[0]["generated_text"].strip() elif isinstance(data, dict) and "generated_text" in data: return data["generated_text"].strip() elif isinstance(data, dict) and "error" in data: raise RuntimeError(f"Hugging Face error: {data['error']}") else: # Some text-generation endpoints return a plain string or different struct; try to stringify return str(data) except Exception as e: raise RuntimeError(f"Hugging Face API call failed: {e}") def _analyze_image_openai(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str: """ Analyze an image using OpenAI GPT-4 Vision API. """ if not self.openai_key: raise RuntimeError("OPENAI_API_KEY not set in environment.") # Encode image to base64 try: with open(image_path, "rb") as image_file: base64_image = base64.b64encode(image_file.read()).decode('utf-8') except Exception as e: raise RuntimeError(f"Failed to read image file {image_path}: {e}") url = "https://api.openai.com/v1/chat/completions" headers = { "Authorization": f"Bearer {self.openai_key}", "Content-Type": "application/json", } # Use GPT-4 Vision model vision_model = "gpt-4-vision-preview" # Build payload for vision API payload = { "model": vision_model, "messages": [ { "role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish." }, { "role": "user", "content": [ { "type": "text", "text": prompt }, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{base64_image}", "detail": "high" } } ] } ], "max_tokens": max_tokens, "temperature": float(temperature), } try: r = requests.post(url, headers=headers, json=payload, timeout=60) # Longer timeout for vision r.raise_for_status() data = r.json() choices = data.get("choices", []) if not choices: raise RuntimeError(f"OpenAI Vision returned empty choices: {data}") msg = choices[0].get("message", {}).get("content") if msg is None: return str(data) return msg.strip() except Exception as e: raise RuntimeError(f"OpenAI Vision API call failed: {e}") def _analyze_image_hf(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str: """ Analyze an image using Hugging Face vision models (like Qwen2-VL). """ if not self.hf_token: raise RuntimeError("HUGGINGFACE_API_TOKEN not set in environment.") # Encode image to base64 try: with open(image_path, "rb") as image_file: base64_image = base64.b64encode(image_file.read()).decode('utf-8') except Exception as e: raise RuntimeError(f"Failed to read image file {image_path}: {e}") # Use Qwen2-VL model for vision analysis vision_model = os.getenv("VISION_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct") # Check if we should use the router use_router = False if self.hf_token: hf_use_router_val = os.getenv("HF_USE_ROUTER", "").lower() if hf_use_router_val not in ("0", "false", "no"): use_router = True try: if use_router: # Router endpoint for vision models url = "https://router.huggingface.co/v1/chat/completions" headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"} payload = { "model": vision_model, "messages": [ { "role": "system", "content": "You are an expert inspection assistant for wind turbine blade images/videos. Analyze images in detail and provide comprehensive assessments in Spanish." }, { "role": "user", "content": [ { "type": "text", "text": prompt }, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{base64_image}" } } ] } ], "max_tokens": max_tokens, "temperature": float(temperature), } r = requests.post(url, headers=headers, json=payload, timeout=120) r.raise_for_status() data = r.json() choices = data.get("choices", []) if choices and isinstance(choices, list): first = choices[0] msg = first.get("message", {}).get("content") if isinstance(first, dict) else None if not msg: msg = first.get("text") or first.get("content") if msg: return msg.strip() return str(data) else: # Direct Hugging Face Inference API for vision models url = f"https://api-inference.huggingface.co/models/{vision_model}" headers = {"Authorization": f"Bearer {self.hf_token}"} # For vision models, we need to send both text and image payload = { "inputs": { "text": prompt, "image": base64_image }, "parameters": { "max_new_tokens": max_tokens, "temperature": float(temperature), }, "options": {"wait_for_model": True}, } r = requests.post(url, headers=headers, json=payload, timeout=120) r.raise_for_status() data = r.json() # Handle different response formats if isinstance(data, list) and len(data) > 0: if isinstance(data[0], dict): if "generated_text" in data[0]: return data[0]["generated_text"].strip() elif "text" in data[0]: return data[0]["text"].strip() elif isinstance(data, dict): if "generated_text" in data: return data["generated_text"].strip() elif "text" in data: return data["text"].strip() elif "error" in data: raise RuntimeError(f"Hugging Face error: {data['error']}") return str(data) except Exception as e: raise RuntimeError(f"Hugging Face Vision API call failed: {e}") def _detect_grounding_dino(self, image_path: str, text_queries: list, threshold: float) -> dict: """ Detect objects using Grounding DINO. Try HF API first, then local model. """ # Try HF API first (more reliable) if self.hf_token: try: return self._detect_grounding_dino_api(image_path, text_queries, threshold) except Exception as e: print(f"Grounding DINO API failed: {e}") print("Falling back to local model...") # Fallback to local model try: from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection # Load Grounding DINO model (will use HF GPU) model_id = "IDEA-Research/grounding-dino-base" device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Loading Grounding DINO on device: {device}") processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) # Load image image = Image.open(image_path) # Prepare text queries (VERY important: lowercase + end with dot) text = ". ".join([query.lower() for query in text_queries]) + "." print(f"Grounding DINO text query: {text}") # Process inputs inputs = processor(images=image, text=text, return_tensors="pt").to(device) # Run inference with torch.no_grad(): outputs = model(**inputs) # Post-process results (detectar sintaxis automáticamente) try: # Intentar sintaxis nueva (transformers >= 4.44) results = processor.post_process_grounded_object_detection( outputs, inputs.input_ids, box_threshold=threshold, text_threshold=0.3, target_sizes=[image.size[::-1]] ) except TypeError as e: if "box_threshold" in str(e): # Fallback a sintaxis antigua (transformers < 4.44) print("Using legacy post_process_grounded_object_detection syntax") results = processor.post_process_grounded_object_detection( outputs, inputs.input_ids, threshold=threshold, target_sizes=[image.size[::-1]] ) else: raise e # Convert to our format detections = [] if results and len(results) > 0: result = results[0] boxes = result.get("boxes", []) scores = result.get("scores", []) labels = result.get("labels", []) print(f"Grounding DINO found {len(boxes)} detections") for i, (box, score, label_info) in enumerate(zip(boxes, scores, labels)): try: # Convert score to float safely score_val = float(score.item() if hasattr(score, 'item') else score) if score_val >= threshold: # Convert box coordinates safely if hasattr(box, 'tolist'): x1, y1, x2, y2 = box.tolist() else: x1, y1, x2, y2 = box # Handle label safely if isinstance(label_info, (int, float)): label_idx = int(label_info) label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown" else: label = str(label_info) detections.append({ "label": label, "confidence": score_val, "bbox": [int(x1), int(y1), int(x2), int(y2)] }) except Exception as e: print(f"Error processing detection {i}: {e}") continue return {"detections": detections} except Exception as e: raise RuntimeError(f"Grounding DINO detection failed: {e}") def _detect_grounding_dino_api(self, image_path: str, text_queries: list, threshold: float) -> dict: """ Detect objects using Grounding DINO via HF Inference API. """ if not self.hf_token: raise RuntimeError("HF token required for Grounding DINO API") try: import base64 # Encode image to base64 with open(image_path, "rb") as image_file: base64_image = base64.b64encode(image_file.read()).decode('utf-8') # Prepare text queries (VERY important: lowercase + end with dot) text = ". ".join([query.lower() for query in text_queries]) + "." print(f"Grounding DINO API text query: {text}") # Use Grounding DINO model via API model_id = "IDEA-Research/grounding-dino-base" url = f"https://api-inference.huggingface.co/models/{model_id}" headers = {"Authorization": f"Bearer {self.hf_token}"} # Prepare payload for Grounding DINO API payload = { "inputs": { "image": base64_image, "text": text }, "parameters": { "threshold": threshold } } response = requests.post(url, headers=headers, json=payload, timeout=30) if response.status_code == 200: data = response.json() # Convert API response to our format detections = [] if isinstance(data, list): for detection in data: if detection.get("score", 0) >= threshold: box = detection.get("box", {}) detections.append({ "label": detection.get("label", "unknown"), "confidence": float(detection.get("score", 0)), "bbox": [ int(box.get("xmin", 0)), int(box.get("ymin", 0)), int(box.get("xmax", 0)), int(box.get("ymax", 0)) ] }) print(f"Grounding DINO API found {len(detections)} detections") return {"detections": detections} else: raise RuntimeError(f"API call failed with status {response.status_code}: {response.text}") except Exception as e: raise RuntimeError(f"Grounding DINO API detection failed: {e}") def _detect_owlv2_local(self, image_path: str, text_queries: list, threshold: float) -> dict: """ Detect objects using OWL-V2 running on HF GPU. """ try: from transformers import Owlv2Processor, Owlv2ForObjectDetection # Load OWL-V2 model (will use HF GPU) device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Loading OWL-V2 on device: {device}") processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-ensemble") model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble").to(device) # Load image image = Image.open(image_path) # Prepare text queries (format: [["query1", "query2", ...]]) texts = [text_queries] print(f"OWL-V2 text queries: {texts}") # Process inputs inputs = processor(text=texts, images=image, return_tensors="pt").to(device) # Run inference with torch.no_grad(): outputs = model(**inputs) # Target image sizes for rescaling target_sizes = torch.Tensor([image.size[::-1]]) # Post-process results results = processor.post_process_object_detection( outputs=outputs, target_sizes=target_sizes, threshold=threshold ) # Convert to our format detections = [] if results and len(results) > 0: result = results[0] boxes = result.get("boxes", []) scores = result.get("scores", []) labels = result.get("labels", []) print(f"OWL-V2 found {len(boxes)} detections") for box, score, label_idx in zip(boxes, scores, labels): if score >= threshold: x1, y1, x2, y2 = box.tolist() label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown" detections.append({ "label": label, "confidence": float(score), "bbox": [int(x1), int(y1), int(x2), int(y2)] }) return {"detections": detections} except Exception as e: raise RuntimeError(f"OWL-V2 detection failed: {e}") def _detect_owlv2_local(self, image_path: str, text_queries: list, threshold: float) -> dict: """ Detect objects using OWL-V2 locally. """ try: from transformers import Owlv2Processor, Owlv2ForObjectDetection # Load OWL-V2 model processor = Owlv2Processor.from_pretrained("google/owlv2-large-patch14-ensemble") model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble") # Load image image = Image.open(image_path) # Prepare text queries (format: [["query1", "query2", ...]]) texts = [text_queries] # Process inputs inputs = processor(text=texts, images=image, return_tensors="pt") # Run inference with torch.no_grad(): outputs = model(**inputs) # Target image sizes for rescaling target_sizes = torch.Tensor([image.size[::-1]]) # Post-process results results = processor.post_process_object_detection( outputs=outputs, target_sizes=target_sizes, threshold=threshold ) # Convert to our format detections = [] if results and len(results) > 0: result = results[0] boxes = result.get("boxes", []) scores = result.get("scores", []) labels = result.get("labels", []) for box, score, label_idx in zip(boxes, scores, labels): if score >= threshold: x1, y1, x2, y2 = box.tolist() label = text_queries[label_idx] if label_idx < len(text_queries) else "unknown" detections.append({ "label": label, "confidence": float(score), "bbox": [int(x1), int(y1), int(x2), int(y2)] }) return {"detections": detections} except Exception as e: raise RuntimeError(f"OWL-V2 detection failed: {e}") def _detect_owlv2_hf(self, image_path: str, text_queries: list, threshold: float) -> dict: """ Detect objects using OWL-V2 via Hugging Face Inference API. """ try: with open(image_path, "rb") as image_file: image_data = image_file.read() except Exception as e: raise RuntimeError(f"Failed to read image file {image_path}: {e}") # DETR model endpoint (object detection) detr_model = os.getenv("DETR_MODEL_ID", "facebook/detr-resnet-101") url = f"https://api-inference.huggingface.co/models/{detr_model}" headers = {"Authorization": f"Bearer {self.hf_token}"} # Prepare payload for DETR # OWL-V2 expects image as binary data and text queries as parameters payload = { "parameters": { "candidate_labels": text_queries, "threshold": threshold }, "options": {"wait_for_model": True} } try: # Send image as binary data with parameters files = {"inputs": image_data} data = {"parameters": str(payload["parameters"]).replace("'", '"')} r = requests.post(url, headers=headers, files=files, data=data, timeout=120) r.raise_for_status() response_data = r.json() # Parse OWL-V2 response format detections = [] if isinstance(response_data, list): for detection in response_data: if isinstance(detection, dict): # Extract detection info label = detection.get("label", "unknown") confidence = detection.get("score", 0.0) bbox = detection.get("box", {}) # Convert bbox format if needed if bbox: x1 = bbox.get("xmin", 0) y1 = bbox.get("ymin", 0) x2 = bbox.get("xmax", 0) y2 = bbox.get("ymax", 0) detections.append({ "label": label, "confidence": confidence, "bbox": [x1, y1, x2, y2] }) return {"detections": detections} except Exception as e: raise RuntimeError(f"OWL-V2 detection failed: {e}") # Backwards-compatible factory in case caller expects a function or attribute def GPTOSSWrapperFactory(model: Optional[str] = None, provider: Optional[str] = None): return GPTOSSWrapper(model=model or "gpt-oss-120", provider=provider or "auto")