Spaces:

Kesherat
/

blade-inspection-demo

Sleeping

File size: 18,239 Bytes

"""
GPTOSSWrapper - Simple integration wrapper for OpenAI or Hugging Face Inference API.

Usage:
    from gptoss_wrapper import GPTOSSWrapper
    w = GPTOSSWrapper(model="gpt-oss-120")
    text = w.generate(prompt)

Behavior:
- Provider selection (priority):
    1) If OPENAI_API_KEY is set -> use OpenAI Chat Completions (v1/chat/completions)
    2) Else if HUGGINGFACE_API_TOKEN, HF_API_TOKEN, or HF_TOKEN is set -> use Hugging Face Inference API
    3) Else -> generate() will raise a RuntimeError describing missing credentials.

Note for Spaces:
- Add the secret in your Space settings (Settings → Secrets & variables → Add secret):
    - For OpenAI: key name = OPENAI_API_KEY, value = <your_openai_api_key>
    - For Hugging Face: key name = HUGGINGFACE_API_TOKEN (or HF_API_TOKEN / HF_TOKEN), value = <your_hf_token>

This file intentionally talks to both providers over plain HTTP using the `requests` library, to avoid depending on provider-specific SDKs.
"""
import os
import time
import requests
import base64
from typing import Optional


class GPTOSSWrapper:
    """
    Lightweight wrapper that can call either OpenAI or Hugging Face inference endpoints.

    Constructor:
        GPTOSSWrapper(model="gpt-oss-120", provider="auto")

    - model: model name to request (for OpenAI it must be an available model for your account;
             for Hugging Face it should be a model id hosted on HF).
    - provider: "auto" (default) | "openai" | "hf"

    Environment variables read by this class:
    - MODEL_ID: overrides the ``model`` constructor argument.
    - OPENAI_API_KEY: OpenAI credential; wins over HF when provider is "auto".
    - HUGGINGFACE_API_TOKEN / HF_API_TOKEN / HF_TOKEN: Hugging Face credential
      (first non-empty one is used).
    - HF_USE_ROUTER: set to "0", "false" or "no" to use the direct HF inference
      API instead of the OpenAI-compatible HF router.
    - VISION_MODEL_ID: Hugging Face vision model id (default Qwen/Qwen2-VL-7B-Instruct).
    - OPENAI_VISION_MODEL_ID: OpenAI vision model id (default gpt-4-vision-preview).

    All remote or file failures are surfaced as RuntimeError.
    """

    _DEFAULT_MODEL = "gpt-oss-120"
    # Model id substituted for the default when requests go through the HF router.
    _HF_ROUTER_DEFAULT_MODEL = "openai/gpt-oss-120b:fireworks-ai"

    _OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
    _HF_ROUTER_CHAT_URL = "https://router.huggingface.co/v1/chat/completions"
    _HF_INFERENCE_URL = "https://api-inference.huggingface.co/models/"

    _TEXT_SYSTEM_PROMPT = "You are an expert inspection assistant for wind turbine blade images/videos."
    _VISION_SYSTEM_PROMPT = (
        "You are an expert inspection assistant for wind turbine blade images/videos. "
        "Analyze images in detail and provide comprehensive assessments in Spanish."
    )

    # Vision requests carry a base64 image and get longer timeouts than text.
    _OPENAI_VISION_TIMEOUT = 60
    _HF_VISION_TIMEOUT = 120

    def __init__(self, model: str = "gpt-oss-120", provider: str = "auto"):
        """
        Resolve credentials, provider and model from arguments and environment.

        Args:
            model: Model name to request; overridden by the MODEL_ID env var when set.
            provider: "auto", "openai" or "hf" (case-insensitive). With "auto",
                OpenAI is chosen if OPENAI_API_KEY is set, else Hugging Face if
                an HF token is set, else the provider becomes "none".
        """
        # Allow overriding the model via env var MODEL_ID (useful in Spaces)
        env_model = os.getenv("MODEL_ID")
        self.model = env_model if env_model else model

        self.request_timeout = 30
        self.openai_key = os.getenv("OPENAI_API_KEY")
        # Accept multiple HF token environment variable names for compatibility:
        # HUGGINGFACE_API_TOKEN, HF_API_TOKEN, or HF_TOKEN (used by some HF examples)
        self.hf_token = (
            os.getenv("HUGGINGFACE_API_TOKEN")
            or os.getenv("HF_API_TOKEN")
            or os.getenv("HF_TOKEN")
        )
        self.provider = provider.lower() if provider else "auto"

        if self.provider == "auto":
            if self.openai_key:
                self.provider = "openai"
            elif self.hf_token:
                self.provider = "hf"
            else:
                self.provider = "none"

        # Swap in the HF router default only when requests will actually go to
        # Hugging Face. Doing this before provider resolution (as was done
        # previously) sent the HF router id to OpenAI whenever both an OpenAI
        # key and an HF token were configured.
        if (
            self.provider == "hf"
            and self.hf_token
            and not env_model
            and model == self._DEFAULT_MODEL
        ):
            # Default router model id; you can override via MODEL_ID env var in the Space
            self.model = self._HF_ROUTER_DEFAULT_MODEL

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Generate a textual response for the given prompt.

        Args:
            prompt: User message sent to the model.
            max_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature.

        Returns:
            A string with the generated text.

        Raises:
            RuntimeError if no credentials are found or the remote call fails.
        """
        if self.provider == "openai":
            return self._generate_openai(prompt, max_tokens=max_tokens, temperature=temperature)
        if self.provider == "hf":
            return self._generate_hf(prompt, max_tokens=max_tokens, temperature=temperature)
        raise RuntimeError(
            "No API key configured for GPT wrapper. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN in the environment."
        )

    def analyze_image(self, image_path: str, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
        """
        Analyze an image using vision models (OpenAI GPT-4 Vision or Hugging Face Qwen2-VL).

        Args:
            image_path: Path to the image file
            prompt: Text prompt for analysis
            max_tokens: Maximum tokens in response
            temperature: Temperature for generation

        Returns:
            Analysis text from vision model

        Raises:
            RuntimeError if no vision model is available or if the call fails
        """
        if self.provider == "openai":
            return self._analyze_image_openai(image_path, prompt, max_tokens, temperature)
        if self.provider == "hf":
            return self._analyze_image_hf(image_path, prompt, max_tokens, temperature)
        raise RuntimeError("Image analysis requires either OpenAI API key or Hugging Face token. Set OPENAI_API_KEY or HUGGINGFACE_API_TOKEN.")

    # ------------------------------------------------------------------ #
    # Shared helpers
    # ------------------------------------------------------------------ #

    @staticmethod
    def _post_json(url: str, headers: dict, payload: dict, timeout: float):
        """POST ``payload`` as JSON and return the decoded JSON body; raises on HTTP error status."""
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _hf_router_enabled() -> bool:
        """Return False only when HF_USE_ROUTER is explicitly "0", "false" or "no"."""
        return os.getenv("HF_USE_ROUTER", "").lower() not in ("0", "false", "no")

    @staticmethod
    def _read_image(image_path: str) -> tuple:
        """
        Read an image file and return ``(base64_text, mime_type)``.

        The MIME type is guessed from the file name so PNG/WebP/etc. files are
        not mislabelled; it falls back to "image/jpeg" when the guess is
        missing or not an image type.

        Raises:
            RuntimeError if the file cannot be read.
        """
        import mimetypes  # local import: keeps this block self-contained

        try:
            with open(image_path, "rb") as image_file:
                encoded = base64.b64encode(image_file.read()).decode("utf-8")
        except Exception as e:
            raise RuntimeError(f"Failed to read image file {image_path}: {e}") from e

        guessed, _ = mimetypes.guess_type(str(image_path))
        if not guessed or not guessed.startswith("image/"):
            guessed = "image/jpeg"
        return encoded, guessed

    def _text_messages(self, prompt: str) -> list:
        """Build the system + user chat messages for a text-only request."""
        return [
            {"role": "system", "content": self._TEXT_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

    def _vision_messages(self, prompt: str, image_url: dict) -> list:
        """Build the system + user chat messages for a text-plus-image request."""
        return [
            {"role": "system", "content": self._VISION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": image_url},
                ],
            },
        ]

    @staticmethod
    def _extract_openai_text(data, label: str) -> str:
        """
        Pull the assistant text out of an OpenAI chat-completions response.

        Raises RuntimeError when ``choices`` is empty; returns ``str(data)``
        when the first choice carries no message content.
        """
        choices = data.get("choices", [])
        if not choices:
            raise RuntimeError(f"{label} returned empty choices: {data}")
        msg = choices[0].get("message", {}).get("content")
        if msg is None:
            # Some deployments return text in other fields; fall back to stringifying the response
            return str(data)
        return msg.strip()

    @staticmethod
    def _extract_router_text(data) -> str:
        """
        Pull the generated text out of an HF router (OpenAI-compatible) response.

        Looks at ``choices[0].message.content`` first, then ``text`` / ``content``
        on the choice itself. Any unexpected shape (non-dict payload, non-dict
        choice, null message, non-string content) falls back to ``str(data)``
        instead of raising.
        """
        choices = data.get("choices", []) if isinstance(data, dict) else None
        if choices and isinstance(choices, list):
            first = choices[0]
            if isinstance(first, dict):
                message = first.get("message")
                msg = message.get("content") if isinstance(message, dict) else None
                # Some router variants may return text under 'text' or 'content'
                if not msg:
                    msg = first.get("text") or first.get("content")
                if msg and isinstance(msg, str):
                    return msg.strip()
        # Fallback stringify
        return str(data)

    @staticmethod
    def _extract_hf_text(data, keys: tuple) -> str:
        """
        Pull the generated text out of a direct HF inference API response.

        ``data`` may be a list whose first element is a dict, or a dict. The
        first of ``keys`` present is returned stripped. A top-level dict with
        an "error" key raises RuntimeError; anything else is stringified.
        """
        record = data[0] if isinstance(data, list) and data else data
        if isinstance(record, dict):
            for key in keys:
                if key in record:
                    return record[key].strip()
            # Only a top-level dict is treated as an error envelope.
            if isinstance(data, dict) and "error" in data:
                raise RuntimeError(f"Hugging Face error: {data['error']}")
        return str(data)

    # ------------------------------------------------------------------ #
    # Provider-specific implementations
    # ------------------------------------------------------------------ #

    def _generate_openai(self, prompt: str, max_tokens: int, temperature: float) -> str:
        """Send a text prompt to OpenAI Chat Completions and return the reply text."""
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")

        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model,
            "messages": self._text_messages(prompt),
            "max_tokens": max_tokens,
            "temperature": float(temperature),
            "n": 1,
        }

        try:
            data = self._post_json(self._OPENAI_CHAT_URL, headers, payload, self.request_timeout)
            return self._extract_openai_text(data, "OpenAI")
        except Exception as e:
            # Surface a clear error for the calling code to handle (the app catches exceptions)
            raise RuntimeError(f"OpenAI API call failed: {e}") from e

    def _generate_hf(self, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Send a text prompt to Hugging Face and return the reply text.

        Uses the OpenAI-compatible router unless HF_USE_ROUTER disables it;
        a model id containing "openai/" or ":" always goes through the router.
        """
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN (or HF_API_TOKEN / HF_TOKEN) not set in environment.")

        model_id = self.model or ""
        # Router-style model ids force the router even if HF_USE_ROUTER is falsey.
        use_router = self._hf_router_enabled() or "openai/" in model_id or ":" in model_id

        try:
            if use_router:
                # Router (OpenAI-compatible) endpoint: accepts chat/completions style payloads
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}
                payload = {
                    "model": self.model,
                    "messages": self._text_messages(prompt),
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                    "n": 1,
                }
                data = self._post_json(self._HF_ROUTER_CHAT_URL, headers, payload, self.request_timeout)
                return self._extract_router_text(data)

            # Standard Hugging Face inference API
            headers = {"Authorization": f"Bearer {self.hf_token}"}
            payload = {
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": max_tokens,
                    "temperature": float(temperature),
                },
                "options": {"wait_for_model": True},
            }
            data = self._post_json(f"{self._HF_INFERENCE_URL}{self.model}", headers, payload, self.request_timeout)
            return self._extract_hf_text(data, ("generated_text",))
        except Exception as e:
            raise RuntimeError(f"Hugging Face API call failed: {e}") from e

    def _analyze_image_openai(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using the OpenAI vision-capable chat completions API.

        The model defaults to "gpt-4-vision-preview" and can be overridden with
        the OPENAI_VISION_MODEL_ID environment variable.
        """
        if not self.openai_key:
            raise RuntimeError("OPENAI_API_KEY not set in environment.")

        base64_image, mime_type = self._read_image(image_path)

        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json",
        }

        # NOTE(review): "gpt-4-vision-preview" is a preview model that OpenAI has
        # announced as deprecated -- confirm and set OPENAI_VISION_MODEL_ID to a
        # currently supported vision model if requests start failing.
        vision_model = os.getenv("OPENAI_VISION_MODEL_ID") or "gpt-4-vision-preview"

        payload = {
            "model": vision_model,
            "messages": self._vision_messages(
                prompt,
                {"url": f"data:{mime_type};base64,{base64_image}", "detail": "high"},
            ),
            "max_tokens": max_tokens,
            "temperature": float(temperature),
        }

        try:
            data = self._post_json(self._OPENAI_CHAT_URL, headers, payload, self._OPENAI_VISION_TIMEOUT)
            return self._extract_openai_text(data, "OpenAI Vision")
        except Exception as e:
            raise RuntimeError(f"OpenAI Vision API call failed: {e}") from e

    def _analyze_image_hf(self, image_path: str, prompt: str, max_tokens: int, temperature: float) -> str:
        """
        Analyze an image using Hugging Face vision models (like Qwen2-VL).

        The model id comes from VISION_MODEL_ID (default Qwen/Qwen2-VL-7B-Instruct).
        The router is used unless HF_USE_ROUTER is "0", "false" or "no".
        """
        if not self.hf_token:
            raise RuntimeError("HUGGINGFACE_API_TOKEN not set in environment.")

        base64_image, mime_type = self._read_image(image_path)

        # Use Qwen2-VL model for vision analysis unless overridden
        vision_model = os.getenv("VISION_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct")

        try:
            if self._hf_router_enabled():
                # Router endpoint for vision models
                headers = {"Authorization": f"Bearer {self.hf_token}", "Content-Type": "application/json"}
                payload = {
                    "model": vision_model,
                    "messages": self._vision_messages(
                        prompt,
                        {"url": f"data:{mime_type};base64,{base64_image}"},
                    ),
                    "max_tokens": max_tokens,
                    "temperature": float(temperature),
                }
                data = self._post_json(self._HF_ROUTER_CHAT_URL, headers, payload, self._HF_VISION_TIMEOUT)
                return self._extract_router_text(data)

            # Direct Hugging Face Inference API for vision models
            headers = {"Authorization": f"Bearer {self.hf_token}"}
            # For vision models, we need to send both text and image
            payload = {
                "inputs": {
                    "text": prompt,
                    "image": base64_image,
                },
                "parameters": {
                    "max_new_tokens": max_tokens,
                    "temperature": float(temperature),
                },
                "options": {"wait_for_model": True},
            }
            data = self._post_json(f"{self._HF_INFERENCE_URL}{vision_model}", headers, payload, self._HF_VISION_TIMEOUT)
            return self._extract_hf_text(data, ("generated_text", "text"))
        except Exception as e:
            raise RuntimeError(f"Hugging Face Vision API call failed: {e}") from e

# Backwards-compatible factory in case caller expects a function or attribute
def GPTOSSWrapperFactory(model: Optional[str] = None, provider: Optional[str] = None):
    """
    Build a GPTOSSWrapper, filling in defaults for omitted arguments.

    Args:
        model: Model name; a missing or empty value becomes "gpt-oss-120".
        provider: Provider name; a missing or empty value becomes "auto".

    Returns:
        A new GPTOSSWrapper instance.
    """
    chosen_model = model if model else "gpt-oss-120"
    chosen_provider = provider if provider else "auto"
    return GPTOSSWrapper(model=chosen_model, provider=chosen_provider)