File size: 11,514 Bytes
630f609 e7b4937 630f609 3dcf523 630f609 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 |
#!/usr/bin/env python3
"""
Phase 0: HuggingFace Inference API Vision Validation
Author: @mangubee
Date: 2026-01-07
Tests HF Inference API with vision models to validate multimodal support BEFORE
implementation. Decision gate: Only proceed to Phase 1 if β₯1 model works.
Models to test (smallest β largest):
1. microsoft/Phi-3.5-vision-instruct (3.8B)
2. meta-llama/Llama-3.2-11B-Vision-Instruct (11B)
3. Qwen/Qwen2-VL-72B-Instruct (72B)
"""
import os
import base64
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from huggingface_hub import InferenceClient
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
# ============================================================================
# CONFIG
# ============================================================================
# HF API token, loaded from the environment (populated from .env above).
# None when unset; run_phase0_validation() raises in that case.
HF_TOKEN: Optional[str] = os.getenv("HF_TOKEN")
TEST_IMAGE_PATH = "test/fixtures/test_image_real.png"  # Real image for better testing
# Models to test (user specified with provider routing)
# NOTE: the ":scaleway" suffix selects the inference provider route.
VISION_MODELS = [
    "google/gemma-3-27b-it:scaleway",
]
# Test questions (progressive complexity)
# Only the first question is actually sent per model (see run_phase0_validation).
TEST_QUESTIONS = [
    "What is in this image?",
    "Describe the image in detail.",
    "What colors do you see?",
]
# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger used by all test helpers below.
logger = logging.getLogger(__name__)
def encode_image_to_base64(image_path: str) -> str:
    """Read the file at *image_path* and return its base64 text (UTF-8)."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
def get_test_image() -> str:
    """Return TEST_IMAGE_PATH after verifying the file exists.

    Raises:
        FileNotFoundError: if the configured test image is missing on disk.
    """
    if Path(TEST_IMAGE_PATH).exists():
        return TEST_IMAGE_PATH
    raise FileNotFoundError(f"Test image not found: {TEST_IMAGE_PATH}")
# ============================================================================
# Test Functions
# ============================================================================
def test_vision_model_with_base64(
    model: str,
    image_b64: str,
    question: str,
    mime_type: str = "image/jpeg",
) -> Dict[str, Any]:
    """
    Test HF Inference API with a base64-encoded image sent as a data URL.

    Args:
        model: Model name (e.g., "microsoft/Phi-3.5-vision-instruct")
        image_b64: Base64-encoded image string
        question: Question to ask about the image
        mime_type: MIME type used in the data URL. Defaults to "image/jpeg"
            for backward compatibility, but callers encoding a PNG (such as
            the configured TEST_IMAGE_PATH fixture) should pass "image/png".

    Returns:
        dict: Test result with status, response, error. Never raises: any
        client/network failure is captured in the "error" field.
    """
    result: Dict[str, Any] = {
        "model": model,
        "format": "base64",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        client = InferenceClient(token=HF_TOKEN)
        # OpenAI-style multimodal message: text part + image part as data URL.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            # Generalized: MIME type was previously hard-coded
                            # to image/jpeg even when the payload was a PNG.
                            "url": f"data:{mime_type};base64,{image_b64}"
                        },
                    },
                ],
            }
        ]
        response = client.chat_completion(
            model=model,
            messages=messages,
            max_tokens=500,
        )
        result["status"] = "success"
        result["response"] = response.choices[0].message.content
        logger.info(f"β {model} (base64): Success")
    except Exception as e:
        # Broad catch is deliberate: this is a probe script that records
        # failures instead of aborting the whole validation run.
        result["status"] = "failed"
        result["error"] = str(e)
        logger.error(f"β {model} (base64): {e}")
    return result
def test_vision_model_with_url(model: str, image_path: str, question: str) -> Dict[str, Any]:
    """
    Probe the HF Inference API with a file:// URL built from a local path.

    NOTE(review): a hosted inference endpoint generally cannot fetch a
    client-local file:// URL, so this format is expected to fail for remote
    models — confirm whether it is kept deliberately as a negative probe.

    Args:
        model: Model name.
        image_path: Path to local image file.
        question: Question to ask.

    Returns:
        dict: Test result with status, response, error. Never raises; any
        failure is recorded in the "error" field.
    """
    outcome: Dict[str, Any] = {
        "model": model,
        "format": "file_path",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        api = InferenceClient(token=HF_TOKEN)
        local_url = f"file://{Path(image_path).absolute()}"
        payload = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": local_url}},
                ],
            }
        ]
        reply = api.chat_completion(model=model, messages=payload, max_tokens=500)
        outcome["status"] = "success"
        outcome["response"] = reply.choices[0].message.content
        logger.info(f"β {model} (file_path): Success")
    except Exception as exc:
        outcome["status"] = "failed"
        outcome["error"] = str(exc)
        logger.error(f"β {model} (file_path): {exc}")
    return outcome
def test_ocr_model(model: str, image_path: str) -> Dict[str, Any]:
    """
    Probe an OCR-style model through the image-to-text endpoint.

    Intended for image-to-text models (e.g. DeepSeek-OCR) that do not speak
    the chat-completion protocol.

    Args:
        model: Model name.
        image_path: Path to local image file.

    Returns:
        dict: Test result with status, response, error. Never raises; any
        failure is recorded in the "error" field.
    """
    outcome: Dict[str, Any] = {
        "model": model,
        "format": "image_to_text",
        "question": "OCR/Text extraction",
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        # Model is bound at client construction for the image_to_text task.
        api = InferenceClient(model=model, token=HF_TOKEN)
        raw = Path(image_path).read_bytes()
        caption = api.image_to_text(image=raw)
        outcome["status"] = "success"
        outcome["response"] = str(caption)
        logger.info(f"β {model} (image_to_text): Success")
    except Exception as exc:
        outcome["status"] = "failed"
        outcome["error"] = str(exc)
        logger.error(f"β {model} (image_to_text): {exc}")
    return outcome
# ============================================================================
# Main Test Execution
# ============================================================================
def run_phase0_validation() -> Dict[str, Any]:
    """
    Run Phase 0 validation across VISION_MODELS.

    Models whose name contains "ocr" go through the image-to-text endpoint;
    chat vision models are tried with a base64 data URL first and — only if
    that succeeds — with a file:// URL as well. Only the first entry of
    TEST_QUESTIONS is used, for speed.

    Returns:
        dict: Summary with test totals, working models/formats, and the raw
        per-test result dicts.

    Raises:
        ValueError: if HF_TOKEN is not set.
        FileNotFoundError: if the test image is missing (via get_test_image).
    """
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN environment variable not set")
    # Get test image
    image_path = get_test_image()
    image_b64 = encode_image_to_base64(image_path)
    logger.info(f"Test image: {image_path}")
    logger.info(f"Image size: {len(image_b64)} chars (base64)")
    # Bug fix: the previous message claimed "3 formats each", but at most two
    # formats (base64 + file path) are ever attempted per chat model.
    logger.info(f"Testing {len(VISION_MODELS)} models (up to 2 formats each)")
    logger.info("=" * 60)
    all_results = []
    for model in VISION_MODELS:
        logger.info(f"\nTesting model: {model}")
        logger.info("-" * 60)
        model_results = []
        # OCR models (name contains "ocr") are image-to-text, not chat models.
        if "ocr" in model.lower():
            model_results.append(test_ocr_model(model, image_path))
        elif TEST_QUESTIONS:  # guard: no questions configured -> nothing to ask
            # Only the first question is used (simplified from a one-iteration
            # loop that always broke after the first pass).
            question = TEST_QUESTIONS[0]
            result = test_vision_model_with_base64(model, image_b64, question)
            model_results.append(result)
            # If base64 works, also probe the file:// format for comparison.
            if result["status"] == "success":
                model_results.append(
                    test_vision_model_with_url(model, image_path, question)
                )
        all_results.extend(model_results)
    # Compile summary for the decision gate.
    summary = {
        "total_tests": len(all_results),
        "successful": sum(1 for r in all_results if r["status"] == "success"),
        "failed": sum(1 for r in all_results if r["status"] == "failed"),
        "working_models": list(set(r["model"] for r in all_results if r["status"] == "success")),
        "working_formats": list(set(r["format"] for r in all_results if r["status"] == "success")),
        "results": all_results,
    }
    return summary
def print_summary(summary: Dict[str, Any]) -> None:
    """Log the test summary, the GO/NO-GO decision gate, and detailed results.

    Args:
        summary: Result dict produced by run_phase0_validation().
    """
    logger.info("\n" + "=" * 60)
    logger.info("PHASE 0 VALIDATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"\nTotal tests: {summary['total_tests']}")
    logger.info(f"β Successful: {summary['successful']}")
    logger.info(f"β Failed: {summary['failed']}")
    logger.info(f"\nWorking models: {summary['working_models']}")
    logger.info(f"Working formats: {summary['working_formats']}")
    # Decision gate: any single working model/format means GO.
    logger.info("\n" + "=" * 60)
    logger.info("DECISION GATE")
    logger.info("=" * 60)
    if summary['successful'] > 0:
        # Bug fix: this literal previously contained a raw newline inside the
        # quotes (mojibake split), which is a SyntaxError; rejoined on one line.
        logger.info("\nβ GO - Proceed to Phase 1 (Implementation)")
        # NOTE(review): working_models is built from a set, so element order is
        # arbitrary — "[0] is the smallest working" is not guaranteed; confirm.
        logger.info(f"Recommended model: {summary['working_models'][0]} (smallest working)")
        logger.info(f"Use format: {summary['working_formats'][0]}")
    else:
        logger.info("\nβ NO-GO - Pivot to backup options")
        logger.info("Backup options:")
        logger.info("  - Option C: HF Spaces deployment (custom endpoint)")
        logger.info("  - Option D: Local transformers library (no API)")
        logger.info("  - Option E: Hybrid (HF text + Gemini/Claude vision only)")
    # Print detailed results
    logger.info("\n" + "=" * 60)
    logger.info("DETAILED RESULTS")
    logger.info("=" * 60)
    for result in summary['results']:
        logger.info(f"\nModel: {result['model']}")
        logger.info(f"Format: {result['format']}")
        logger.info(f"Status: {result['status']}")
        if result['error']:
            logger.info(f"Error: {result['error']}")
        if result['response']:
            # Truncate long model replies to keep the log readable.
            logger.info(f"Response: {result['response'][:200]}...")
if __name__ == "__main__":
    # Banner: show config state before running any network calls.
    print("\n" + "=" * 60)
    print("PHASE 0: HF INFERENCE API VISION VALIDATION")
    print("=" * 60)
    print(f"HF Token: {'Set' if HF_TOKEN else 'NOT SET'}")
    print(f"Test image: {TEST_IMAGE_PATH}")
    print("=" * 60 + "\n")
    try:
        summary = run_phase0_validation()
        print_summary(summary)
        # Export results for documentation
        import json
        from datetime import datetime
        # Timestamped filename so repeated runs never overwrite each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = Path("user_io/result_ServerApp")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"phase0_vision_validation_{timestamp}.json"
        with open(output_file, "w") as f:
            json.dump(summary, f, indent=2)
        logger.info(f"\nβ Results exported to: {output_file}")
    except Exception as e:
        # Log and re-raise so the process exits non-zero on any failure.
        logger.error(f"\nPhase 0 validation failed: {e}")
        raise
|