#!/usr/bin/env python3
"""
Phase 0: HuggingFace Inference API Vision Validation
Author: @mangubee
Date: 2026-01-07

Tests HF Inference API with vision models to validate multimodal support
BEFORE implementation. Decision gate: Only proceed to Phase 1 if ≥1 model works.

Models to test (smallest → largest):
1. microsoft/Phi-3.5-vision-instruct (3.8B)
2. meta-llama/Llama-3.2-11B-Vision-Instruct (11B)
3. Qwen/Qwen2-VL-72B-Instruct (72B)
"""

import os
import base64
import logging
from pathlib import Path
from typing import Dict, Any, Optional

from huggingface_hub import InferenceClient

# Load environment variables from .env file (HF_TOKEN, etc.) before reading them.
from dotenv import load_dotenv

load_dotenv()

# ============================================================================
# CONFIG
# ============================================================================

# Read AFTER load_dotenv() so a .env-supplied token is picked up.
HF_TOKEN = os.getenv("HF_TOKEN")
TEST_IMAGE_PATH = "test/fixtures/test_image_real.png"  # Real image for better testing

# Models to test (user specified with provider routing via the ":provider" suffix)
VISION_MODELS = [
    "google/gemma-3-27b-it:scaleway",
]

# Test questions (progressive complexity)
TEST_QUESTIONS = [
    "What is in this image?",
    "Describe the image in detail.",
    "What colors do you see?",
]

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# ============================================================================
# Helper Functions
# ============================================================================


def encode_image_to_base64(image_path: str) -> str:
    """Encode the image file at *image_path* to a base64 (utf-8) string.

    Raises:
        OSError: if the file cannot be opened/read.
    """
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def get_test_image() -> str:
    """Return TEST_IMAGE_PATH after verifying the file exists.

    Raises:
        FileNotFoundError: if the fixture image is missing.
    """
    path = Path(TEST_IMAGE_PATH)
    if not path.exists():
        raise FileNotFoundError(f"Test image not found: {TEST_IMAGE_PATH}")
    return TEST_IMAGE_PATH
# ============================================================================
# Test Functions
# ============================================================================


def _chat_vision_test(model: str, image_url: str, question: str,
                      fmt: str) -> Dict[str, Any]:
    """Shared driver: ask *question* about the image at *image_url* via chat_completion.

    Args:
        model: Model name, optionally with ":provider" routing suffix.
        image_url: Image reference (data: URL, file:// URL, or http URL).
        question: Question to ask about the image.
        fmt: Label recorded in the result's "format" field.

    Returns:
        dict with keys model/format/question/status/response/error; never raises —
        any exception is captured into status="failed" + error.
    """
    result = {
        "model": model,
        "format": fmt,
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        client = InferenceClient(token=HF_TOKEN)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ]
        response = client.chat_completion(
            model=model,
            messages=messages,
            max_tokens=500,
        )
        result["status"] = "success"
        result["response"] = response.choices[0].message.content
        logger.info(f"✓ {model} ({fmt}): Success")
    except Exception as e:
        # Intentionally broad: this is a probe harness — every failure mode
        # (network, auth, unsupported model/format) is recorded, not raised.
        result["status"] = "failed"
        result["error"] = str(e)
        logger.error(f"✗ {model} ({fmt}): {e}")
    return result


def test_vision_model_with_base64(model: str, image_b64: str, question: str) -> Dict[str, Any]:
    """
    Test HF Inference API with a base64-encoded image (data: URL).

    Args:
        model: Model name (e.g., "microsoft/Phi-3.5-vision-instruct")
        image_b64: Base64-encoded image string
        question: Question to ask about the image

    Returns:
        dict: Test result with status, response, error
    """
    # NOTE: the data URL declares image/jpeg regardless of the actual file type
    # (the fixture is a .png); most providers sniff the real format anyway.
    data_url = f"data:image/jpeg;base64,{image_b64}"
    return _chat_vision_test(model, data_url, question, "base64")


def test_vision_model_with_url(model: str, image_path: str, question: str) -> Dict[str, Any]:
    """
    Test HF Inference API with a local file path converted to a file:// URL.

    A remote API generally cannot fetch a client-local file:// URL — this
    format is expected to fail for hosted providers; it is probed anyway to
    document the behavior.

    Args:
        model: Model name
        image_path: Path to local image file
        question: Question to ask

    Returns:
        dict: Test result
    """
    file_url = f"file://{Path(image_path).absolute()}"
    return _chat_vision_test(model, file_url, question, "file_path")


def test_ocr_model(model: str, image_path: str) -> Dict[str, Any]:
    """
    Test an OCR model using the image-to-text endpoint (not chat completion).

    For models like DeepSeek-OCR that are image-to-text, not chat models.

    Args:
        model: Model name
        image_path: Path to local image file

    Returns:
        dict: Test result with status, response, error (never raises).
    """
    result = {
        "model": model,
        "format": "image_to_text",
        "question": "OCR/Text extraction",
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        client = InferenceClient(model=model, token=HF_TOKEN)
        with open(image_path, "rb") as f:
            image_data = f.read()
        response = client.image_to_text(image=image_data)
        result["status"] = "success"
        result["response"] = str(response)
        logger.info(f"✓ {model} (image_to_text): Success")
    except Exception as e:
        # Broad by design — see _chat_vision_test.
        result["status"] = "failed"
        result["error"] = str(e)
        logger.error(f"✗ {model} (image_to_text): {e}")
    return result


# ============================================================================
# Main Test Execution
# ============================================================================


def run_phase0_validation() -> Dict[str, Any]:
    """
    Run Phase 0 validation: test every model in VISION_MODELS.

    Chat models are probed with the base64 format first; the file-path format
    is only tried if base64 succeeded. Models with "ocr" in the name are probed
    via the image-to-text endpoint instead.

    Returns:
        dict: Summary with totals, working models/formats, and per-test results.

    Raises:
        ValueError: if HF_TOKEN is not set.
        FileNotFoundError: if the test image fixture is missing.
    """
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN environment variable not set")

    # Get test image
    image_path = get_test_image()
    image_b64 = encode_image_to_base64(image_path)

    logger.info(f"Test image: {image_path}")
    logger.info(f"Image size: {len(image_b64)} chars (base64)")
    # At most two formats are probed per chat model (base64, then file_path).
    logger.info(f"Testing {len(VISION_MODELS)} models with up to 2 formats each")
    logger.info("=" * 60)

    all_results = []

    for model in VISION_MODELS:
        logger.info(f"\nTesting model: {model}")
        logger.info("-" * 60)

        model_results = []

        # Heuristic: models with "ocr" in the name are image-to-text, not chat.
        is_ocr_model = "ocr" in model.lower()

        if is_ocr_model:
            model_results.append(test_ocr_model(model, image_path))
        else:
            # Test with base64 (most likely to work for chat models).
            for question in TEST_QUESTIONS[:1]:  # Just 1 question for speed
                result = test_vision_model_with_base64(model, image_b64, question)
                model_results.append(result)

                # If base64 works, also probe the file-path format.
                if result["status"] == "success":
                    model_results.append(
                        test_vision_model_with_url(model, image_path, question)
                    )
                    # Don't test other questions if the first worked.
                    break

        all_results.extend(model_results)

    # Compile summary; sorted() makes the lists deterministic (set order isn't).
    summary = {
        "total_tests": len(all_results),
        "successful": sum(1 for r in all_results if r["status"] == "success"),
        "failed": sum(1 for r in all_results if r["status"] == "failed"),
        "working_models": sorted(
            {r["model"] for r in all_results if r["status"] == "success"}
        ),
        "working_formats": sorted(
            {r["format"] for r in all_results if r["status"] == "success"}
        ),
        "results": all_results,
    }
    return summary


def print_summary(summary: Dict[str, Any]) -> None:
    """Log the test summary, the GO/NO-GO decision gate, and detailed results."""
    logger.info("\n" + "=" * 60)
    logger.info("PHASE 0 VALIDATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"\nTotal tests: {summary['total_tests']}")
    logger.info(f"✓ Successful: {summary['successful']}")
    logger.info(f"✗ Failed: {summary['failed']}")
    logger.info(f"\nWorking models: {summary['working_models']}")
    logger.info(f"Working formats: {summary['working_formats']}")

    # Decision gate
    logger.info("\n" + "=" * 60)
    logger.info("DECISION GATE")
    logger.info("=" * 60)
    if summary['successful'] > 0:
        logger.info("\n✅ GO - Proceed to Phase 1 (Implementation)")
        # NOTE(review): [0] is merely the alphabetically-first working model,
        # not necessarily the smallest — confirm before relying on it.
        logger.info(f"Recommended model: {summary['working_models'][0]} (smallest working)")
        logger.info(f"Use format: {summary['working_formats'][0]}")
    else:
        logger.info("\n❌ NO-GO - Pivot to backup options")
        logger.info("Backup options:")
        logger.info("  - Option C: HF Spaces deployment (custom endpoint)")
        logger.info("  - Option D: Local transformers library (no API)")
        logger.info("  - Option E: Hybrid (HF text + Gemini/Claude vision only)")

    # Print detailed results
    logger.info("\n" + "=" * 60)
    logger.info("DETAILED RESULTS")
    logger.info("=" * 60)
    for result in summary['results']:
        logger.info(f"\nModel: {result['model']}")
        logger.info(f"Format: {result['format']}")
        logger.info(f"Status: {result['status']}")
        if result['error']:
            logger.info(f"Error: {result['error']}")
        if result['response']:
            logger.info(f"Response: {result['response'][:200]}...")


if __name__ == "__main__":
    print("\n" + "=" * 60)
    print("PHASE 0: HF INFERENCE API VISION VALIDATION")
    print("=" * 60)
    print(f"HF Token: {'Set' if HF_TOKEN else 'NOT SET'}")
    print(f"Test image: {TEST_IMAGE_PATH}")
    print("=" * 60 + "\n")

    try:
        summary = run_phase0_validation()
        print_summary(summary)

        # Export results for documentation (timestamped JSON).
        import json
        from datetime import datetime

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = Path("user_io/result_ServerApp")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"phase0_vision_validation_{timestamp}.json"
        with open(output_file, "w") as f:
            json.dump(summary, f, indent=2)
        logger.info(f"\n✓ Results exported to: {output_file}")
    except Exception as e:
        logger.error(f"\nPhase 0 validation failed: {e}")
        raise