File size: 11,514 Bytes
630f609 e7b4937 630f609 3dcf523 630f609 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 |
#!/usr/bin/env python3
"""
Phase 0: HuggingFace Inference API Vision Validation
Author: @mangubee
Date: 2026-01-07
Tests HF Inference API with vision models to validate multimodal support BEFORE
implementation. Decision gate: Only proceed to Phase 1 if β₯1 model works.
Models to test (smallest β largest):
1. microsoft/Phi-3.5-vision-instruct (3.8B)
2. meta-llama/Llama-3.2-11B-Vision-Instruct (11B)
3. Qwen/Qwen2-VL-72B-Instruct (72B)
"""
import os
import base64
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from huggingface_hub import InferenceClient
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
# ============================================================================
# CONFIG
# ============================================================================
# HF API token, loaded from the environment (populated from .env above).
# None when unset; run_phase0_validation() raises in that case.
HF_TOKEN: Optional[str] = os.getenv("HF_TOKEN")
TEST_IMAGE_PATH = "test/fixtures/test_image_real.png"  # Real image for better testing
# Models to test (user specified with provider routing)
# NOTE: the ":scaleway" suffix selects the inference provider route.
VISION_MODELS = [
    "google/gemma-3-27b-it:scaleway",
]
# Test questions (progressive complexity)
# Only the first question is actually sent per model (see run_phase0_validation).
TEST_QUESTIONS = [
    "What is in this image?",
    "Describe the image in detail.",
    "What colors do you see?",
]
# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger used by all test helpers below.
logger = logging.getLogger(__name__)
def encode_image_to_base64(image_path: str) -> str:
    """Read the file at *image_path* and return its base64 text (UTF-8)."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
def get_test_image() -> str:
    """Return TEST_IMAGE_PATH after verifying the file exists.

    Raises:
        FileNotFoundError: if the configured test image is missing on disk.
    """
    if Path(TEST_IMAGE_PATH).exists():
        return TEST_IMAGE_PATH
    raise FileNotFoundError(f"Test image not found: {TEST_IMAGE_PATH}")
# ============================================================================
# Test Functions
# ============================================================================
def test_vision_model_with_base64(
    model: str,
    image_b64: str,
    question: str,
    mime_type: str = "image/jpeg",
) -> Dict[str, Any]:
    """
    Test HF Inference API with a base64-encoded image sent as a data URL.

    Args:
        model: Model name (e.g., "microsoft/Phi-3.5-vision-instruct")
        image_b64: Base64-encoded image string
        question: Question to ask about the image
        mime_type: MIME type used in the data URL. Defaults to "image/jpeg"
            for backward compatibility, but callers encoding a PNG (such as
            the configured TEST_IMAGE_PATH fixture) should pass "image/png".

    Returns:
        dict: Test result with status, response, error. Never raises: any
        client/network failure is captured in the "error" field.
    """
    result: Dict[str, Any] = {
        "model": model,
        "format": "base64",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        client = InferenceClient(token=HF_TOKEN)
        # OpenAI-style multimodal message: text part + image part as data URL.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            # Generalized: MIME type was previously hard-coded
                            # to image/jpeg even when the payload was a PNG.
                            "url": f"data:{mime_type};base64,{image_b64}"
                        },
                    },
                ],
            }
        ]
        response = client.chat_completion(
            model=model,
            messages=messages,
            max_tokens=500,
        )
        result["status"] = "success"
        result["response"] = response.choices[0].message.content
        logger.info(f"β {model} (base64): Success")
    except Exception as e:
        # Broad catch is deliberate: this is a probe script that records
        # failures instead of aborting the whole validation run.
        result["status"] = "failed"
        result["error"] = str(e)
        logger.error(f"β {model} (base64): {e}")
    return result
def test_vision_model_with_url(model: str, image_path: str, question: str) -> Dict[str, Any]:
    """
    Probe the HF Inference API with a file:// URL built from a local path.

    NOTE(review): a hosted inference endpoint generally cannot fetch a
    client-local file:// URL, so this format is expected to fail for remote
    models — confirm whether it is kept deliberately as a negative probe.

    Args:
        model: Model name.
        image_path: Path to local image file.
        question: Question to ask.

    Returns:
        dict: Test result with status, response, error. Never raises; any
        failure is recorded in the "error" field.
    """
    outcome: Dict[str, Any] = {
        "model": model,
        "format": "file_path",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        api = InferenceClient(token=HF_TOKEN)
        local_url = f"file://{Path(image_path).absolute()}"
        payload = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": local_url}},
                ],
            }
        ]
        reply = api.chat_completion(model=model, messages=payload, max_tokens=500)
        outcome["status"] = "success"
        outcome["response"] = reply.choices[0].message.content
        logger.info(f"β {model} (file_path): Success")
    except Exception as exc:
        outcome["status"] = "failed"
        outcome["error"] = str(exc)
        logger.error(f"β {model} (file_path): {exc}")
    return outcome
def test_ocr_model(model: str, image_path: str) -> Dict[str, Any]:
    """
    Probe an OCR-style model through the image-to-text endpoint.

    Intended for image-to-text models (e.g. DeepSeek-OCR) that do not speak
    the chat-completion protocol.

    Args:
        model: Model name.
        image_path: Path to local image file.

    Returns:
        dict: Test result with status, response, error. Never raises; any
        failure is recorded in the "error" field.
    """
    outcome: Dict[str, Any] = {
        "model": model,
        "format": "image_to_text",
        "question": "OCR/Text extraction",
        "status": "unknown",
        "response": None,
        "error": None,
    }
    try:
        # Model is bound at client construction for the image_to_text task.
        api = InferenceClient(model=model, token=HF_TOKEN)
        raw = Path(image_path).read_bytes()
        caption = api.image_to_text(image=raw)
        outcome["status"] = "success"
        outcome["response"] = str(caption)
        logger.info(f"β {model} (image_to_text): Success")
    except Exception as exc:
        outcome["status"] = "failed"
        outcome["error"] = str(exc)
        logger.error(f"β {model} (image_to_text): {exc}")
    return outcome
# ============================================================================
# Main Test Execution
# ============================================================================
def run_phase0_validation() -> Dict[str, Any]:
    """
    Run Phase 0 validation across VISION_MODELS.

    Models whose name contains "ocr" go through the image-to-text endpoint;
    chat vision models are tried with a base64 data URL first and — only if
    that succeeds — with a file:// URL as well. Only the first entry of
    TEST_QUESTIONS is used, for speed.

    Returns:
        dict: Summary with test totals, working models/formats, and the raw
        per-test result dicts.

    Raises:
        ValueError: if HF_TOKEN is not set.
        FileNotFoundError: if the test image is missing (via get_test_image).
    """
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN environment variable not set")
    # Get test image
    image_path = get_test_image()
    image_b64 = encode_image_to_base64(image_path)
    logger.info(f"Test image: {image_path}")
    logger.info(f"Image size: {len(image_b64)} chars (base64)")
    # Bug fix: the previous message claimed "3 formats each", but at most two
    # formats (base64 + file path) are ever attempted per chat model.
    logger.info(f"Testing {len(VISION_MODELS)} models (up to 2 formats each)")
    logger.info("=" * 60)
    all_results = []
    for model in VISION_MODELS:
        logger.info(f"\nTesting model: {model}")
        logger.info("-" * 60)
        model_results = []
        # OCR models (name contains "ocr") are image-to-text, not chat models.
        if "ocr" in model.lower():
            model_results.append(test_ocr_model(model, image_path))
        elif TEST_QUESTIONS:  # guard: no questions configured -> nothing to ask
            # Only the first question is used (simplified from a one-iteration
            # loop that always broke after the first pass).
            question = TEST_QUESTIONS[0]
            result = test_vision_model_with_base64(model, image_b64, question)
            model_results.append(result)
            # If base64 works, also probe the file:// format for comparison.
            if result["status"] == "success":
                model_results.append(
                    test_vision_model_with_url(model, image_path, question)
                )
        all_results.extend(model_results)
    # Compile summary for the decision gate.
    summary = {
        "total_tests": len(all_results),
        "successful": sum(1 for r in all_results if r["status"] == "success"),
        "failed": sum(1 for r in all_results if r["status"] == "failed"),
        "working_models": list(set(r["model"] for r in all_results if r["status"] == "success")),
        "working_formats": list(set(r["format"] for r in all_results if r["status"] == "success")),
        "results": all_results,
    }
    return summary
def print_summary(summary: Dict[str, Any]) -> None:
    """Log the test summary, the GO/NO-GO decision gate, and detailed results.

    Args:
        summary: Result dict produced by run_phase0_validation().
    """
    logger.info("\n" + "=" * 60)
    logger.info("PHASE 0 VALIDATION SUMMARY")
    logger.info("=" * 60)
    logger.info(f"\nTotal tests: {summary['total_tests']}")
    logger.info(f"β Successful: {summary['successful']}")
    logger.info(f"β Failed: {summary['failed']}")
    logger.info(f"\nWorking models: {summary['working_models']}")
    logger.info(f"Working formats: {summary['working_formats']}")
    # Decision gate: any single working model/format means GO.
    logger.info("\n" + "=" * 60)
    logger.info("DECISION GATE")
    logger.info("=" * 60)
    if summary['successful'] > 0:
        # Bug fix: this literal previously contained a raw newline inside the
        # quotes (mojibake split), which is a SyntaxError; rejoined on one line.
        logger.info("\nβ GO - Proceed to Phase 1 (Implementation)")
        # NOTE(review): working_models is built from a set, so element order is
        # arbitrary — "[0] is the smallest working" is not guaranteed; confirm.
        logger.info(f"Recommended model: {summary['working_models'][0]} (smallest working)")
        logger.info(f"Use format: {summary['working_formats'][0]}")
    else:
        logger.info("\nβ NO-GO - Pivot to backup options")
        logger.info("Backup options:")
        logger.info("  - Option C: HF Spaces deployment (custom endpoint)")
        logger.info("  - Option D: Local transformers library (no API)")
        logger.info("  - Option E: Hybrid (HF text + Gemini/Claude vision only)")
    # Print detailed results
    logger.info("\n" + "=" * 60)
    logger.info("DETAILED RESULTS")
    logger.info("=" * 60)
    for result in summary['results']:
        logger.info(f"\nModel: {result['model']}")
        logger.info(f"Format: {result['format']}")
        logger.info(f"Status: {result['status']}")
        if result['error']:
            logger.info(f"Error: {result['error']}")
        if result['response']:
            # Truncate long model replies to keep the log readable.
            logger.info(f"Response: {result['response'][:200]}...")
if __name__ == "__main__":
    # Banner: show config state before running any network calls.
    print("\n" + "=" * 60)
    print("PHASE 0: HF INFERENCE API VISION VALIDATION")
    print("=" * 60)
    print(f"HF Token: {'Set' if HF_TOKEN else 'NOT SET'}")
    print(f"Test image: {TEST_IMAGE_PATH}")
    print("=" * 60 + "\n")
    try:
        summary = run_phase0_validation()
        print_summary(summary)
        # Export results for documentation
        import json
        from datetime import datetime
        # Timestamped filename so repeated runs never overwrite each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = Path("user_io/result_ServerApp")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"phase0_vision_validation_{timestamp}.json"
        with open(output_file, "w") as f:
            json.dump(summary, f, indent=2)
        logger.info(f"\nβ Results exported to: {output_file}")
    except Exception as e:
        # Log and re-raise so the process exits non-zero on any failure.
        logger.error(f"\nPhase 0 validation failed: {e}")
        raise
|