|
|
|
|
|
""" |
|
|
Phase 0: HuggingFace Inference API Vision Validation |
|
|
Author: @mangubee |
|
|
Date: 2026-01-07 |
|
|
|
|
|
Tests HF Inference API with vision models to validate multimodal support BEFORE |
|
|
implementation. Decision gate: Only proceed to Phase 1 if ≥1 model works. |
|
|
|
|
|
Candidate models (smallest → largest):

1. microsoft/Phi-3.5-vision-instruct (3.8B)

2. meta-llama/Llama-3.2-11B-Vision-Instruct (11B)

3. Qwen/Qwen2-VL-72B-Instruct (72B)

Note: the VISION_MODELS list below currently targets
google/gemma-3-27b-it (Scaleway endpoint) rather than the candidates above.
|
|
""" |
|
|
|
|
|
import os |
|
|
import base64 |
|
|
import logging |
|
|
from pathlib import Path |
|
|
from typing import Dict, Any, Optional |
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
|
|
|
from dotenv import load_dotenv |
|
|
load_dotenv() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HF Inference API token, read from the environment (populated from .env by
# the load_dotenv() call above). Checked for presence in run_phase0_validation.
HF_TOKEN = os.getenv("HF_TOKEN")

# Local image fixture used for every vision test (relative to the CWD).
TEST_IMAGE_PATH = "test/fixtures/test_image_real.png"

# Models to probe. NOTE(review): the module docstring lists Phi-3.5 /
# Llama-3.2 / Qwen2-VL as the candidates, but only the gemma-3 Scaleway
# endpoint is currently configured — confirm which list is authoritative.
VISION_MODELS = [
    "google/gemma-3-27b-it:scaleway",
]

# Questions to ask about the test image. Only the first one is actually used
# per model (see run_phase0_validation: TEST_QUESTIONS[:1] with a break).
TEST_QUESTIONS = [
    "What is in this image?",
    "Describe the image in detail.",
    "What colors do you see?",
]

# Module-wide logging: INFO level with timestamped, per-logger formatting.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def encode_image_to_base64(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes as a base64 string."""
    raw = Path(image_path).read_bytes()
    return base64.b64encode(raw).decode("utf-8")
|
|
|
|
|
|
|
|
def get_test_image() -> str:
    """Return TEST_IMAGE_PATH after confirming the fixture exists on disk.

    Raises:
        FileNotFoundError: if the configured test image is missing.
    """
    if not Path(TEST_IMAGE_PATH).exists():
        raise FileNotFoundError(f"Test image not found: {TEST_IMAGE_PATH}")
    return TEST_IMAGE_PATH
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_vision_model_with_base64(model: str, image_b64: str, question: str) -> Dict[str, Any]:
    """
    Test HF Inference API with a base64-encoded image (chat-completion format).

    Args:
        model: Model name (e.g., "microsoft/Phi-3.5-vision-instruct")
        image_b64: Base64-encoded image string
        question: Question to ask about the image

    Returns:
        dict: Test result with keys model/format/question/status/response/error.
        status is "success" or "failed"; error holds the stringified exception
        on failure. Never raises — all API errors are captured in the result.
    """
    result: Dict[str, Any] = {
        "model": model,
        "format": "base64",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }

    try:
        client = InferenceClient(token=HF_TOKEN)

        # Bug fix: the data URL previously hard-coded image/jpeg even though
        # the project fixture (test_image_real.png) is a PNG. Sniff the base64
        # prefix instead: the PNG signature b"\x89PNG\r\n\x1a\n" always encodes
        # to a base64 string starting with "iVBOR". Fall back to jpeg otherwise.
        mime = "image/png" if image_b64.startswith("iVBOR") else "image/jpeg"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime};base64,{image_b64}"
                        },
                    },
                ],
            }
        ]

        response = client.chat_completion(
            model=model,
            messages=messages,
            max_tokens=500,
        )

        result["status"] = "success"
        result["response"] = response.choices[0].message.content
        logger.info(f"✓ {model} (base64): Success")

    except Exception as e:
        result["status"] = "failed"
        result["error"] = str(e)
        logger.error(f"✗ {model} (base64): {e}")

    return result
|
|
|
|
|
|
|
|
def test_vision_model_with_url(model: str, image_path: str, question: str) -> Dict[str, Any]:
    """
    Probe the HF Inference API with a local image referenced via a file:// URL.

    NOTE(review): a remote inference endpoint generally cannot dereference a
    local file:// URL; whether the client resolves it locally is exactly what
    this probe measures — confirm against huggingface_hub behavior.

    Args:
        model: Model name
        image_path: Path to local image file
        question: Question to ask

    Returns:
        dict: Test result (model/format/question/status/response/error).
        Never raises — API errors are recorded under "error".
    """
    outcome: Dict[str, Any] = {
        "model": model,
        "format": "file_path",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }

    try:
        client = InferenceClient(token=HF_TOKEN)

        # Convert the local path into an absolute file:// URL.
        file_url = f"file://{Path(image_path).absolute()}"

        content = [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {"url": file_url}},
        ]
        reply = client.chat_completion(
            model=model,
            messages=[{"role": "user", "content": content}],
            max_tokens=500,
        )

        outcome["status"] = "success"
        outcome["response"] = reply.choices[0].message.content
        logger.info(f"✓ {model} (file_path): Success")

    except Exception as exc:
        outcome["status"] = "failed"
        outcome["error"] = str(exc)
        logger.error(f"✗ {model} (file_path): {exc}")

    return outcome
|
|
|
|
|
|
|
|
def test_ocr_model(model: str, image_path: str) -> Dict[str, Any]:
    """
    Exercise an OCR-style model via the image-to-text task (not chat completion).

    Intended for models such as DeepSeek-OCR that expose image-to-text rather
    than a conversational interface.

    Args:
        model: Model name
        image_path: Path to local image file

    Returns:
        dict: Test result (model/format/question/status/response/error).
        Never raises — API errors are recorded under "error".
    """
    record: Dict[str, Any] = {
        "model": model,
        "format": "image_to_text",
        "question": "OCR/Text extraction",
        "status": "unknown",
        "response": None,
        "error": None,
    }

    try:
        # Bind the model at client construction for the task-style API.
        client = InferenceClient(model=model, token=HF_TOKEN)
        payload = Path(image_path).read_bytes()

        caption = client.image_to_text(image=payload)

        record["status"] = "success"
        record["response"] = str(caption)
        logger.info(f"✓ {model} (image_to_text): Success")

    except Exception as exc:
        record["status"] = "failed"
        record["error"] = str(exc)
        logger.error(f"✗ {model} (image_to_text): {exc}")

    return record
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_phase0_validation() -> Dict[str, Any]:
    """
    Run Phase 0 validation: test every configured model.

    Non-OCR models are tried with a base64 data URL first; only if that
    succeeds is the file-path format also attempted. Models whose name
    contains "ocr" are tested via the image-to-text task instead.

    Returns:
        dict: Summary with total_tests, successful, failed, working_models,
        working_formats (both sorted for deterministic output) and the full
        per-test "results" list.

    Raises:
        ValueError: if the HF_TOKEN environment variable is not set.
        FileNotFoundError: if the test image fixture is missing.
    """
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN environment variable not set")

    image_path = get_test_image()
    image_b64 = encode_image_to_base64(image_path)

    logger.info(f"Test image: {image_path}")
    logger.info(f"Image size: {len(image_b64)} chars (base64)")
    # Bug fix: the old message claimed "3 formats each"; at most two formats
    # (base64, then file_path on success) are tried per vision model.
    logger.info(f"Testing {len(VISION_MODELS)} models (base64 first, file_path on success)")
    logger.info("=" * 60)

    all_results = []

    for model in VISION_MODELS:
        logger.info(f"\nTesting model: {model}")
        logger.info("-" * 60)

        model_results = []

        # OCR-style models (name contains "ocr") use image-to-text, not chat.
        if "ocr" in model.lower():
            model_results.append(test_ocr_model(model, image_path))
        else:
            # Only the first question is exercised; the old loop-with-break
            # over TEST_QUESTIONS[:1] did the same thing less clearly.
            question = TEST_QUESTIONS[0]
            result = test_vision_model_with_base64(model, image_b64, question)
            model_results.append(result)

            # Try the file-path format only when base64 already works.
            if result["status"] == "success":
                model_results.append(
                    test_vision_model_with_url(model, image_path, question)
                )

        all_results.extend(model_results)

    summary = {
        "total_tests": len(all_results),
        "successful": sum(1 for r in all_results if r["status"] == "success"),
        "failed": sum(1 for r in all_results if r["status"] == "failed"),
        # sorted() makes the exported JSON deterministic (set order is not).
        "working_models": sorted({r["model"] for r in all_results if r["status"] == "success"}),
        "working_formats": sorted({r["format"] for r in all_results if r["status"] == "success"}),
        "results": all_results,
    }

    return summary
|
|
|
|
|
|
|
|
def print_summary(summary: Dict[str, Any]) -> None:
    """
    Log the Phase 0 test summary, the go/no-go decision gate, and per-test
    details.

    Args:
        summary: Result dict produced by run_phase0_validation().
    """
    logger.info("\n" + "=" * 60)
    logger.info("PHASE 0 VALIDATION SUMMARY")
    logger.info("=" * 60)

    logger.info(f"\nTotal tests: {summary['total_tests']}")
    logger.info(f"✓ Successful: {summary['successful']}")
    logger.info(f"✗ Failed: {summary['failed']}")

    logger.info(f"\nWorking models: {summary['working_models']}")
    logger.info(f"Working formats: {summary['working_formats']}")

    logger.info("\n" + "=" * 60)
    logger.info("DECISION GATE")
    logger.info("=" * 60)

    if summary['successful'] > 0:
        logger.info("\n✅ GO - Proceed to Phase 1 (Implementation)")
        # Bug fix: working_models is derived from a set, so element [0] is not
        # necessarily the smallest model — don't claim "(smallest working)".
        logger.info(f"Recommended model: {summary['working_models'][0]}")
        logger.info(f"Use format: {summary['working_formats'][0]}")
    else:
        logger.info("\n❌ NO-GO - Pivot to backup options")
        logger.info("Backup options:")
        logger.info("  - Option C: HF Spaces deployment (custom endpoint)")
        logger.info("  - Option D: Local transformers library (no API)")
        logger.info("  - Option E: Hybrid (HF text + Gemini/Claude vision only)")

    logger.info("\n" + "=" * 60)
    logger.info("DETAILED RESULTS")
    logger.info("=" * 60)

    for result in summary['results']:
        logger.info(f"\nModel: {result['model']}")
        logger.info(f"Format: {result['format']}")
        logger.info(f"Status: {result['status']}")
        if result['error']:
            logger.info(f"Error: {result['error']}")
        if result['response']:
            # Truncate long model responses for readable logs.
            logger.info(f"Response: {result['response'][:200]}...")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Banner with the run configuration before anything network-bound happens.
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 0: HF INFERENCE API VISION VALIDATION")
    print(banner)
    print(f"HF Token: {'Set' if HF_TOKEN else 'NOT SET'}")
    print(f"Test image: {TEST_IMAGE_PATH}")
    print(banner + "\n")

    try:
        summary = run_phase0_validation()
        print_summary(summary)

        # Persist the raw results as timestamped JSON for later inspection.
        import json
        from datetime import datetime

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        export_dir = Path("user_io/result_ServerApp")
        export_dir.mkdir(parents=True, exist_ok=True)

        export_path = export_dir / f"phase0_vision_validation_{stamp}.json"
        with open(export_path, "w") as fh:
            json.dump(summary, fh, indent=2)

        logger.info(f"\n✓ Results exported to: {export_path}")

    except Exception as e:
        # Log and re-raise so the process exits non-zero on failure.
        logger.error(f"\nPhase 0 validation failed: {e}")
        raise
|
|
|