|
|
|
|
|
""" |
|
|
Phase 0: HuggingFace Inference API Vision Validation |
|
|
Author: @mangubee |
|
|
Date: 2026-01-07 |
|
|
|
|
|
Tests HF Inference API with vision models to validate multimodal support BEFORE |
|
|
implementation. Decision gate: Only proceed to Phase 1 if ≥1 model works. |
|
|
|
|
|
Candidate models (smallest → largest):

1. microsoft/Phi-3.5-vision-instruct (3.8B)

2. meta-llama/Llama-3.2-11B-Vision-Instruct (11B)

3. Qwen/Qwen2-VL-72B-Instruct (72B)

Note: the VISION_MODELS list below currently targets
google/gemma-3-27b-it (Scaleway endpoint) rather than the candidates above.
|
|
""" |
|
|
|
|
|
import os |
|
|
import base64 |
|
|
import logging |
|
|
from pathlib import Path |
|
|
from typing import Dict, Any, Optional |
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
|
|
|
from dotenv import load_dotenv |
|
|
load_dotenv() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HF Inference API token, read from the environment (populated from .env by
# the load_dotenv() call above). Checked for presence in run_phase0_validation.
HF_TOKEN = os.getenv("HF_TOKEN")

# Local image fixture used for every vision test (relative to the CWD).
TEST_IMAGE_PATH = "test/fixtures/test_image_real.png"

# Models to probe. NOTE(review): the module docstring lists Phi-3.5 /
# Llama-3.2 / Qwen2-VL as the candidates, but only the gemma-3 Scaleway
# endpoint is currently configured — confirm which list is authoritative.
VISION_MODELS = [
    "google/gemma-3-27b-it:scaleway",
]

# Questions to ask about the test image. Only the first one is actually used
# per model (see run_phase0_validation: TEST_QUESTIONS[:1] with a break).
TEST_QUESTIONS = [
    "What is in this image?",
    "Describe the image in detail.",
    "What colors do you see?",
]

# Module-wide logging: INFO level with timestamped, per-logger formatting.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def encode_image_to_base64(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes as a base64 string."""
    raw = Path(image_path).read_bytes()
    return base64.b64encode(raw).decode("utf-8")
|
|
|
|
|
|
|
|
def get_test_image() -> str:
    """Return TEST_IMAGE_PATH after confirming the fixture exists on disk.

    Raises:
        FileNotFoundError: if the configured test image is missing.
    """
    if not Path(TEST_IMAGE_PATH).exists():
        raise FileNotFoundError(f"Test image not found: {TEST_IMAGE_PATH}")
    return TEST_IMAGE_PATH
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_vision_model_with_base64(model: str, image_b64: str, question: str) -> Dict[str, Any]:
    """
    Test HF Inference API with a base64-encoded image (chat-completion format).

    Args:
        model: Model name (e.g., "microsoft/Phi-3.5-vision-instruct")
        image_b64: Base64-encoded image string
        question: Question to ask about the image

    Returns:
        dict: Test result with keys model/format/question/status/response/error.
        status is "success" or "failed"; error holds the stringified exception
        on failure. Never raises — all API errors are captured in the result.
    """
    result: Dict[str, Any] = {
        "model": model,
        "format": "base64",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }

    try:
        client = InferenceClient(token=HF_TOKEN)

        # Bug fix: the data URL previously hard-coded image/jpeg even though
        # the project fixture (test_image_real.png) is a PNG. Sniff the base64
        # prefix instead: the PNG signature b"\x89PNG\r\n\x1a\n" always encodes
        # to a base64 string starting with "iVBOR". Fall back to jpeg otherwise.
        mime = "image/png" if image_b64.startswith("iVBOR") else "image/jpeg"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime};base64,{image_b64}"
                        },
                    },
                ],
            }
        ]

        response = client.chat_completion(
            model=model,
            messages=messages,
            max_tokens=500,
        )

        result["status"] = "success"
        result["response"] = response.choices[0].message.content
        logger.info(f"✓ {model} (base64): Success")

    except Exception as e:
        result["status"] = "failed"
        result["error"] = str(e)
        logger.error(f"✗ {model} (base64): {e}")

    return result
|
|
|
|
|
|
|
|
def test_vision_model_with_url(model: str, image_path: str, question: str) -> Dict[str, Any]:
    """
    Probe the HF Inference API with a local image referenced via a file:// URL.

    NOTE(review): a remote inference endpoint generally cannot dereference a
    local file:// URL; whether the client resolves it locally is exactly what
    this probe measures — confirm against huggingface_hub behavior.

    Args:
        model: Model name
        image_path: Path to local image file
        question: Question to ask

    Returns:
        dict: Test result (model/format/question/status/response/error).
        Never raises — API errors are recorded under "error".
    """
    outcome: Dict[str, Any] = {
        "model": model,
        "format": "file_path",
        "question": question,
        "status": "unknown",
        "response": None,
        "error": None,
    }

    try:
        client = InferenceClient(token=HF_TOKEN)

        # Convert the local path into an absolute file:// URL.
        file_url = f"file://{Path(image_path).absolute()}"

        content = [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {"url": file_url}},
        ]
        reply = client.chat_completion(
            model=model,
            messages=[{"role": "user", "content": content}],
            max_tokens=500,
        )

        outcome["status"] = "success"
        outcome["response"] = reply.choices[0].message.content
        logger.info(f"✓ {model} (file_path): Success")

    except Exception as exc:
        outcome["status"] = "failed"
        outcome["error"] = str(exc)
        logger.error(f"✗ {model} (file_path): {exc}")

    return outcome
|
|
|
|
|
|
|
|
def test_ocr_model(model: str, image_path: str) -> Dict[str, Any]:
    """
    Exercise an OCR-style model via the image-to-text task (not chat completion).

    Intended for models such as DeepSeek-OCR that expose image-to-text rather
    than a conversational interface.

    Args:
        model: Model name
        image_path: Path to local image file

    Returns:
        dict: Test result (model/format/question/status/response/error).
        Never raises — API errors are recorded under "error".
    """
    record: Dict[str, Any] = {
        "model": model,
        "format": "image_to_text",
        "question": "OCR/Text extraction",
        "status": "unknown",
        "response": None,
        "error": None,
    }

    try:
        # Bind the model at client construction for the task-style API.
        client = InferenceClient(model=model, token=HF_TOKEN)
        payload = Path(image_path).read_bytes()

        caption = client.image_to_text(image=payload)

        record["status"] = "success"
        record["response"] = str(caption)
        logger.info(f"✓ {model} (image_to_text): Success")

    except Exception as exc:
        record["status"] = "failed"
        record["error"] = str(exc)
        logger.error(f"✗ {model} (image_to_text): {exc}")

    return record
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_phase0_validation() -> Dict[str, Any]:
    """
    Run Phase 0 validation: test every configured model.

    Non-OCR models are tried with a base64 data URL first; only if that
    succeeds is the file-path format also attempted. Models whose name
    contains "ocr" are tested via the image-to-text task instead.

    Returns:
        dict: Summary with total_tests, successful, failed, working_models,
        working_formats (both sorted for deterministic output) and the full
        per-test "results" list.

    Raises:
        ValueError: if the HF_TOKEN environment variable is not set.
        FileNotFoundError: if the test image fixture is missing.
    """
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN environment variable not set")

    image_path = get_test_image()
    image_b64 = encode_image_to_base64(image_path)

    logger.info(f"Test image: {image_path}")
    logger.info(f"Image size: {len(image_b64)} chars (base64)")
    # Bug fix: the old message claimed "3 formats each"; at most two formats
    # (base64, then file_path on success) are tried per vision model.
    logger.info(f"Testing {len(VISION_MODELS)} models (base64 first, file_path on success)")
    logger.info("=" * 60)

    all_results = []

    for model in VISION_MODELS:
        logger.info(f"\nTesting model: {model}")
        logger.info("-" * 60)

        model_results = []

        # OCR-style models (name contains "ocr") use image-to-text, not chat.
        if "ocr" in model.lower():
            model_results.append(test_ocr_model(model, image_path))
        else:
            # Only the first question is exercised; the old loop-with-break
            # over TEST_QUESTIONS[:1] did the same thing less clearly.
            question = TEST_QUESTIONS[0]
            result = test_vision_model_with_base64(model, image_b64, question)
            model_results.append(result)

            # Try the file-path format only when base64 already works.
            if result["status"] == "success":
                model_results.append(
                    test_vision_model_with_url(model, image_path, question)
                )

        all_results.extend(model_results)

    summary = {
        "total_tests": len(all_results),
        "successful": sum(1 for r in all_results if r["status"] == "success"),
        "failed": sum(1 for r in all_results if r["status"] == "failed"),
        # sorted() makes the exported JSON deterministic (set order is not).
        "working_models": sorted({r["model"] for r in all_results if r["status"] == "success"}),
        "working_formats": sorted({r["format"] for r in all_results if r["status"] == "success"}),
        "results": all_results,
    }

    return summary
|
|
|
|
|
|
|
|
def print_summary(summary: Dict[str, Any]) -> None:
    """
    Log the Phase 0 test summary, the go/no-go decision gate, and per-test
    details.

    Args:
        summary: Result dict produced by run_phase0_validation().
    """
    logger.info("\n" + "=" * 60)
    logger.info("PHASE 0 VALIDATION SUMMARY")
    logger.info("=" * 60)

    logger.info(f"\nTotal tests: {summary['total_tests']}")
    logger.info(f"✓ Successful: {summary['successful']}")
    logger.info(f"✗ Failed: {summary['failed']}")

    logger.info(f"\nWorking models: {summary['working_models']}")
    logger.info(f"Working formats: {summary['working_formats']}")

    logger.info("\n" + "=" * 60)
    logger.info("DECISION GATE")
    logger.info("=" * 60)

    if summary['successful'] > 0:
        logger.info("\n✅ GO - Proceed to Phase 1 (Implementation)")
        # Bug fix: working_models is derived from a set, so element [0] is not
        # necessarily the smallest model — don't claim "(smallest working)".
        logger.info(f"Recommended model: {summary['working_models'][0]}")
        logger.info(f"Use format: {summary['working_formats'][0]}")
    else:
        logger.info("\n❌ NO-GO - Pivot to backup options")
        logger.info("Backup options:")
        logger.info("  - Option C: HF Spaces deployment (custom endpoint)")
        logger.info("  - Option D: Local transformers library (no API)")
        logger.info("  - Option E: Hybrid (HF text + Gemini/Claude vision only)")

    logger.info("\n" + "=" * 60)
    logger.info("DETAILED RESULTS")
    logger.info("=" * 60)

    for result in summary['results']:
        logger.info(f"\nModel: {result['model']}")
        logger.info(f"Format: {result['format']}")
        logger.info(f"Status: {result['status']}")
        if result['error']:
            logger.info(f"Error: {result['error']}")
        if result['response']:
            # Truncate long model responses for readable logs.
            logger.info(f"Response: {result['response'][:200]}...")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Banner with the run configuration before anything network-bound happens.
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 0: HF INFERENCE API VISION VALIDATION")
    print(banner)
    print(f"HF Token: {'Set' if HF_TOKEN else 'NOT SET'}")
    print(f"Test image: {TEST_IMAGE_PATH}")
    print(banner + "\n")

    try:
        summary = run_phase0_validation()
        print_summary(summary)

        # Persist the raw results as timestamped JSON for later inspection.
        import json
        from datetime import datetime

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        export_dir = Path("user_io/result_ServerApp")
        export_dir.mkdir(parents=True, exist_ok=True)

        export_path = export_dir / f"phase0_vision_validation_{stamp}.json"
        with open(export_path, "w") as fh:
            json.dump(summary, fh, indent=2)

        logger.info(f"\n✓ Results exported to: {export_path}")

    except Exception as e:
        # Log and re-raise so the process exits non-zero on failure.
        logger.error(f"\nPhase 0 validation failed: {e}")
        raise
|
|
|