File size: 3,032 Bytes
d33203e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
"""
Model processing utilities for local and remote AI models
"""
import requests
import base64
from io import BytesIO
from PIL import Image
from typing import Dict, Any, Optional


def image_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as the base64 string of its PNG representation."""
    png_bytes = BytesIO()
    image.save(png_bytes, format="PNG")
    return base64.b64encode(png_bytes.getvalue()).decode()


def process_image_locally(image: Image.Image, prompt: str, model_name: str, local_manager) -> Dict[str, Any]:
    """
    Run inference on an image with one of the locally hosted models.

    Args:
        image: PIL image to analyze.
        prompt: Text prompt forwarded to caption-style models.
        model_name: Display name of the local model; "Person on Track Detector"
            is routed to its dedicated detector, everything else to captioning.
        local_manager: Manager object exposing the local models
            (presumably a project-level model registry — exact type not visible here).

    Returns:
        A dict keyed by result type, or {"error": ...} if inference raised.
    """
    try:
        if model_name != "Person on Track Detector":
            # Default path: treat the model as a caption/VQA generator.
            return {"generated_text": local_manager.generate_caption(model_name, image, prompt)}
        # The person-on-track detector has its own dedicated entry point.
        detection = local_manager.person_on_track_detector.detect_person_on_track(image)
        return {"person_on_track_detection": detection}
    except Exception as e:
        # Best-effort contract: callers receive an error dict, never an exception.
        return {"error": f"Local processing failed: {str(e)}"}


def query_huggingface_api(image: Image.Image, prompt: str, model_name: str, api_token: str) -> Dict[str, Any]:
    """
    Query the Hugging Face Inference API with an image and prompt.

    Args:
        image: PIL image to analyze.
        prompt: Text prompt for vision-language models (BLIP-style captioning
            endpoints ignore it and receive only the image bytes).
        model_name: Hugging Face model repository id.
        api_token: Token sent as a Bearer credential.

    Returns:
        The decoded JSON response on success, or {"error": ...} describing
        the HTTP or network failure.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {api_token}"}

    try:
        if "blip" in model_name.lower():
            # BLIP captioning endpoints take the raw image bytes as the
            # request body — not a multipart file upload.
            buffer = BytesIO()
            image.save(buffer, format="PNG")
            response = requests.post(
                api_url,
                headers=headers,
                data=buffer.getvalue(),
                timeout=60,
            )
        else:
            # Other vision-language models take a JSON payload with the
            # base64-encoded image alongside the text prompt. Encode only on
            # this branch — the BLIP path never uses the base64 form.
            payload = {
                "inputs": {
                    "image": image_to_base64(image),
                    "text": prompt,
                }
            }
            response = requests.post(api_url, headers=headers, json=payload, timeout=60)
    except requests.RequestException as e:
        # Mirror the error-dict convention of process_image_locally so callers
        # get a uniform failure shape instead of a raw exception.
        return {"error": f"API request failed: {str(e)}"}

    if response.status_code == 200:
        return response.json()
    return {"error": f"API request failed: {response.status_code} - {response.text}"}


def process_frame(frame_data: Dict, config: Dict[str, Any], local_manager=None) -> Dict[str, Any]:
    """
    Route a single frame to the configured inference backend.

    Args:
        frame_data: Dict holding the frame under the 'frame' key
            (presumably a PIL image — confirm against the caller).
        config: Must contain "model_type" and "selected_model"; may contain
            "prompt" and "api_token".
        local_manager: Optional local-model manager; when absent, even
            "Local Models" requests fall through to the Hugging Face API.

    Returns:
        The result dict produced by the chosen backend.
    """
    frame = frame_data['frame']
    model_type = config["model_type"]
    model = config["selected_model"]
    prompt = config.get("prompt", "")

    if model_type == "Local Models" and local_manager:
        return process_image_locally(frame, prompt, model, local_manager)
    return query_huggingface_api(frame, prompt, model, config.get("api_token"))