File size: 4,830 Bytes
3f4fb13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import base64
import os
from openai import OpenAI

# Initialize NVIDIA Client (OpenAI-compatible endpoint).
# NOTE(review): the env var is named GEMINI_API_KEY_1 but its value is sent to
# the NVIDIA endpoint — presumably a repurposed variable; confirm the stored
# key is actually an NVIDIA API key. os.getenv returns None when unset, which
# surfaces later as an authentication failure rather than at startup.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv('GEMINI_API_KEY_1')
)

# Model configurations
PRIMARY_MODEL = "meta/llama-3.2-90b-vision-instruct"  # multimodal: accepts image + text
FALLBACK_MODEL = "meta/llama-3.1-70b-instruct"  # Text-only fallback model
IMAGE_PATH = "image.png"  # resolved relative to the current working directory


def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as a base64 string."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode('utf-8')


def summarize_with_vision_model(base64_image):
    """
    Attempt to summarize image using vision model.

    Streams the completion from PRIMARY_MODEL, echoing tokens to stdout as
    they arrive and accumulating them into the returned summary.

    Args:
        base64_image: Base64 encoded image string (embedded as a PNG data URL)

    Returns:
        str: Summary text or None if failed
    """
    try:
        print(f"🔍 Attempting with primary vision model: {PRIMARY_MODEL}...")

        completion = client.chat.completions.create(
            model=PRIMARY_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Please summarize what you see in this image."},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=500,
            temperature=0.2,
            stream=True
        )

        print("\n✅ Image Summary (Vision Model):\n" + "-" * 50)
        summary = ""
        for chunk in completion:
            # Some stream chunks (e.g. usage or keep-alive frames) carry an
            # empty choices list; guard to avoid an IndexError mid-stream,
            # which would discard the partial summary and trigger the fallback.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                print(content, end="", flush=True)
                summary += content
        print("\n" + "-" * 50)

        return summary

    except Exception as e:
        # Broad catch is deliberate: any failure (network, auth, model error)
        # routes the caller to the text-only fallback instead of crashing.
        print(f"\n⚠️ Vision model failed: {e}")
        return None


def summarize_with_text_fallback():
    """
    Fallback method using text-only LLM.
    Provides a generic response when vision model fails.

    Streams the completion from FALLBACK_MODEL, echoing tokens to stdout as
    they arrive and accumulating them into the returned response.

    Returns:
        str: Fallback response, or None if this model also failed
    """
    try:
        print(f"\n🔄 Falling back to text model: {FALLBACK_MODEL}...")

        # Create a prompt that acknowledges the limitation
        prompt = """I attempted to analyze an image but the vision model is unavailable. 
Please provide a helpful response about what types of information can typically be extracted from images, 
and suggest alternative approaches for image analysis."""

        completion = client.chat.completions.create(
            model=FALLBACK_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            max_tokens=500,
            temperature=0.2,
            stream=True
        )

        print("\n💡 Fallback Response (Text Model):\n" + "-" * 50)
        response = ""
        for chunk in completion:
            # Some stream chunks (e.g. usage or keep-alive frames) carry an
            # empty choices list; guard to avoid an IndexError mid-stream.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                print(content, end="", flush=True)
                response += content
        print("\n" + "-" * 50)

        return response

    except Exception as e:
        # Last line of defense — caller reports overall failure when this
        # returns None.
        print(f"\n❌ Fallback model also failed: {e}")
        return None


def summarize_image():
    """
    Main function to summarize an image with fallback support.

    Validates that IMAGE_PATH exists, base64-encodes it, tries the vision
    model first, and degrades to the text-only fallback if that fails.
    Prints a final success or failure report; returns nothing.
    """
    # Guard clause: nothing to do if the image is missing.
    if not os.path.exists(IMAGE_PATH):
        print(f"❌ Error: {IMAGE_PATH} not found.")
        print(f"📁 Current directory: {os.getcwd()}")
        print(f"📋 Files in current directory: {os.listdir('.')}")
        return

    print(f"📸 Processing {IMAGE_PATH}...")
    print(f"📏 File size: {os.path.getsize(IMAGE_PATH)} bytes\n")

    # Guard clause: bail out if the file cannot be read/encoded.
    try:
        encoded = encode_image(IMAGE_PATH)
    except Exception as err:
        print(f"❌ Error encoding image: {err}")
        return

    # Primary attempt, then text-only fallback on failure.
    summary = summarize_with_vision_model(encoded)
    if summary is None:
        print("\n🔄 Primary model failed, attempting fallback...")
        summary = summarize_with_text_fallback()

    # Final status report.
    if summary is not None:
        print("\n✅ Image processing completed successfully!")
        return
    print("\n❌ All methods failed. Please check:")
    print("   1. API key validity")
    print("   2. Network connection")
    print("   3. NVIDIA API service status")


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    summarize_image()