Paperbag committed
Commit f1a7daa · 1 Parent(s): 66e034a

fix vision

__pycache__/agent.cpython-312.pyc CHANGED
Binary files a/__pycache__/agent.cpython-312.pyc and b/__pycache__/agent.cpython-312.pyc differ
 
agent.py CHANGED
@@ -1,4 +1,8 @@
 import os
+import base64
+import requests
+import json
+import traceback
 import datetime
 import subprocess
 import tempfile
@@ -16,9 +20,7 @@ from groq import Groq
 from langchain_groq import ChatGroq
 from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain_community.document_loaders import WebBaseLoader
-from langchain_openai import ChatOpenAI
 from langchain_google_genai import ChatGoogleGenerativeAI
-import base64
 
 try:
     import cv2
@@ -36,7 +38,7 @@ def get_whisper():
     whisper_model = whisper.load_model("base")
     return whisper_model
 
-load_dotenv()
+load_dotenv(override=True)
 
 # Base Hugging Face LLM used by the chat wrapper
 # base_llm = HuggingFaceEndpoint(
@@ -55,14 +57,14 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
     Retries next tier if a 429 (rate limit), 402 (credits), or 404 (model not found) error occurs.
     """
 
-    # Adaptive Gemini names to try if 1.5 flash is 404
-    gemini_alternatives = ["gemini-2.5-flash-lite", "gemma-3-1b", "gemini-3-flash", "gemini-3.1-flash-lite"]
+    # Adaptive Gemini names to try if 3.1 flash is 404
+    gemini_alternatives = ["gemini-2.0-flash", "gemini-3.1-flash-lite", "gemini-3.1-pro"]
 
     tiers_config = [
         {"name": "OpenRouter", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "meta-llama/llama-3.3-70b-instruct", "base_url": "https://openrouter.ai/api/v1"},
-        {"name": "Gemini", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.5-flash", "alternatives": gemini_alternatives},
-        {"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "meta-llama/llama-4-scout-17b-16e-instruct"},
-        {"name": "NVIDIA", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.1-405b-instruct", "base_url": "https://integrate.api.nvidia.com/v1"},
+        {"name": "Gemini", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash", "alternatives": gemini_alternatives},
+        {"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.3-70b-versatile"},
+        {"name": "NVIDIA", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.3-70b-instruct", "base_url": "https://integrate.api.nvidia.com/v1"},
         {"name": "Vercel", "key": "VERCEL_API_KEY", "provider": "openai", "model_name": "meta-llama/llama-3.3-70b-instruct", "base_url": "https://gateway.ai.vercel.com/v1"},
     ]
 
@@ -75,38 +77,13 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
 
         def create_model_instance(m_name, provider, b_url=None):
             if provider == "openai":
+                from langchain_openai import ChatOpenAI
                 return ChatOpenAI(model=m_name, openai_api_key=api_key, openai_api_base=b_url, temperature=0)
             elif provider == "google":
+                from langchain_google_genai import ChatGoogleGenerativeAI
                 return ChatGoogleGenerativeAI(model=m_name, temperature=0)
             elif provider == "groq":
-                return ChatGroq(model=m_name, temperature=0, max_retries=2)
-            return None
-
-        primary_model = create_model_instance(tier["model_name"], tier["provider"], tier.get("base_url"))
-        if use_tools:
-            primary_model = primary_model.bind_tools(tools)
-
-        models_to_try = [primary_model]
-        if "alternatives" in tier:
-            for alt_name in tier["alternatives"]:
-                alt_model = create_model_instance(alt_name, tier["provider"], tier.get("base_url"))
-                if use_tools:
-                    alt_model = alt_model.bind_tools(tools)
-                models_to_try.append(alt_model)
-
-        last_exception = None
-        for i in range(start_tier, len(tiers_config)):
-            tier = tiers_config[i]
-            api_key = os.getenv(tier["key"])
-            if not api_key:
-                continue
-
-            def create_model_instance(m_name, provider, b_url=None):
-                if provider == "openai":
-                    return ChatOpenAI(model=m_name, openai_api_key=api_key, openai_api_base=b_url, temperature=0)
-                elif provider == "google":
-                    return ChatGoogleGenerativeAI(model=m_name, temperature=0)
-                elif provider == "groq":
+                from langchain_groq import ChatGroq
                 return ChatGroq(model=m_name, temperature=0, max_retries=2)
             return None
 
@@ -135,7 +112,7 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
                 continue
 
             # Catch other fallback triggers
-            if any(x in err_str for x in ["rate_limit", "429", "500", "503", "overloaded", "not_found", "404", "402", "credits"]):
+            if any(x in err_str for x in ["rate_limit", "429", "500", "503", "overloaded", "not_found", "404", "402", "credits", "decommissioned", "invalid_request_error"]):
                 print(f"--- {tier['name']} Error: {e}. Trying next model/tier... ---")
                 last_exception = e
                 # If this tier has more alternatives, continue to the next one
@@ -198,7 +175,32 @@ def wiki_search(query: str) -> str:
         ])
     return formatted_search_docs
 
-
+def get_vision_models():
+    """Returns a list of vision models to try, in order of preference."""
+    configs = [
+        {"name": "OpenRouter-Gemini-2.0", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "google/gemini-2.0-flash-001", "base_url": "https://openrouter.ai/api/v1"},
+        {"name": "Google-Gemini-2.0-Exp", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash-exp"},
+        {"name": "Google-Gemini-1.5-Latest", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-1.5-flash-latest"},
+        {"name": "NVIDIA-Vision-Llama-11b", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.2-11b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1"},
+        {"name": "NVIDIA-Vision-Llama-90b", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.2-90b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1"},
+        {"name": "Groq-Vision", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.2-90b-vision-preview"},
+    ]
+    models = []
+    for cfg in configs:
+        api_key = os.getenv(cfg["key"])
+        if not api_key:
+            continue
+        if cfg["provider"] == "openai":
+            from langchain_openai import ChatOpenAI
+            m = ChatOpenAI(model=cfg["model_name"], openai_api_key=api_key, openai_api_base=cfg.get("base_url"), temperature=0)
+        elif cfg["provider"] == "google":
+            from langchain_google_genai import ChatGoogleGenerativeAI
+            m = ChatGoogleGenerativeAI(model=cfg["model_name"], temperature=0)
+        elif cfg["provider"] == "groq":
+            from langchain_groq import ChatGroq
+            m = ChatGroq(model=cfg["model_name"], temperature=0)
+        models.append({"name": cfg["name"], "model": m})
+    return models
 
 @tool
 def analyze_image(image_path: str, question: str) -> str:
@@ -212,18 +214,13 @@ def analyze_image(image_path: str, question: str) -> str:
         question: Specific question describing what you want the vision model to look for.
     """
     try:
+        if not os.path.exists(image_path):
+            return f"Error: Image file not found at {image_path}"
+
         # If it's a local file, we encode it to base64
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
 
-        # Use OpenRouter for Vision as a more robust fallback
-        vision_model = ChatOpenAI(
-            model="google/gemini-2.0-flash-001",
-            openai_api_key=os.getenv("OPENROUTER_API_KEY"),
-            openai_api_base="https://openrouter.ai/api/v1",
-            temperature=0,
-        )
-
         message = HumanMessage(
             content=[
                 {"type": "text", "text": question},
@@ -233,10 +230,26 @@ def analyze_image(image_path: str, question: str) -> str:
                 },
             ]
         )
-        response = vision_model.invoke([message])
-        return response.content
+
+        vision_models = get_vision_models()
+        if not vision_models:
+            return "Error: No vision models configured (missing API keys)."
+
+        last_err = None
+        for item in vision_models:
+            try:
+                m_name = getattr(item['model'], 'model', 'unknown')
+                print(f"--- Calling Vision Model: {item['name']} ({m_name}) ---")
+                response = item['model'].invoke([message])
+                return extract_text_from_content(response.content)
+            except Exception as e:
+                print(f"Vision Model {item['name']} failed.")
+                traceback.print_exc()
+                last_err = e
+        return f"Error analyzing image: All vision models failed. Last error: {str(last_err)}"
     except Exception as e:
-        return f"Error analyzing image: {str(e)}"
+        traceback.print_exc()
+        return f"Error reading/processing image: {str(e)}"
 
 @tool
 def analyze_audio(audio_path: str, question: str) -> str:
@@ -279,7 +292,8 @@ def analyze_video(video_path: str, question: str) -> str:
         frame_indices = [int(i * total_frames / 5) for i in range(5)]
         extracted_descriptions = []
 
-        vision_model = ChatGroq(model="llama-3.2-90b-vision-preview", temperature=0)
+        vision_models = get_vision_models()
+        # Ensure Groq-Llama is at the front for video if preferred, but we'll use the default order for now.
 
         for idx_num, frame_idx in enumerate(frame_indices):
             cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
@@ -289,14 +303,24 @@ def analyze_video(video_path: str, question: str) -> str:
             _, buffer = cv2.imencode('.jpg', frame)
             encoded_image = base64.b64encode(buffer).decode('utf-8')
 
-            # Ask the vision model to describe the frame
+            # Ask a vision model to describe the frame (with fallback)
             msg = HumanMessage(
                 content=[
                     {"type": "text", "text": f"Describe what is happening in this video frame concisely. Focus on aspects related to: {question}"},
                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
                 ]
             )
-            desc = vision_model.invoke([msg]).content
+
+            desc = "No description available."
+            for item in vision_models:
+                try:
+                    print(f"--- Calling Vision Model for Frame {idx_num+1}: {item['name']} ---")
+                    desc = item['model'].invoke([msg]).content
+                    break
+                except Exception as e:
+                    print(f"Vision Model {item['name']} failed for frame: {e}")
+                    continue
+
             extracted_descriptions.append(f"Frame {idx_num + 1}: {desc}")
 
         cap.release()
@@ -424,7 +448,21 @@ def restart_required(state: AgentState) -> AgentState:
 # Augment the LLM with tools
 tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
 tools_by_name = {tool.name: tool for tool in tools}
-# model_with_tools etc. removed, replaced by lazy initialization in smart_invoke
+def extract_text_from_content(content: Any) -> str:
+    """Extracts a simple string from various possible AIMessage content formats."""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        text_parts = []
+        for part in content:
+            if isinstance(part, str):
+                text_parts.append(part)
+            elif isinstance(part, dict) and "text" in part:
+                text_parts.append(part["text"])
+            elif isinstance(part, dict) and "type" in part and part["type"] == "text":
+                text_parts.append(part.get("text", ""))
+        return "".join(text_parts)
+    return str(content)
 
 def answer_message(state: AgentState) -> AgentState:
     messages = state["messages"]
@@ -503,7 +541,7 @@ def answer_message(state: AgentState) -> AgentState:
         print("Max reasoning steps reached. Forcing answer extraction.")
         forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
         messages.append(forced_msg)
-        draft_response = smart_invoke(messages, use_tools=False)
+        draft_response, _ = smart_invoke(messages, use_tools=False)
 
     # Third pass: strict GAIA formatting extraction
     formatting_sys = SystemMessage(
@@ -516,11 +554,15 @@ def answer_message(state: AgentState) -> AgentState:
             "If it is a name or word, just return the exact string. If a list, return only the comma-separated list."
         )
     )
-    final_response, _ = smart_invoke([formatting_sys, HumanMessage(content=draft_response.content)], use_tools=False, start_tier=current_tier)
+    final_response, _ = smart_invoke([formatting_sys, HumanMessage(content=extract_text_from_content(draft_response.content))], use_tools=False, start_tier=current_tier)
    print(f"Draft response: {draft_response.content}")
    print(f"Strict Final response: {final_response.content}")
 
    # Return messages including the final AIMessage so BasicAgent reads .content
+    # Ensure final_response has string content for basic agents
+    if not isinstance(final_response.content, str):
+        final_response.content = extract_text_from_content(final_response.content)
+
    messages.append(draft_response)
    messages.append(final_response)
    return {"messages": messages}
check_env.py ADDED
@@ -0,0 +1,21 @@
+import os
+from dotenv import load_dotenv
+
+# Try to load .env from current directory
+env_path = os.path.join(os.getcwd(), '.env')
+print(f"Checking for .env at: {env_path}")
+print(f"File exists: {os.path.exists(env_path)}")
+
+load_dotenv(env_path)
+
+# Print keys (masking values)
+keys = list(os.environ.keys())
+relevant_keys = [k for k in keys if any(x in k for x in ["API_KEY", "TOKEN", "GOOGLE", "GROQ", "NVIDIA", "VERCEL", "OPENROUTER"])]
+print(f"Relevant keys found: {relevant_keys}")
+
+# Specifically check the ones we need
+needed = ["NVIDIA_API_KEY", "VERCEL_API_KEY", "OPENROUTER_API_KEY", "GOOGLE_API_KEY", "GROQ_API_KEY"]
+for k in needed:
+    val = os.getenv(k)
+    status = "PRESENT (length={})".format(len(val)) if val else "MISSING"
+    print(f"{k}: {status}")
check_env_v2.py ADDED
@@ -0,0 +1,25 @@
+import os
+from dotenv import load_dotenv
+
+# Try to load .env from current directory with override=True
+env_path = os.path.join(os.getcwd(), '.env')
+print(f"Checking for .env at: {env_path}")
+print(f"File exists: {os.path.exists(env_path)}")
+
+load_dotenv(env_path, override=True)
+
+# Print keys (case-insensitive check)
+keys = list(os.environ.keys())
+relevant_keys = [k for k in keys if any(x in k.upper() for x in ["API_KEY", "TOKEN", "GOOGLE", "GROQ", "NVIDIA", "VERCEL", "OPENROUTER"])]
+print(f"Relevant keys found: {relevant_keys}")
+
+# Check specifically
+needed = ["NVIDIA_API_KEY", "VERCEL_API_KEY", "OPENROUTER_API_KEY", "GOOGLE_API_KEY", "GROQ_API_KEY"]
+for k in needed:
+    # Try case-insensitive lookup
+    found_key = next((key for key in keys if key.upper() == k), None)
+    if found_key:
+        val = os.getenv(found_key)
+        print(f"{found_key}: PRESENT (length={len(val)})")
+    else:
+        print(f"{k}: MISSING")
test_vision.py ADDED
@@ -0,0 +1,35 @@
+import os
+from langchain_openai import ChatOpenAI
+from langchain_core.messages import HumanMessage
+import base64
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def test_vision():
+    # Use a tiny 1x1 base64 image for testing
+    tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
+    msg = HumanMessage(content=[{"type": "text", "text": "what is in this image?"}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}])
+
+    models = [
+        {"name": "OpenRouter-Gemini-2.0", "provider": "openai", "model": "google/gemini-2.0-flash-001", "base_url": "https://openrouter.ai/api/v1", "key": "OPENROUTER_API_KEY"},
+        {"name": "NVIDIA-Llama-3.2", "provider": "openai", "model": "nvidia/llama-3.2-nv-vision-70b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
+        {"name": "NVIDIA-Qwen-VL", "provider": "openai", "model": "nvidia/qwen-vl-max", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
+        {"name": "Vercel-Vision", "provider": "openai", "model": "gpt-4o-mini", "base_url": "https://gateway.ai.vercel.com/v1", "key": "VERCEL_API_KEY"},
+    ]
+
+    for m in models:
+        key = os.getenv(m['key'])
+        if not key:
+            print(f"Skip {m['name']} (no key)")
+            continue
+        try:
+            print(f"Testing {m['name']} ({m['model']})...")
+            llm = ChatOpenAI(model=m['model'], openai_api_key=key, openai_api_base=m['base_url'], temperature=0)
+            res = llm.invoke([msg])
+            print(f"Success: {res.content}")
+        except Exception as e:
+            print(f"Fail: {e}")
+
+if __name__ == "__main__":
+    test_vision()
test_vision_v2.py ADDED
@@ -0,0 +1,34 @@
+import os
+from langchain_openai import ChatOpenAI
+from langchain_core.messages import HumanMessage
+import base64
+from dotenv import load_dotenv
+
+load_dotenv(override=True)
+
+def test_vision():
+    # Use a tiny 1x1 base64 image for testing
+    tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
+    msg = HumanMessage(content=[{"type": "text", "text": "is this image red, green, or blue? answer with one word."}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}])
+
+    models = [
+        {"name": "NVIDIA-Llama-3.2-11b", "provider": "openai", "model": "meta/llama-3.2-11b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
+        {"name": "NVIDIA-Llama-3.2-90b", "provider": "openai", "model": "meta/llama-3.2-90b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
+        {"name": "NVIDIA-Mistral-Vision", "provider": "openai", "model": "mistralai/pixtral-12b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
+    ]
+
+    for m in models:
+        key = os.getenv(m['key'])
+        if not key:
+            print(f"Skip {m['name']} (no key)")
+            continue
+        try:
+            print(f"Testing {m['name']} ({m['model']})...")
+            llm = ChatOpenAI(model=m['model'], openai_api_key=key, openai_api_base=m['base_url'], temperature=0)
+            res = llm.invoke([msg])
+            print(f"Success: {res.content}")
+        except Exception as e:
+            print(f"Fail: {e}")
+
+if __name__ == "__main__":
+    test_vision()