Spaces:
Sleeping
Sleeping
fix vision
Browse files
- __pycache__/agent.cpython-312.pyc +0 -0
- agent.py +97 -55
- check_env.py +21 -0
- check_env_v2.py +25 -0
- test_vision.py +35 -0
- test_vision_v2.py +34 -0
__pycache__/agent.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/agent.cpython-312.pyc and b/__pycache__/agent.cpython-312.pyc differ
|
|
|
agent.py
CHANGED
|
@@ -1,4 +1,8 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import datetime
|
| 3 |
import subprocess
|
| 4 |
import tempfile
|
|
@@ -16,9 +20,7 @@ from groq import Groq
|
|
| 16 |
from langchain_groq import ChatGroq
|
| 17 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
| 18 |
from langchain_community.document_loaders import WebBaseLoader
|
| 19 |
-
from langchain_openai import ChatOpenAI
|
| 20 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 21 |
-
import base64
|
| 22 |
|
| 23 |
try:
|
| 24 |
import cv2
|
|
@@ -36,7 +38,7 @@ def get_whisper():
|
|
| 36 |
whisper_model = whisper.load_model("base")
|
| 37 |
return whisper_model
|
| 38 |
|
| 39 |
-
load_dotenv()
|
| 40 |
|
| 41 |
# Base Hugging Face LLM used by the chat wrapper
|
| 42 |
# base_llm = HuggingFaceEndpoint(
|
|
@@ -55,14 +57,14 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
|
|
| 55 |
Retries next tier if a 429 (rate limit), 402 (credits), or 404 (model found) error occurs.
|
| 56 |
"""
|
| 57 |
|
| 58 |
-
# Adaptive Gemini names to try if
|
| 59 |
-
gemini_alternatives = ["gemini-2.
|
| 60 |
|
| 61 |
tiers_config = [
|
| 62 |
{"name": "OpenRouter", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "meta-llama/llama-3.3-70b-instruct", "base_url": "https://openrouter.ai/api/v1"},
|
| 63 |
-
{"name": "Gemini", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.
|
| 64 |
-
{"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "
|
| 65 |
-
{"name": "NVIDIA", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.
|
| 66 |
{"name": "Vercel", "key": "VERCEL_API_KEY", "provider": "openai", "model_name": "meta-llama/llama-3.3-70b-instruct", "base_url": "https://gateway.ai.vercel.com/v1"},
|
| 67 |
]
|
| 68 |
|
|
@@ -75,38 +77,13 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
|
|
| 75 |
|
| 76 |
def create_model_instance(m_name, provider, b_url=None):
|
| 77 |
if provider == "openai":
|
|
|
|
| 78 |
return ChatOpenAI(model=m_name, openai_api_key=api_key, openai_api_base=b_url, temperature=0)
|
| 79 |
elif provider == "google":
|
|
|
|
| 80 |
return ChatGoogleGenerativeAI(model=m_name, temperature=0)
|
| 81 |
elif provider == "groq":
|
| 82 |
-
|
| 83 |
-
return None
|
| 84 |
-
|
| 85 |
-
primary_model = create_model_instance(tier["model_name"], tier["provider"], tier.get("base_url"))
|
| 86 |
-
if use_tools:
|
| 87 |
-
primary_model = primary_model.bind_tools(tools)
|
| 88 |
-
|
| 89 |
-
models_to_try = [primary_model]
|
| 90 |
-
if "alternatives" in tier:
|
| 91 |
-
for alt_name in tier["alternatives"]:
|
| 92 |
-
alt_model = create_model_instance(alt_name, tier["provider"], tier.get("base_url"))
|
| 93 |
-
if use_tools:
|
| 94 |
-
alt_model = alt_model.bind_tools(tools)
|
| 95 |
-
models_to_try.append(alt_model)
|
| 96 |
-
|
| 97 |
-
last_exception = None
|
| 98 |
-
for i in range(start_tier, len(tiers_config)):
|
| 99 |
-
tier = tiers_config[i]
|
| 100 |
-
api_key = os.getenv(tier["key"])
|
| 101 |
-
if not api_key:
|
| 102 |
-
continue
|
| 103 |
-
|
| 104 |
-
def create_model_instance(m_name, provider, b_url=None):
|
| 105 |
-
if provider == "openai":
|
| 106 |
-
return ChatOpenAI(model=m_name, openai_api_key=api_key, openai_api_base=b_url, temperature=0)
|
| 107 |
-
elif provider == "google":
|
| 108 |
-
return ChatGoogleGenerativeAI(model=m_name, temperature=0)
|
| 109 |
-
elif provider == "groq":
|
| 110 |
return ChatGroq(model=m_name, temperature=0, max_retries=2)
|
| 111 |
return None
|
| 112 |
|
|
@@ -135,7 +112,7 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
|
|
| 135 |
continue
|
| 136 |
|
| 137 |
# Catch other fallback triggers
|
| 138 |
-
if any(x in err_str for x in ["rate_limit", "429", "500", "503", "overloaded", "not_found", "404", "402", "credits"]):
|
| 139 |
print(f"--- {tier['name']} Error: {e}. Trying next model/tier... ---")
|
| 140 |
last_exception = e
|
| 141 |
# If this tier has more alternatives, continue to the next one
|
|
@@ -198,7 +175,32 @@ def wiki_search(query: str) -> str:
|
|
| 198 |
])
|
| 199 |
return formatted_search_docs
|
| 200 |
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
@tool
|
| 204 |
def analyze_image(image_path: str, question: str) -> str:
|
|
@@ -212,18 +214,13 @@ def analyze_image(image_path: str, question: str) -> str:
|
|
| 212 |
question: Specific question describing what you want the vision model to look for.
|
| 213 |
"""
|
| 214 |
try:
|
|
|
|
|
|
|
|
|
|
| 215 |
# If it's a local file, we encode it to base64
|
| 216 |
with open(image_path, "rb") as image_file:
|
| 217 |
encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
|
| 218 |
|
| 219 |
-
# Use OpenRouter for Vision as a more robust fallback
|
| 220 |
-
vision_model = ChatOpenAI(
|
| 221 |
-
model="google/gemini-2.0-flash-001",
|
| 222 |
-
openai_api_key=os.getenv("OPENROUTER_API_KEY"),
|
| 223 |
-
openai_api_base="https://openrouter.ai/api/v1",
|
| 224 |
-
temperature=0,
|
| 225 |
-
)
|
| 226 |
-
|
| 227 |
message = HumanMessage(
|
| 228 |
content=[
|
| 229 |
{"type": "text", "text": question},
|
|
@@ -233,10 +230,26 @@ def analyze_image(image_path: str, question: str) -> str:
|
|
| 233 |
},
|
| 234 |
]
|
| 235 |
)
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
except Exception as e:
|
| 239 |
-
|
|
|
|
| 240 |
|
| 241 |
@tool
|
| 242 |
def analyze_audio(audio_path: str, question: str) -> str:
|
|
@@ -279,7 +292,8 @@ def analyze_video(video_path: str, question: str) -> str:
|
|
| 279 |
frame_indices = [int(i * total_frames / 5) for i in range(5)]
|
| 280 |
extracted_descriptions = []
|
| 281 |
|
| 282 |
-
|
|
|
|
| 283 |
|
| 284 |
for idx_num, frame_idx in enumerate(frame_indices):
|
| 285 |
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
|
|
@@ -289,14 +303,24 @@ def analyze_video(video_path: str, question: str) -> str:
|
|
| 289 |
_, buffer = cv2.imencode('.jpg', frame)
|
| 290 |
encoded_image = base64.b64encode(buffer).decode('utf-8')
|
| 291 |
|
| 292 |
-
# Ask
|
| 293 |
msg = HumanMessage(
|
| 294 |
content=[
|
| 295 |
{"type": "text", "text": f"Describe what is happening in this video frame concisely. Focus on aspects related to: {question}"},
|
| 296 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
|
| 297 |
]
|
| 298 |
)
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
extracted_descriptions.append(f"Frame {idx_num + 1}: {desc}")
|
| 301 |
|
| 302 |
cap.release()
|
|
@@ -424,7 +448,21 @@ def restart_required(state: AgentState) -> AgentState:
|
|
| 424 |
# Augment the LLM with tools
|
| 425 |
tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
|
| 426 |
tools_by_name = {tool.name: tool for tool in tools}
|
| 427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
def answer_message(state: AgentState) -> AgentState:
|
| 430 |
messages = state["messages"]
|
|
@@ -503,7 +541,7 @@ def answer_message(state: AgentState) -> AgentState:
|
|
| 503 |
print("Max reasoning steps reached. Forcing answer extraction.")
|
| 504 |
forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
|
| 505 |
messages.append(forced_msg)
|
| 506 |
-
draft_response = smart_invoke(messages, use_tools=False)
|
| 507 |
|
| 508 |
# Third pass: strict GAIA formatting extraction
|
| 509 |
formatting_sys = SystemMessage(
|
|
@@ -516,11 +554,15 @@ def answer_message(state: AgentState) -> AgentState:
|
|
| 516 |
"If it is a name or word, just return the exact string. If a list, return only the comma-separated list."
|
| 517 |
)
|
| 518 |
)
|
| 519 |
-
final_response, _ = smart_invoke([formatting_sys, HumanMessage(content=draft_response.content)], use_tools=False, start_tier=current_tier)
|
| 520 |
print(f"Draft response: {draft_response.content}")
|
| 521 |
print(f"Strict Final response: {final_response.content}")
|
| 522 |
|
| 523 |
# Return messages including the final AIMessage so BasicAgent reads .content
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
messages.append(draft_response)
|
| 525 |
messages.append(final_response)
|
| 526 |
return {"messages": messages}
|
|
|
|
| 1 |
import os
|
| 2 |
+
import base64
|
| 3 |
+
import requests
|
| 4 |
+
import json
|
| 5 |
+
import traceback
|
| 6 |
import datetime
|
| 7 |
import subprocess
|
| 8 |
import tempfile
|
|
|
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
| 22 |
from langchain_community.document_loaders import WebBaseLoader
|
|
|
|
| 23 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
|
|
| 24 |
|
| 25 |
try:
|
| 26 |
import cv2
|
|
|
|
| 38 |
whisper_model = whisper.load_model("base")
|
| 39 |
return whisper_model
|
| 40 |
|
| 41 |
+
load_dotenv(override=True)
|
| 42 |
|
| 43 |
# Base Hugging Face LLM used by the chat wrapper
|
| 44 |
# base_llm = HuggingFaceEndpoint(
|
|
|
|
| 57 |
Retries next tier if a 429 (rate limit), 402 (credits), or 404 (model found) error occurs.
|
| 58 |
"""
|
| 59 |
|
| 60 |
+
# Adaptive Gemini names to try if 3.1 flash is 404
|
| 61 |
+
gemini_alternatives = ["gemini-2.0-flash", "gemini-3.1-flash-lite", "gemini-3.1-pro"]
|
| 62 |
|
| 63 |
tiers_config = [
|
| 64 |
{"name": "OpenRouter", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "meta-llama/llama-3.3-70b-instruct", "base_url": "https://openrouter.ai/api/v1"},
|
| 65 |
+
{"name": "Gemini", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash", "alternatives": gemini_alternatives},
|
| 66 |
+
{"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.3-70b-versatile"},
|
| 67 |
+
{"name": "NVIDIA", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.3-70b-instruct", "base_url": "https://integrate.api.nvidia.com/v1"},
|
| 68 |
{"name": "Vercel", "key": "VERCEL_API_KEY", "provider": "openai", "model_name": "meta-llama/llama-3.3-70b-instruct", "base_url": "https://gateway.ai.vercel.com/v1"},
|
| 69 |
]
|
| 70 |
|
|
|
|
| 77 |
|
| 78 |
def create_model_instance(m_name, provider, b_url=None):
|
| 79 |
if provider == "openai":
|
| 80 |
+
from langchain_openai import ChatOpenAI
|
| 81 |
return ChatOpenAI(model=m_name, openai_api_key=api_key, openai_api_base=b_url, temperature=0)
|
| 82 |
elif provider == "google":
|
| 83 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 84 |
return ChatGoogleGenerativeAI(model=m_name, temperature=0)
|
| 85 |
elif provider == "groq":
|
| 86 |
+
from langchain_groq import ChatGroq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
return ChatGroq(model=m_name, temperature=0, max_retries=2)
|
| 88 |
return None
|
| 89 |
|
|
|
|
| 112 |
continue
|
| 113 |
|
| 114 |
# Catch other fallback triggers
|
| 115 |
+
if any(x in err_str for x in ["rate_limit", "429", "500", "503", "overloaded", "not_found", "404", "402", "credits", "decommissioned", "invalid_request_error"]):
|
| 116 |
print(f"--- {tier['name']} Error: {e}. Trying next model/tier... ---")
|
| 117 |
last_exception = e
|
| 118 |
# If this tier has more alternatives, continue to the next one
|
|
|
|
| 175 |
])
|
| 176 |
return formatted_search_docs
|
| 177 |
|
| 178 |
+
def get_vision_models(configs=None):
    """Return the vision models to try, in order of preference.

    Each config is a dict with the keys ``name``, ``key`` (the environment
    variable holding the API key), ``provider`` (``"openai"``, ``"google"``
    or ``"groq"``), ``model_name`` and, for OpenAI-compatible endpoints,
    ``base_url``.

    Args:
        configs: Optional list of config dicts. When ``None`` the built-in
            preference list below is used.

    Returns:
        A list of ``{"name": str, "model": chat_model}`` dicts, one per
        config whose API key is set and whose client could be constructed.
        The list is empty when no config is usable.
    """
    if configs is None:
        configs = [
            {"name": "OpenRouter-Gemini-2.0", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "google/gemini-2.0-flash-001", "base_url": "https://openrouter.ai/api/v1"},
            {"name": "Google-Gemini-2.0-Exp", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash-exp"},
            {"name": "Google-Gemini-1.5-Latest", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-1.5-flash-latest"},
            {"name": "NVIDIA-Vision-Llama-11b", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.2-11b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1"},
            {"name": "NVIDIA-Vision-Llama-90b", "key": "NVIDIA_API_KEY", "provider": "openai", "model_name": "meta/llama-3.2-90b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1"},
            {"name": "Groq-Vision", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.2-90b-vision-preview"},
        ]

    models = []
    for cfg in configs:
        api_key = os.getenv(cfg["key"])
        if not api_key:
            # No credentials for this entry; move on to the next candidate.
            continue

        provider = cfg["provider"]
        try:
            # Provider packages are imported lazily so that a missing optional
            # dependency only disables the entries that need it.
            if provider == "openai":
                from langchain_openai import ChatOpenAI
                model = ChatOpenAI(model=cfg["model_name"], openai_api_key=api_key, openai_api_base=cfg.get("base_url"), temperature=0)
            elif provider == "google":
                from langchain_google_genai import ChatGoogleGenerativeAI
                # Pass the key explicitly so the env var named in cfg["key"] is honoured.
                model = ChatGoogleGenerativeAI(model=cfg["model_name"], google_api_key=api_key, temperature=0)
            elif provider == "groq":
                from langchain_groq import ChatGroq
                model = ChatGroq(model=cfg["model_name"], groq_api_key=api_key, temperature=0)
            else:
                # Previously an unknown provider left the model variable unbound
                # (or pointing at the previous iteration's model).
                print(f"--- Skipping vision model {cfg['name']}: unknown provider {provider!r} ---")
                continue
        except Exception as e:
            # One unusable entry must not prevent the remaining fallbacks from loading.
            print(f"--- Skipping vision model {cfg['name']}: could not initialise ({e}) ---")
            continue

        models.append({"name": cfg["name"], "model": model})
    return models
|
| 204 |
|
| 205 |
@tool
|
| 206 |
def analyze_image(image_path: str, question: str) -> str:
|
|
|
|
| 214 |
question: Specific question describing what you want the vision model to look for.
|
| 215 |
"""
|
| 216 |
try:
|
| 217 |
+
if not os.path.exists(image_path):
|
| 218 |
+
return f"Error: Image file not found at {image_path}"
|
| 219 |
+
|
| 220 |
# If it's a local file, we encode it to base64
|
| 221 |
with open(image_path, "rb") as image_file:
|
| 222 |
encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
message = HumanMessage(
|
| 225 |
content=[
|
| 226 |
{"type": "text", "text": question},
|
|
|
|
| 230 |
},
|
| 231 |
]
|
| 232 |
)
|
| 233 |
+
|
| 234 |
+
vision_models = get_vision_models()
|
| 235 |
+
if not vision_models:
|
| 236 |
+
return "Error: No vision models configured (missing API keys)."
|
| 237 |
+
|
| 238 |
+
last_err = None
|
| 239 |
+
for item in vision_models:
|
| 240 |
+
try:
|
| 241 |
+
m_name = getattr(item['model'], 'model', 'unknown')
|
| 242 |
+
print(f"--- Calling Vision Model: {item['name']} ({m_name}) ---")
|
| 243 |
+
response = item['model'].invoke([message])
|
| 244 |
+
return extract_text_from_content(response.content)
|
| 245 |
+
except Exception as e:
|
| 246 |
+
print(f"Vision Model {item['name']} failed.")
|
| 247 |
+
traceback.print_exc()
|
| 248 |
+
last_err = e
|
| 249 |
+
return f"Error analyzing image: All vision models failed. Last error: {str(last_err)}"
|
| 250 |
except Exception as e:
|
| 251 |
+
traceback.print_exc()
|
| 252 |
+
return f"Error reading/processing image: {str(e)}"
|
| 253 |
|
| 254 |
@tool
|
| 255 |
def analyze_audio(audio_path: str, question: str) -> str:
|
|
|
|
| 292 |
frame_indices = [int(i * total_frames / 5) for i in range(5)]
|
| 293 |
extracted_descriptions = []
|
| 294 |
|
| 295 |
+
vision_models = get_vision_models()
|
| 296 |
+
# Ensure Groq-Llama is at the front for video if preferred, but we'll use the default order for now.
|
| 297 |
|
| 298 |
for idx_num, frame_idx in enumerate(frame_indices):
|
| 299 |
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
|
|
|
|
| 303 |
_, buffer = cv2.imencode('.jpg', frame)
|
| 304 |
encoded_image = base64.b64encode(buffer).decode('utf-8')
|
| 305 |
|
| 306 |
+
# Ask a vision model to describe the frame (with fallback)
|
| 307 |
msg = HumanMessage(
|
| 308 |
content=[
|
| 309 |
{"type": "text", "text": f"Describe what is happening in this video frame concisely. Focus on aspects related to: {question}"},
|
| 310 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
|
| 311 |
]
|
| 312 |
)
|
| 313 |
+
|
| 314 |
+
desc = "No description available."
|
| 315 |
+
for item in vision_models:
|
| 316 |
+
try:
|
| 317 |
+
print(f"--- Calling Vision Model for Frame {idx_num+1}: {item['name']} ---")
|
| 318 |
+
desc = item['model'].invoke([msg]).content
|
| 319 |
+
break
|
| 320 |
+
except Exception as e:
|
| 321 |
+
print(f"Vision Model {item['name']} failed for frame: {e}")
|
| 322 |
+
continue
|
| 323 |
+
|
| 324 |
extracted_descriptions.append(f"Frame {idx_num + 1}: {desc}")
|
| 325 |
|
| 326 |
cap.release()
|
|
|
|
| 448 |
# Augment the LLM with tools
|
| 449 |
tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
|
| 450 |
tools_by_name = {tool.name: tool for tool in tools}
|
| 451 |
+
def extract_text_from_content(content: Any) -> str:
    """Flatten a message ``content`` value into a plain string.

    Args:
        content: Either a string, a list whose items are strings and/or
            dicts carrying a ``"text"`` entry, or any other object.

    Returns:
        * ``content`` itself when it is already a string;
        * for a list, the concatenation (no separator) of every string item
          and every dict item's ``"text"`` value; items without text (for
          example image parts) are ignored;
        * ``""`` for ``None``;
        * ``str(content)`` for anything else.
    """
    if content is None:
        # Avoid leaking the literal string "None" into downstream prompts.
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        text_parts = []
        for part in content:
            if isinstance(part, str):
                text_parts.append(part)
            elif isinstance(part, dict):
                text = part.get("text")
                if text is None:
                    # Non-text part, or a text part with no payload.
                    continue
                # Coerce so that a non-string payload cannot break the join below.
                text_parts.append(text if isinstance(text, str) else str(text))
        return "".join(text_parts)
    return str(content)
|
| 466 |
|
| 467 |
def answer_message(state: AgentState) -> AgentState:
|
| 468 |
messages = state["messages"]
|
|
|
|
| 541 |
print("Max reasoning steps reached. Forcing answer extraction.")
|
| 542 |
forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
|
| 543 |
messages.append(forced_msg)
|
| 544 |
+
draft_response, _ = smart_invoke(messages, use_tools=False)
|
| 545 |
|
| 546 |
# Third pass: strict GAIA formatting extraction
|
| 547 |
formatting_sys = SystemMessage(
|
|
|
|
| 554 |
"If it is a name or word, just return the exact string. If a list, return only the comma-separated list."
|
| 555 |
)
|
| 556 |
)
|
| 557 |
+
final_response, _ = smart_invoke([formatting_sys, HumanMessage(content=extract_text_from_content(draft_response.content))], use_tools=False, start_tier=current_tier)
|
| 558 |
print(f"Draft response: {draft_response.content}")
|
| 559 |
print(f"Strict Final response: {final_response.content}")
|
| 560 |
|
| 561 |
# Return messages including the final AIMessage so BasicAgent reads .content
|
| 562 |
+
# Ensure final_response has string content for basic agents
|
| 563 |
+
if not isinstance(final_response.content, str):
|
| 564 |
+
final_response.content = extract_text_from_content(final_response.content)
|
| 565 |
+
|
| 566 |
messages.append(draft_response)
|
| 567 |
messages.append(final_response)
|
| 568 |
return {"messages": messages}
|
check_env.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
# Diagnostic script: report whether a .env file exists in the current working
# directory, load it, and show which API-key variables are set.
# Only variable names and value lengths are printed, never the values.
env_path = os.path.join(os.getcwd(), '.env')
print(f"Checking for .env at: {env_path}")
print(f"File exists: {os.path.exists(env_path)}")

load_dotenv(env_path)

# Names of every environment variable that looks credential-related.
keys = list(os.environ)
relevant_keys = []
for k in keys:
    for x in ["API_KEY", "TOKEN", "GOOGLE", "GROQ", "NVIDIA", "VERCEL", "OPENROUTER"]:
        if x in k:
            relevant_keys.append(k)
            break
print(f"Relevant keys found: {relevant_keys}")

# Per-variable status for the keys this script checks by exact name.
needed = ["NVIDIA_API_KEY", "VERCEL_API_KEY", "OPENROUTER_API_KEY", "GOOGLE_API_KEY", "GROQ_API_KEY"]
for k in needed:
    val = os.getenv(k)
    if val:
        status = "PRESENT (length={})".format(len(val))
    else:
        # Unset and empty values are both reported as missing.
        status = "MISSING"
    print(f"{k}: {status}")
|
check_env_v2.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
# Diagnostic script: same check as a plain .env probe, but values from the
# .env file override variables already present in the process environment,
# and variable names are matched case-insensitively.
env_path = os.path.join(os.getcwd(), '.env')
print(f"Checking for .env at: {env_path}")
print(f"File exists: {os.path.exists(env_path)}")

load_dotenv(env_path, override=True)

# Names of every environment variable that looks credential-related,
# compared in upper case.
keys = list(os.environ)
relevant_keys = []
for k in keys:
    upper_name = k.upper()
    for x in ["API_KEY", "TOKEN", "GOOGLE", "GROQ", "NVIDIA", "VERCEL", "OPENROUTER"]:
        if x in upper_name:
            relevant_keys.append(k)
            break
print(f"Relevant keys found: {relevant_keys}")

# Per-variable status for the keys this script checks.
needed = ["NVIDIA_API_KEY", "VERCEL_API_KEY", "OPENROUTER_API_KEY", "GOOGLE_API_KEY", "GROQ_API_KEY"]
for k in needed:
    # First environment variable whose upper-cased name equals the wanted key.
    found_key = None
    for key in keys:
        if key.upper() == k:
            found_key = key
            break
    if not found_key:
        print(f"{k}: MISSING")
        continue
    val = os.getenv(found_key)
    print(f"{found_key}: PRESENT (length={len(val)})")
|
test_vision.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_openai import ChatOpenAI
|
| 3 |
+
from langchain_core.messages import HumanMessage
|
| 4 |
+
import base64
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
def test_vision():
    """Send one tiny image to each candidate vision endpoint and print the outcome.

    Every candidate is called through ``ChatOpenAI`` with its own base URL.
    Candidates whose API-key environment variable is unset are skipped, and
    a failure on one candidate is printed without stopping the others.
    """
    # Base64 payload of a tiny 1x1 PNG, embedded as a data URL.
    tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    text_part = {"type": "text", "text": "what is in this image?"}
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}
    msg = HumanMessage(content=[text_part, image_part])

    models = [
        {"name": "OpenRouter-Gemini-2.0", "provider": "openai", "model": "google/gemini-2.0-flash-001", "base_url": "https://openrouter.ai/api/v1", "key": "OPENROUTER_API_KEY"},
        {"name": "NVIDIA-Llama-3.2", "provider": "openai", "model": "nvidia/llama-3.2-nv-vision-70b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
        {"name": "NVIDIA-Qwen-VL", "provider": "openai", "model": "nvidia/qwen-vl-max", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
        {"name": "Vercel-Vision", "provider": "openai", "model": "gpt-4o-mini", "base_url": "https://gateway.ai.vercel.com/v1", "key": "VERCEL_API_KEY"},
    ]

    for m in models:
        key = os.getenv(m['key'])
        if key:
            try:
                print(f"Testing {m['name']} ({m['model']})...")
                llm = ChatOpenAI(
                    model=m['model'],
                    openai_api_key=key,
                    openai_api_base=m['base_url'],
                    temperature=0,
                )
                res = llm.invoke([msg])
                print(f"Success: {res.content}")
            except Exception as e:
                # Report and keep going so every endpoint gets probed.
                print(f"Fail: {e}")
        else:
            print(f"Skip {m['name']} (no key)")
|
| 33 |
+
|
| 34 |
+
if __name__ == "__main__":
|
| 35 |
+
test_vision()
|
test_vision_v2.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_openai import ChatOpenAI
|
| 3 |
+
from langchain_core.messages import HumanMessage
|
| 4 |
+
import base64
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
load_dotenv(override=True)
|
| 8 |
+
|
| 9 |
+
def test_vision():
    """Ask each NVIDIA-hosted vision model a one-word question about a tiny image.

    Every candidate is called through ``ChatOpenAI`` with its own base URL.
    Candidates whose API-key environment variable is unset are skipped, and
    a failure on one candidate is printed without stopping the others.
    """
    # Base64 payload of a tiny 1x1 PNG, embedded as a data URL.
    tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    text_part = {"type": "text", "text": "is this image red, green, or blue? answer with one word."}
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}
    msg = HumanMessage(content=[text_part, image_part])

    models = [
        {"name": "NVIDIA-Llama-3.2-11b", "provider": "openai", "model": "meta/llama-3.2-11b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
        {"name": "NVIDIA-Llama-3.2-90b", "provider": "openai", "model": "meta/llama-3.2-90b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
        {"name": "NVIDIA-Mistral-Vision", "provider": "openai", "model": "mistralai/pixtral-12b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
    ]

    for m in models:
        key = os.getenv(m['key'])
        if key:
            try:
                print(f"Testing {m['name']} ({m['model']})...")
                llm = ChatOpenAI(
                    model=m['model'],
                    openai_api_key=key,
                    openai_api_base=m['base_url'],
                    temperature=0,
                )
                res = llm.invoke([msg])
                print(f"Success: {res.content}")
            except Exception as e:
                # Report and keep going so every endpoint gets probed.
                print(f"Fail: {e}")
        else:
            print(f"Skip {m['name']} (no key)")
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
|
| 34 |
+
test_vision()
|