feat: Add image and video analysis tools using Groq Vision, integrate file attachment handling into the agent, and configure VS Code Python settings.
Browse files- .vscode/settings.json +4 -0
- __pycache__/agent.cpython-39.pyc +0 -0
- agent.py +89 -21
- app copy.py +6 -1
- app.py +4 -0
- requirements.txt +2 -1
.vscode/settings.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"python.defaultInterpreterPath": "${workspaceFolder}\\.venv\\Scripts\\python.exe",
|
| 3 |
+
"python.terminal.activateEnvironment": true
|
| 4 |
+
}
|
__pycache__/agent.cpython-39.pyc
CHANGED
|
Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ
|
|
|
agent.py
CHANGED
|
@@ -11,7 +11,11 @@ from dotenv import load_dotenv
|
|
| 11 |
from groq import Groq
|
| 12 |
from langchain_groq import ChatGroq
|
| 13 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
load_dotenv()
|
| 17 |
|
|
@@ -80,26 +84,89 @@ def wiki_search(query: str) -> str:
|
|
| 80 |
|
| 81 |
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
#
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
system_prompt = """
|
|
@@ -143,7 +210,7 @@ def restart_required(state: AgentState) -> AgentState:
|
|
| 143 |
# return {"messages": messages + [response]}
|
| 144 |
|
| 145 |
# Augment the LLM with tools
|
| 146 |
-
tools = [web_search,wiki_search]
|
| 147 |
tools_by_name = {tool.name: tool for tool in tools}
|
| 148 |
model_with_tools = model.bind_tools(tools)
|
| 149 |
|
|
@@ -155,6 +222,7 @@ def answer_message(state: AgentState) -> AgentState:
|
|
| 155 |
Think carefully before answering the question.
|
| 156 |
Do not include any thought process before answering the question, and only response exactly what was being asked of you.
|
| 157 |
If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
|
|
|
|
| 158 |
|
| 159 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
| 160 |
If you are asked for a number, don't use comma to write your number, and don't use units such as $ or percent sign unless specified otherwise.
|
|
|
|
| 11 |
from groq import Groq
|
| 12 |
from langchain_groq import ChatGroq
|
| 13 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
| 14 |
+
import base64
|
| 15 |
+
try:
|
| 16 |
+
import cv2
|
| 17 |
+
except ImportError:
|
| 18 |
+
cv2 = None
|
| 19 |
|
| 20 |
load_dotenv()
|
| 21 |
|
|
|
|
| 84 |
|
| 85 |
|
| 86 |
|
| 87 |
+
@tool
def analyze_image(image_path: str, question: str) -> str:
    """
    Analyzes an image to answer a specific question.
    Use this tool when you need to extract visual information from an image file.

    Args:
        image_path: The local path or URL to the image file.
        question: The specific question to ask about the image.
    """
    # NOTE(review): the docstring above is kept verbatim on purpose; `@tool`
    # presumably exposes it to the LLM as the tool description, so rewording
    # it would change the agent's prompt - confirm against the decorator used.
    #
    # Returns the vision model's answer, or an "Error analyzing image: ..."
    # string on failure (errors are reported as text rather than raised, so
    # the calling agent can read them as a tool result).
    import mimetypes  # local import: only this tool needs it

    try:
        if image_path.lower().startswith(("http://", "https://")):
            # The docstring advertises URL support; a remote image is passed
            # by reference instead of being opened as a local file.
            image_url = image_path
        else:
            # Local file: inline it as a base64 data URI.
            with open(image_path, "rb") as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the payload with its real type (png, gif, webp, ...);
            # fall back to JPEG, the previously hard-coded value, when the
            # extension is unknown or is not an image type.
            mime_type, _ = mimetypes.guess_type(image_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/jpeg"
            image_url = f"data:{mime_type};base64,{encoded_image}"

        # Create a separate Vision LLM call specific to the image
        vision_model = ChatGroq(model="llama-3.2-90b-vision-preview", temperature=0)

        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                },
            ]
        )
        response = vision_model.invoke([message])
        return response.content
    except Exception as e:
        # Tool boundary: surface every failure to the agent as text.
        return f"Error analyzing image: {str(e)}"
|
| 118 |
|
| 119 |
+
@tool
def analyze_video(video_path: str, question: str) -> str:
    """
    Analyzes a video file to answer questions about its content.
    Extracts key frames and describes what is happening.

    Args:
        video_path: The local path to the video file.
        question: The specific question to ask about the video.
    """
    # NOTE(review): the docstring above is kept verbatim on purpose; `@tool`
    # presumably exposes it to the LLM as the tool description - confirm.
    #
    # Returns a newline-joined list of per-frame descriptions prefixed with
    # "Video Summary based on extracted frames:", or an "Error ..." string
    # when OpenCV is missing, no frame can be read, or any step raises.
    if cv2 is None:
        return "Error: cv2 is not installed. Please install opencv-python."

    sample_count = 5  # number of evenly spaced frames used as the summary
    cap = None
    try:
        # 1. Extract frames evenly spaced throughout the video
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # `<= 0` also rejects a negative frame count, not only zero.
        if total_frames <= 0:
            return "Error: Could not read video frames."

        # Clips shorter than `sample_count` frames map several samples onto
        # the same index; de-duplicate (order preserved) so the same frame
        # is not sent to the vision model more than once.
        frame_indices = list(
            dict.fromkeys(
                int(i * total_frames / sample_count) for i in range(sample_count)
            )
        )
        extracted_descriptions = []

        vision_model = ChatGroq(model="llama-3.2-90b-vision-preview", temperature=0)

        for idx_num, frame_idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                # Frame could not be decoded; skip it and keep sampling.
                continue

            # Convert frame to base64
            encoded_ok, buffer = cv2.imencode('.jpg', frame)
            if not encoded_ok:
                # JPEG encoding failed; do not send a broken payload.
                continue
            encoded_image = base64.b64encode(buffer).decode('utf-8')

            # Ask the vision model to describe the frame
            msg = HumanMessage(
                content=[
                    {"type": "text", "text": f"Describe what is happening in this video frame concisely. Focus on aspects related to: {question}"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
                ]
            )
            desc = vision_model.invoke([msg]).content
            extracted_descriptions.append(f"Frame {idx_num + 1}: {desc}")

        # An empty "summary" would read like a successful analysis; report
        # the failure explicitly when not a single frame was usable.
        if not extracted_descriptions:
            return "Error: Could not read video frames."

        # 2. Compile the context for the agent
        video_context = "\n".join(extracted_descriptions)

        return f"Video Summary based on extracted frames:\n{video_context}"
    except Exception as e:
        # Tool boundary: surface every failure to the agent as text.
        return f"Error analyzing video: {str(e)}"
    finally:
        # Release the capture on every path (early return, success, error);
        # previously it was only released after a fully successful loop.
        if cap is not None:
            cap.release()
|
| 170 |
|
| 171 |
|
| 172 |
system_prompt = """
|
|
|
|
| 210 |
# return {"messages": messages + [response]}
|
| 211 |
|
| 212 |
# Augment the LLM with tools
|
| 213 |
+
tools = [web_search, wiki_search, analyze_image, analyze_video]
|
| 214 |
tools_by_name = {tool.name: tool for tool in tools}
|
| 215 |
model_with_tools = model.bind_tools(tools)
|
| 216 |
|
|
|
|
| 222 |
Think carefully before answering the question.
|
| 223 |
Do not include any thought process before answering the question, and only response exactly what was being asked of you.
|
| 224 |
If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
|
| 225 |
+
If a file is attached, use the appropriate tool (analyze_image or analyze_video) to answer the question based on the file content.
|
| 226 |
|
| 227 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
| 228 |
If you are asked for a number, don't use comma to write your number, and don't use units such as $ or percent sign unless specified otherwise.
|
app copy.py
CHANGED
|
@@ -6,7 +6,9 @@ import pandas as pd
|
|
| 6 |
from langchain_core.messages import HumanMessage
|
| 7 |
from agent import build_graph
|
| 8 |
from huggingface_hub import HfApi, hf_hub_download
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# (Keep Constants as is)
|
| 12 |
# --- Constants ---
|
|
@@ -58,6 +60,9 @@ for item in questions_data[:5]:
|
|
| 58 |
continue
|
| 59 |
files_text = item.get("files")
|
| 60 |
task_id = item.get("task_id")
|
|
|
|
|
|
|
|
|
|
| 61 |
# file = file_extract(,task_id)
|
| 62 |
print(files_text,task_id)
|
| 63 |
output = agent(question_text)
|
|
|
|
| 6 |
from langchain_core.messages import HumanMessage
|
| 7 |
from agent import build_graph
|
| 8 |
from huggingface_hub import HfApi, hf_hub_download
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
# (Keep Constants as is)
|
| 14 |
# --- Constants ---
|
|
|
|
| 60 |
continue
|
| 61 |
files_text = item.get("files")
|
| 62 |
task_id = item.get("task_id")
|
| 63 |
+
file_name = item.get("file_name")
|
| 64 |
+
if file_name:
|
| 65 |
+
question_text += f"\n\n[Attached File: {file_name}]"
|
| 66 |
# file = file_extract(,task_id)
|
| 67 |
print(files_text,task_id)
|
| 68 |
output = agent(question_text)
|
app.py
CHANGED
|
@@ -84,9 +84,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 84 |
for item in questions_data:
|
| 85 |
task_id = item.get("task_id")
|
| 86 |
question_text = item.get("question")
|
|
|
|
| 87 |
if not task_id or question_text is None:
|
| 88 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 89 |
continue
|
|
|
|
|
|
|
|
|
|
| 90 |
try:
|
| 91 |
submitted_answer = agent(question_text)
|
| 92 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
|
|
|
| 84 |
for item in questions_data:
|
| 85 |
task_id = item.get("task_id")
|
| 86 |
question_text = item.get("question")
|
| 87 |
+
file_name = item.get("file_name")
|
| 88 |
if not task_id or question_text is None:
|
| 89 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 90 |
continue
|
| 91 |
+
|
| 92 |
+
if file_name:
|
| 93 |
+
question_text += f"\n\n[Attached File: {file_name}]"
|
| 94 |
try:
|
| 95 |
submitted_answer = agent(question_text)
|
| 96 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
requirements.txt
CHANGED
|
@@ -20,4 +20,5 @@ pandas
|
|
| 20 |
numpy
|
| 21 |
ddgs
|
| 22 |
groq
|
| 23 |
-
unstructured[all-docs]
|
|
|
|
|
|
| 20 |
numpy
|
| 21 |
ddgs
|
| 22 |
groq
|
| 23 |
+
unstructured[all-docs]
|
| 24 |
+
opencv-python
|