Paperbag committed on
Commit
a5ab16b
·
1 Parent(s): f0a6306

feat: Add image and video analysis tools using Groq Vision, integrate file attachment handling into the agent, and configure VS Code Python settings.

Browse files
.vscode/settings.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "python.defaultInterpreterPath": "${workspaceFolder}\\.venv\\Scripts\\python.exe",
3
+ "python.terminal.activateEnvironment": true
4
+ }
__pycache__/agent.cpython-39.pyc CHANGED
Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ
 
agent.py CHANGED
@@ -11,7 +11,11 @@ from dotenv import load_dotenv
11
  from groq import Groq
12
  from langchain_groq import ChatGroq
13
  from langchain_community.document_loaders.image import UnstructuredImageLoader
14
-
 
 
 
 
15
 
16
  load_dotenv()
17
 
@@ -80,26 +84,89 @@ def wiki_search(query: str) -> str:
80
 
81
 
82
 
83
- # @tool
84
- # def get_image_file(task_id):
85
- # """
86
- # Get the image file from the question
87
- # Use cases:
88
- # - Extract Image from the question
89
-
90
- # Args:
91
- # task_id: the task_id of the question
92
-
93
- # Returns:
94
- # Image file result
95
- # """
96
-
97
- # loader = UnstructuredImageLoader("./example_data/layout-parser-paper-screenshot.png")
98
-
99
- # data = loader.load()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # data[0]
102
- # return ''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  system_prompt = """
@@ -143,7 +210,7 @@ def restart_required(state: AgentState) -> AgentState:
143
  # return {"messages": messages + [response]}
144
 
145
  # Augment the LLM with tools
146
- tools = [web_search,wiki_search]
147
  tools_by_name = {tool.name: tool for tool in tools}
148
  model_with_tools = model.bind_tools(tools)
149
 
@@ -155,6 +222,7 @@ def answer_message(state: AgentState) -> AgentState:
155
  Think carefully before answering the question.
156
  Do not include any thought process before answering the question, and only response exactly what was being asked of you.
157
  If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
 
158
 
159
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
160
  If you are asked for a number, don't use comma to write your number, and don't use units such as $ or percent sign unless specified otherwise.
 
11
  from groq import Groq
12
  from langchain_groq import ChatGroq
13
  from langchain_community.document_loaders.image import UnstructuredImageLoader
14
+ import base64
15
+ try:
16
+ import cv2
17
+ except ImportError:
18
+ cv2 = None
19
 
20
  load_dotenv()
21
 
 
84
 
85
 
86
 
87
@tool
def analyze_image(image_path: str, question: str) -> str:
    """
    Analyzes a local image file to answer a specific question.

    Use this tool when you need to extract visual information from an
    image file attached to the task.

    Args:
        image_path: The local filesystem path to the image file.
        question: The specific question to ask about the image.

    Returns:
        The vision model's answer as a string, or an
        "Error analyzing image: ..." message on failure.
    """
    import mimetypes  # stdlib; local import keeps the tool self-contained

    try:
        # Encode the local file to base64 for the data-URL payload.
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

        # Use the file's real MIME type instead of always claiming JPEG,
        # so PNG/GIF/WebP inputs are labelled correctly in the data URL.
        mime_type, _ = mimetypes.guess_type(image_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"  # fallback for unknown extensions

        # Create a separate Vision LLM call specific to the image
        vision_model = ChatGroq(model="llama-3.2-90b-vision-preview", temperature=0)

        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
                },
            ]
        )
        response = vision_model.invoke([message])
        return response.content
    except Exception as e:
        # Report the failure back to the agent as a tool result rather
        # than raising, so the agent can state the limitation it faces.
        return f"Error analyzing image: {str(e)}"
118
 
119
@tool
def analyze_video(video_path: str, question: str) -> str:
    """
    Analyzes a video file to answer questions about its content.

    Extracts up to 5 evenly spaced key frames, asks a vision model to
    describe each, and returns the combined descriptions.

    Args:
        video_path: The local filesystem path to the video file.
        question: The specific question to ask about the video.

    Returns:
        A frame-by-frame summary string, or an
        "Error ..." message on failure.
    """
    if cv2 is None:
        return "Error: cv2 is not installed. Please install opencv-python."
    cap = None
    try:
        # 1. Extract frames evenly spaced throughout the video
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Error: Could not open video file."
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames == 0:
            return "Error: Could not read video frames."

        # Take 5 frames as a summary
        frame_indices = [int(i * total_frames / 5) for i in range(5)]
        extracted_descriptions = []

        vision_model = ChatGroq(model="llama-3.2-90b-vision-preview", temperature=0)

        for idx_num, frame_idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                # Skip frames the decoder cannot read instead of aborting.
                continue
            # Convert frame to base64 (JPEG is correct here: we encode it)
            _, buffer = cv2.imencode('.jpg', frame)
            encoded_image = base64.b64encode(buffer).decode('utf-8')

            # Ask the vision model to describe the frame
            msg = HumanMessage(
                content=[
                    {"type": "text", "text": f"Describe what is happening in this video frame concisely. Focus on aspects related to: {question}"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
                ]
            )
            desc = vision_model.invoke([msg]).content
            extracted_descriptions.append(f"Frame {idx_num + 1}: {desc}")

        if not extracted_descriptions:
            # Every read failed: report an error instead of an empty summary.
            return "Error: Could not decode any frames from the video."

        # 2. Compile the context for the agent
        video_context = "\n".join(extracted_descriptions)

        return f"Video Summary based on extracted frames:\n{video_context}"
    except Exception as e:
        return f"Error analyzing video: {str(e)}"
    finally:
        # Always release the capture handle — the original leaked it on the
        # zero-frame early return and on any exception path.
        if cap is not None:
            cap.release()
170
 
171
 
172
  system_prompt = """
 
210
  # return {"messages": messages + [response]}
211
 
212
  # Augment the LLM with tools
213
+ tools = [web_search, wiki_search, analyze_image, analyze_video]
214
  tools_by_name = {tool.name: tool for tool in tools}
215
  model_with_tools = model.bind_tools(tools)
216
 
 
222
  Think carefully before answering the question.
223
  Do not include any thought process before answering the question, and only response exactly what was being asked of you.
224
  If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
225
+ If a file is attached, use the appropriate tool (analyze_image or analyze_video) to answer the question based on the file content.
226
 
227
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
228
  If you are asked for a number, don't use comma to write your number, and don't use units such as $ or percent sign unless specified otherwise.
app copy.py CHANGED
@@ -6,7 +6,9 @@ import pandas as pd
6
  from langchain_core.messages import HumanMessage
7
  from agent import build_graph
8
  from huggingface_hub import HfApi, hf_hub_download
9
- from logging import logger
 
 
10
 
11
  # (Keep Constants as is)
12
  # --- Constants ---
@@ -58,6 +60,9 @@ for item in questions_data[:5]:
58
  continue
59
  files_text = item.get("files")
60
  task_id = item.get("task_id")
 
 
 
61
  # file = file_extract(,task_id)
62
  print(files_text,task_id)
63
  output = agent(question_text)
 
6
  from langchain_core.messages import HumanMessage
7
  from agent import build_graph
8
  from huggingface_hub import HfApi, hf_hub_download
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
 
60
  continue
61
  files_text = item.get("files")
62
  task_id = item.get("task_id")
63
+ file_name = item.get("file_name")
64
+ if file_name:
65
+ question_text += f"\n\n[Attached File: {file_name}]"
66
  # file = file_extract(,task_id)
67
  print(files_text,task_id)
68
  output = agent(question_text)
app.py CHANGED
@@ -84,9 +84,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
84
  for item in questions_data:
85
  task_id = item.get("task_id")
86
  question_text = item.get("question")
 
87
  if not task_id or question_text is None:
88
  print(f"Skipping item with missing task_id or question: {item}")
89
  continue
 
 
 
90
  try:
91
  submitted_answer = agent(question_text)
92
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
84
  for item in questions_data:
85
  task_id = item.get("task_id")
86
  question_text = item.get("question")
87
+ file_name = item.get("file_name")
88
  if not task_id or question_text is None:
89
  print(f"Skipping item with missing task_id or question: {item}")
90
  continue
91
+
92
+ if file_name:
93
+ question_text += f"\n\n[Attached File: {file_name}]"
94
  try:
95
  submitted_answer = agent(question_text)
96
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
requirements.txt CHANGED
@@ -20,4 +20,5 @@ pandas
20
  numpy
21
  ddgs
22
  groq
23
- unstructured[all-docs]
 
 
20
  numpy
21
  ddgs
22
  groq
23
+ unstructured[all-docs]
24
+ opencv-python