Paperbag committed on
Commit
a5ab16b
·
1 Parent(s): f0a6306

feat: Add image and video analysis tools using Groq Vision, integrate file attachment handling into the agent, and configure VS Code Python settings.

Browse files
.vscode/settings.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "python.defaultInterpreterPath": "${workspaceFolder}\\.venv\\Scripts\\python.exe",
3
+ "python.terminal.activateEnvironment": true
4
+ }
__pycache__/agent.cpython-39.pyc CHANGED
Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ
 
agent.py CHANGED
@@ -11,7 +11,11 @@ from dotenv import load_dotenv
11
  from groq import Groq
12
  from langchain_groq import ChatGroq
13
  from langchain_community.document_loaders.image import UnstructuredImageLoader
14
-
 
 
 
 
15
 
16
  load_dotenv()
17
 
@@ -80,26 +84,89 @@ def wiki_search(query: str) -> str:
80
 
81
 
82
 
83
- # @tool
84
- # def get_image_file(task_id):
85
- # """
86
- # Get the image file from the question
87
- # Use cases:
88
- # - Extract Image from the question
89
-
90
- # Args:
91
- # task_id: the task_id of the question
92
-
93
- # Returns:
94
- # Image file result
95
- # """
96
-
97
- # loader = UnstructuredImageLoader("./example_data/layout-parser-paper-screenshot.png")
98
-
99
- # data = loader.load()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # data[0]
102
- # return ''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  system_prompt = """
@@ -143,7 +210,7 @@ def restart_required(state: AgentState) -> AgentState:
143
  # return {"messages": messages + [response]}
144
 
145
  # Augment the LLM with tools
146
- tools = [web_search,wiki_search]
147
  tools_by_name = {tool.name: tool for tool in tools}
148
  model_with_tools = model.bind_tools(tools)
149
 
@@ -155,6 +222,7 @@ def answer_message(state: AgentState) -> AgentState:
155
  Think carefully before answering the question.
156
  Do not include any thought process before answering the question, and only response exactly what was being asked of you.
157
  If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
 
158
 
159
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
160
  If you are asked for a number, don't use comma to write your number, and don't use units such as $ or percent sign unless specified otherwise.
 
11
  from groq import Groq
12
  from langchain_groq import ChatGroq
13
  from langchain_community.document_loaders.image import UnstructuredImageLoader
14
+ import base64
15
+ try:
16
+ import cv2
17
+ except ImportError:
18
+ cv2 = None
19
 
20
  load_dotenv()
21
 
 
84
 
85
 
86
 
87
@tool
def analyze_image(image_path: str, question: str) -> str:
    """
    Analyzes a local image file to answer a specific question.

    Use this tool when you need to extract visual information from an
    image file attached to the task.

    Args:
        image_path: The local filesystem path to the image file.
        question: The specific question to ask about the image.

    Returns:
        The vision model's answer as a string, or an
        "Error analyzing image: ..." message on failure.
    """
    import mimetypes  # stdlib; local import keeps the tool self-contained

    try:
        # Encode the local file to base64 for the data-URL payload.
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

        # Use the file's real MIME type instead of always claiming JPEG,
        # so PNG/GIF/WebP inputs are labelled correctly in the data URL.
        mime_type, _ = mimetypes.guess_type(image_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"  # fallback for unknown extensions

        # Create a separate Vision LLM call specific to the image
        vision_model = ChatGroq(model="llama-3.2-90b-vision-preview", temperature=0)

        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
                },
            ]
        )
        response = vision_model.invoke([message])
        return response.content
    except Exception as e:
        # Report the failure back to the agent as a tool result rather
        # than raising, so the agent can state the limitation it faces.
        return f"Error analyzing image: {str(e)}"
118
 
119
@tool
def analyze_video(video_path: str, question: str) -> str:
    """
    Analyzes a video file to answer questions about its content.

    Extracts up to 5 evenly spaced key frames, asks a vision model to
    describe each, and returns the combined descriptions.

    Args:
        video_path: The local filesystem path to the video file.
        question: The specific question to ask about the video.

    Returns:
        A frame-by-frame summary string, or an
        "Error ..." message on failure.
    """
    if cv2 is None:
        return "Error: cv2 is not installed. Please install opencv-python."
    cap = None
    try:
        # 1. Extract frames evenly spaced throughout the video
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Error: Could not open video file."
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames == 0:
            return "Error: Could not read video frames."

        # Take 5 frames as a summary
        frame_indices = [int(i * total_frames / 5) for i in range(5)]
        extracted_descriptions = []

        vision_model = ChatGroq(model="llama-3.2-90b-vision-preview", temperature=0)

        for idx_num, frame_idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                # Skip frames the decoder cannot read instead of aborting.
                continue
            # Convert frame to base64 (JPEG is correct here: we encode it)
            _, buffer = cv2.imencode('.jpg', frame)
            encoded_image = base64.b64encode(buffer).decode('utf-8')

            # Ask the vision model to describe the frame
            msg = HumanMessage(
                content=[
                    {"type": "text", "text": f"Describe what is happening in this video frame concisely. Focus on aspects related to: {question}"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
                ]
            )
            desc = vision_model.invoke([msg]).content
            extracted_descriptions.append(f"Frame {idx_num + 1}: {desc}")

        if not extracted_descriptions:
            # Every read failed: report an error instead of an empty summary.
            return "Error: Could not decode any frames from the video."

        # 2. Compile the context for the agent
        video_context = "\n".join(extracted_descriptions)

        return f"Video Summary based on extracted frames:\n{video_context}"
    except Exception as e:
        return f"Error analyzing video: {str(e)}"
    finally:
        # Always release the capture handle — the original leaked it on the
        # zero-frame early return and on any exception path.
        if cap is not None:
            cap.release()
170
 
171
 
172
  system_prompt = """
 
210
  # return {"messages": messages + [response]}
211
 
212
  # Augment the LLM with tools
213
+ tools = [web_search, wiki_search, analyze_image, analyze_video]
214
  tools_by_name = {tool.name: tool for tool in tools}
215
  model_with_tools = model.bind_tools(tools)
216
 
 
222
  Think carefully before answering the question.
223
  Do not include any thought process before answering the question, and only response exactly what was being asked of you.
224
  If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
225
+ If a file is attached, use the appropriate tool (analyze_image or analyze_video) to answer the question based on the file content.
226
 
227
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
228
  If you are asked for a number, don't use comma to write your number, and don't use units such as $ or percent sign unless specified otherwise.
app copy.py CHANGED
@@ -6,7 +6,9 @@ import pandas as pd
6
  from langchain_core.messages import HumanMessage
7
  from agent import build_graph
8
  from huggingface_hub import HfApi, hf_hub_download
9
- from logging import logger
 
 
10
 
11
  # (Keep Constants as is)
12
  # --- Constants ---
@@ -58,6 +60,9 @@ for item in questions_data[:5]:
58
  continue
59
  files_text = item.get("files")
60
  task_id = item.get("task_id")
 
 
 
61
  # file = file_extract(,task_id)
62
  print(files_text,task_id)
63
  output = agent(question_text)
 
6
  from langchain_core.messages import HumanMessage
7
  from agent import build_graph
8
  from huggingface_hub import HfApi, hf_hub_download
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
 
60
  continue
61
  files_text = item.get("files")
62
  task_id = item.get("task_id")
63
+ file_name = item.get("file_name")
64
+ if file_name:
65
+ question_text += f"\n\n[Attached File: {file_name}]"
66
  # file = file_extract(,task_id)
67
  print(files_text,task_id)
68
  output = agent(question_text)
app.py CHANGED
@@ -84,9 +84,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
84
  for item in questions_data:
85
  task_id = item.get("task_id")
86
  question_text = item.get("question")
 
87
  if not task_id or question_text is None:
88
  print(f"Skipping item with missing task_id or question: {item}")
89
  continue
 
 
 
90
  try:
91
  submitted_answer = agent(question_text)
92
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
84
  for item in questions_data:
85
  task_id = item.get("task_id")
86
  question_text = item.get("question")
87
+ file_name = item.get("file_name")
88
  if not task_id or question_text is None:
89
  print(f"Skipping item with missing task_id or question: {item}")
90
  continue
91
+
92
+ if file_name:
93
+ question_text += f"\n\n[Attached File: {file_name}]"
94
  try:
95
  submitted_answer = agent(question_text)
96
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
requirements.txt CHANGED
@@ -20,4 +20,5 @@ pandas
20
  numpy
21
  ddgs
22
  groq
23
- unstructured[all-docs]
 
 
20
  numpy
21
  ddgs
22
  groq
23
+ unstructured[all-docs]
24
+ opencv-python