giulia-fontanella committed on
Commit
8234b7a
·
verified ·
1 Parent(s): 27a61fa

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +75 -18
tools.py CHANGED
@@ -3,8 +3,9 @@ import pandas as pd
3
  from langchain_core.messages import HumanMessage
4
  from langchain.tools import tool
5
  from langchain_community.tools.tavily_search import TavilySearchResults
6
- from langchain_community.document_loaders import WikipediaLoader
7
- from langchain_community.document_loaders import ArxivLoader
 
8
 
9
 
10
  @tool
@@ -49,8 +50,8 @@ def read_python(file_path: str) -> str:
49
 
50
 
51
  class ExtractTextFromImage:
52
- def __init__(self, vision_llm):
53
- self.vision_llm = vision_llm
54
 
55
  def __call__(self, img_path: str) -> str:
56
  """
@@ -92,7 +93,7 @@ class ExtractTextFromImage:
92
  ]
93
 
94
  # Call the vision-capable model
95
- response = self.vision_llm.invoke(message)
96
 
97
  # Append extracted text
98
  all_text += response.content + "\n\n"
@@ -105,10 +106,10 @@ class ExtractTextFromImage:
105
 
106
 
107
  class DescribeImage:
108
- def __init__(self, vision_llm):
109
- self.vision_llm = vision_llm
110
 
111
- def __call__(self, img_path: str) -> str:
112
  """
113
  Generate a detailed description of an image.
114
  This function reads a image from an url, encodes it, and sends it to a
@@ -148,7 +149,7 @@ class DescribeImage:
148
  ]
149
  )
150
  ]
151
- response = self.vision_llm.invoke(message)
152
  return response.content.strip()
153
 
154
  except Exception as e:
@@ -158,16 +159,10 @@ class DescribeImage:
158
 
159
 
160
  class TranscribeAudio:
161
- def __init__(self, audio_llm):
162
- """
163
- Initialize with a LangChain-compatible vision+audio GPT-4o model.
164
-
165
- Args:
166
- audio_llm: A LangChain Runnable for GPT-4o (must support audio inputs).
167
- """
168
- self.audio_llm = audio_llm
169
 
170
- def __call__(self, audio_path: str) -> str:
171
  """
172
  Transcribe an MP3 file.
173
 
@@ -212,6 +207,68 @@ class TranscribeAudio:
212
  print(error_msg)
213
  return ""
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  @tool
216
  def wiki_search(query: str) -> str:
217
  """Search Wikipedia for a query and return maximum 2 results.
 
3
  from langchain_core.messages import HumanMessage
4
  from langchain.tools import tool
5
  from langchain_community.tools.tavily_search import TavilySearchResults
6
+ from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
7
+ from langchain_core.video import VideoFile
8
+ import yt_dlp
9
 
10
 
11
  @tool
 
50
 
51
 
52
  class ExtractTextFromImage:
53
+ def __init__(self, multimodal_model):
54
+ self.multimodal_model = multimodal_model
55
 
56
  def __call__(self, img_path: str) -> str:
57
  """
 
93
  ]
94
 
95
  # Call the vision-capable model
96
+ response = self.multimodal_model.invoke(message)
97
 
98
  # Append extracted text
99
  all_text += response.content + "\n\n"
 
106
 
107
 
108
  class DescribeImage:
109
+ def __init__(self, multimodal_model):
110
+ self.multimodal_model = multimodal_model
111
 
112
+ def __call__(self, img_path: str, query: str) -> str:
113
  """
114
  Generate a detailed description of an image.
115
  This function reads a image from an url, encodes it, and sends it to a
 
149
  ]
150
  )
151
  ]
152
+ response = self.multimodal_model.invoke(message)
153
  return response.content.strip()
154
 
155
  except Exception as e:
 
159
 
160
 
161
  class TranscribeAudio:
162
+ def __init__(self, multimodal_model):
163
+ self.multimodal_model = multimodal_model
 
 
 
 
 
 
164
 
165
+ def __call__(self, audio_path: str, query:str) -> str:
166
  """
167
  Transcribe an MP3 file.
168
 
 
207
  print(error_msg)
208
  return ""
209
 
210
+
211
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """
    Download a YouTube video as an MP4 file.

    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.

    Returns:
        Path to the saved video file (the same ``output_path`` that was passed in).
    """
    # Prefer a separate MP4 video + M4A audio pair (merged into a single MP4);
    # fall back to the best single-file MP4, then to whatever format exists.
    download_options = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': output_path,
        'merge_output_format': 'mp4',
        'quiet': True,  # suppress yt_dlp's progress output
    }
    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([youtube_url])
    return output_path
231
+
232
+
233
class AnalyzeVideo:
    """Answer a free-form query about a local video file using a multimodal LLM."""

    def __init__(self, multimodal_model):
        """
        Args:
            multimodal_model: A LangChain Runnable chat model that accepts
                video content blocks (e.g. a Gemini model via
                langchain-google-genai).
        """
        self.multimodal_model = multimodal_model

    def __call__(self, video_path: str, query: str) -> str:
        """
        Send the video together with the query to the multimodal model.

        Args:
            video_path: Path to a local MP4 file.
            query: The question/request to answer about the video.

        Returns:
            The model's answer with surrounding whitespace stripped, or ""
            on any error (the error is printed, mirroring the other tools
            in this module).
        """
        import base64  # local import: only this code path needs it

        try:
            with open(video_path, "rb") as video_file:
                video_bytes = video_file.read()

            # FIX: `langchain_core` provides no `VideoFile` type (the
            # `from langchain_core.video import VideoFile` import fails), so
            # inline the video as a base64-encoded media content block — the
            # format Gemini-style multimodal chat models accept via LangChain,
            # and consistent with how this module's image tools encode data.
            video_b64 = base64.b64encode(video_bytes).decode("utf-8")

            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                f"In relation to this video, answer the following request: {query} "
                            ),
                        },
                        {
                            "type": "media",
                            "mime_type": "video/mp4",
                            "data": video_b64,
                        },
                    ]
                )
            ]

            response = self.multimodal_model.invoke(message)
            return response.content.strip()

        except Exception as e:
            # Best-effort tool: report and return an empty answer rather
            # than propagate, matching the other tools in this file.
            print(f"Error analyzing video: {str(e)}")
            return ""
270
+
271
+
272
  @tool
273
  def wiki_search(query: str) -> str:
274
  """Search Wikipedia for a query and return maximum 2 results.