Agent_Course_Final_Assignment

Sleeping

File size: 9,751 Bytes

import base64
import mimetypes

import ffmpeg
import pandas as pd
import yt_dlp
from langchain.tools import tool
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage


@tool
def read_excel(file_path: str) -> str:
    """
    Render every sheet of an Excel workbook (.xlsx or .xls) as plain text.

    Args:
        file_path: Path to the Excel file.

    Returns:
        One "Sheet: <name>" section per sheet (row index omitted), with
        sections separated by a blank line. If the workbook cannot be read,
        an error message string is returned instead of raising.
    """
    try:
        # sheet_name=None asks pandas for all sheets as a {name: DataFrame} mapping.
        workbook = pd.read_excel(file_path, sheet_name=None)
        sections = [
            f"Sheet: {name}\n{frame.to_string(index=False)}"
            for name, frame in workbook.items()
        ]
        return "\n\n".join(sections)
    except Exception as exc:
        # Report the failure as text so the calling agent can read it.
        return f"Error reading Excel file: {exc!s}"


@tool
def read_python(file_path: str) -> str:
    """
    Return the source code stored in a Python (.py) file.

    Args:
        file_path: Path to the Python file.

    Returns:
        The complete file content decoded as UTF-8, or an error message
        string if the file cannot be opened or read.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as source_file:
            source_code = source_file.read()
    except Exception as exc:
        # Report the failure as text so the calling agent can read it.
        return f"Error reading Python file: {exc!s}"
    return source_code

        
class ExtractTextFromImage:
    """Callable that asks a vision-capable chat model to read the text in an image."""

    def __init__(self, multimodal_model):
        # The model is only used through ``invoke(messages)``; the reply's
        # ``content`` attribute is treated as the extracted text.
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str) -> str:
        """
        Extract text from an image file.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).

        Returns:
            A single string containing the text extracted from the image,
            stripped of surrounding whitespace. An empty string is returned
            (and the error printed) if reading the file or calling the model
            fails.
        """
        try:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the data URL with the MIME type implied by the file
            # extension instead of always claiming PNG; fall back to PNG when
            # the extension is unknown or does not denote an image.
            guessed_type, _ = mimetypes.guess_type(img_path)
            if guessed_type and guessed_type.startswith("image/"):
                mime_type = guessed_type
            else:
                mime_type = "image/png"

            # Prepare the prompt including the base64 image data
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Extract all the text from this image. "
                                "Return only the extracted text, no explanations."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]

            # Call the vision-capable model
            response = self.multimodal_model.invoke(message)
            return response.content.strip()
        except Exception as e:
            # Best-effort tool: report the problem and hand back an empty result.
            error_msg = f"Error extracting text: {str(e)}"
            print(error_msg)
            return ""


class DescribeImage:
    """Callable that asks a vision-capable chat model to describe an image."""

    def __init__(self, multimodal_model):
        # The model is only used through ``invoke(messages)``; the reply's
        # ``content`` attribute is treated as the description.
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str, query: str) -> str:
        """
        Generate a detailed description of an image.
        This function reads an image from a local file path, encodes it as
        base64, and sends it to a vision-capable language model to obtain a
        natural language description of the image's content, including its
        objects, actions, and context, following a specific query.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).
            query: Information to extract from the image.

        Returns:
            A single string containing a detailed description of the image.
            An empty string is returned (and the error printed) if reading
            the file or calling the model fails.
        """
        try:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the data URL with the MIME type implied by the file
            # extension instead of always claiming PNG; fall back to PNG when
            # the extension is unknown or does not denote an image.
            guessed_type, _ = mimetypes.guess_type(img_path)
            if guessed_type and guessed_type.startswith("image/"):
                mime_type = guessed_type
            else:
                mime_type = "image/png"

            prompt = (
                "Describe this image in rich detail. Include objects, people, "
                "setting, background elements, and any inferred actions or context. "
                "Avoid technical jargon. In particular, extract the following "
                f"information: {query}"
            )

            # Prepare message payload
            message = [
                HumanMessage(
                    content=[
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]
            response = self.multimodal_model.invoke(message)
            return response.content.strip()

        except Exception as e:
            # Best-effort tool: report the problem and hand back an empty result.
            error_msg = f"Error describing image: {str(e)}"
            print(error_msg)
            return ""

    
class TranscribeAudio:
    """Callable that asks an audio-capable chat model to transcribe an MP3 file."""

    def __init__(self, multimodal_model):
        # The model is only used through ``invoke(messages)``; the reply's
        # ``content`` attribute is treated as the transcript.
        self.multimodal_model = multimodal_model

    def __call__(self, audio_path: str, query: str) -> str:
        """
        Transcribe an MP3 file.

        Args:
            audio_path: Path to the MP3 audio file.
            query: Accepted to keep the call signature stable; it is not
                included in the prompt, which always asks for a plain
                transcription.

        Returns:
            Transcribed text as a string. An empty string is returned (and
            the error printed) if reading the file or calling the model fails.
        """
        try:
            with open(audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()

            # Send the audio as a base64 content block, mirroring how the
            # image helpers in this file pass binary data. The previous code
            # built an undefined ``AudioFile`` object, so every call failed.
            # NOTE(review): this is LangChain's generic base64 audio block;
            # confirm the configured model provider accepts it.
            audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Transcribe the speech from this audio file. "
                                "Return only the transcribed text, with no extra commentary."
                            ),
                        },
                        {
                            "type": "audio",
                            "source_type": "base64",
                            "data": audio_base64,
                            "mime_type": "audio/mpeg",  # MP3 MIME type
                        },
                    ]
                )
            ]

            # Use the model stored by __init__; ``self.audio_llm`` never existed.
            response = self.multimodal_model.invoke(message)
            return response.content.strip()

        except Exception as e:
            # Best-effort tool: report the problem and hand back an empty result.
            error_msg = f"Error transcribing audio: {str(e)}"
            print(error_msg)
            return ""


@tool
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """
    Fetch a YouTube video with yt-dlp and store it as an MP4 file.

    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.

    Returns:
        Path to the saved video file (the given output_path).
    """
    # Prefer separate MP4 video + M4A audio streams, then a single MP4,
    # then whatever is best; merged downloads are written as MP4.
    options = dict(
        format='bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        outtmpl=output_path,
        merge_output_format='mp4',
        quiet=True,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([youtube_url])
    return output_path


@tool
def extract_audio_from_video(video_path: str, audio_output: str) -> str:
    """
    Pull the audio track out of an MP4 video file and save it as MP3.

    Only the first 60 seconds of audio are written, and an existing output
    file is overwritten.

    Args:
        video_path: Path to the input MP4 video file.
        audio_output: Path for the output MP3 file.

    Returns:
        Path to the audio file (the given audio_output).

    Raises:
        RuntimeError: If ffmpeg reports an error; the message carries
            ffmpeg's stderr output.
    """
    try:
        source = ffmpeg.input(video_path)
        # t=60 limits the output to 60 sec
        target = source.output(audio_output, format='mp3', acodec='libmp3lame', t=60)
        job = target.overwrite_output()
        job.run(quiet=True)
    except ffmpeg.Error as err:
        raise RuntimeError(f"FFmpeg error: {err.stderr.decode()}") from err
    return audio_output
        
    
@tool
def wiki_search(query: str) -> dict:
    """Search Wikipedia for a query and return maximum 2 results.
    
    Args:
        query: The search query.

    Returns:
        A dict with a single "wiki_results" key holding the formatted
        documents, separated by "---" lines.
    """
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    # Look up both metadata fields with .get so a document that lacks one of
    # them yields an empty attribute instead of a KeyError.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.metadata.get("source", "")}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
        for doc in search_docs
    )
    return {"wiki_results": formatted_search_docs}


@tool
def web_search(query: str) -> dict:
    """Search Tavily for a query and return maximum 3 results.
    
    Args:
        query: The search query.

    Returns:
        A dict with a single "web_results" key holding the formatted results,
        separated by "---" lines, or the error text reported by the search
        tool.
    """
    search_results = TavilySearchResults(max_results=3).invoke(query)

    # TavilySearchResults reports a failed request as a plain string rather
    # than raising; pass that text through so the caller can see it.
    if isinstance(search_results, str):
        return {"web_results": search_results}

    # Each hit is a plain dict with "url" and "content" keys, not a Document,
    # so there is no ``metadata`` attribute and no page number to report.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{hit.get("url", "")}" page=""/>\n{hit.get("content", "")}\n</Document>'
        for hit in search_results
    )
    return {"web_results": formatted_search_docs}


@tool
def arxiv_search(query: str) -> dict:
    """Search Arxiv for a query and return maximum 3 result.
    
    Args:
        query: The search query.

    Returns:
        A dict with a single "arvix_results" key holding the formatted
        documents (first 1000 characters of each), separated by "---" lines.
    """
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()

    formatted_docs = []
    for doc in search_docs:
        metadata = doc.metadata
        # ArxivLoader's default metadata has no "source" entry, so indexing
        # it directly raised KeyError. Prefer "source" when present, then
        # the entry id, then fall back to the paper title.
        source = (
            metadata.get("source")
            or metadata.get("entry_id")
            or metadata.get("Title", "")
        )
        formatted_docs.append(
            f'<Document source="{source}" page="{metadata.get("page", "")}"/>\n{doc.page_content[:1000]}\n</Document>'
        )

    # The key spelling "arvix_results" is kept as-is for backward compatibility.
    return {"arvix_results": "\n\n---\n\n".join(formatted_docs)}