File size: 4,983 Bytes
c3b34bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import asyncio
import logging
import os
from typing import Any, Dict, List, Optional

import aiohttp
import html2text
from griffe import json_decoder
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.schema import Document
from llama_index.core.tools import FunctionTool
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.tools.tavily_research import TavilyToolSpec
from llama_index.tools.wikipedia import WikipediaToolSpec
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger, preferred over root `logging` calls

# Initialize OpenAI settings
# NOTE(review): these are process-wide llama_index globals — every index/query
# in this process inherits them unless overridden locally.
Settings.llm = OpenAI(model="gpt-4o", temperature=0.1)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")



async def fetch_and_process(
    urls: List[str], session: aiohttp.ClientSession, timeout: int = 10
) -> List[Document]:
    """Fetch and convert webpages to Document objects concurrently.

    Args:
        urls: Web page URLs to download.
        session: Shared aiohttp client session used for all requests.
        timeout: Total per-request timeout in seconds.

    Returns:
        One Document per successfully fetched, non-empty page; unreachable
        or empty pages are dropped (a warning is logged for failures).
    """
    # aiohttp expects a ClientTimeout object; passing a bare int is deprecated.
    request_timeout = aiohttp.ClientTimeout(total=timeout)

    async def fetch(url: str) -> Dict[str, str]:
        """Download one URL; on failure return empty text instead of raising."""
        try:
            async with session.get(url, timeout=request_timeout) as response:
                # Use the final (post-redirect) URL as the document id.
                return {"text": await response.text(), "url": str(response.url)}
        except (asyncio.TimeoutError, aiohttp.ClientError) as e:
            # Best-effort: skip unreachable pages rather than failing the batch.
            logger.warning("Could not fetch %s: %r", url, e)
            return {"text": "", "url": url}

    responses = await asyncio.gather(*(fetch(url) for url in urls))

    # Convert HTML to plain text; keep only pages that returned content.
    return [
        Document(text=html2text.html2text(resp["text"]), id_=resp["url"])
        for resp in responses
        if resp["text"]
    ]


async def summarize_websites(urls: List[str], query: str) -> List[str]:
    """Summarize a query from content across multiple websites. Even if there is only one website, it will still be used.

    Args:
        urls: A list of URLs to summarize.
        query: The query to summarize.
    Returns:
        A list of summaries, one per successfully fetched URL, each prefixed
        with its source URL.
    """
    logger.info("Summarizing %d websites for query: %s", len(urls), query)

    # Use a cheaper model for per-page summarization, passed locally to the
    # query engine. The previous version overwrote the global Settings.llm
    # (downgrading the module-wide gpt-4o for the whole process) as a hidden
    # side effect; embeddings still come from the module-level Settings.
    summarizer_llm = OpenAI(model="gpt-4o-mini")

    summaries: List[str] = []

    async with aiohttp.ClientSession() as session:
        documents = await fetch_and_process(urls, session)
        for doc in documents:
            # One index per document so each summary is attributable to its source.
            index = VectorStoreIndex.from_documents([doc])
            result = index.as_query_engine(llm=summarizer_llm).query(
                f"Summarize in very meticulous detail. {query}"
            )
            summaries.append(f"Source: {doc.id_} \nContent: {result.response}")

    return summaries


def tavily_search(query: str, max_results: int = 10) -> List[Dict]:
    """
    Tavily search with result formatting.
    Args:
        query: The query to search for.
        max_results: The maximum number of results to return.
    Returns:
        results: A list of dictionaries containing the results as URLS. Need to be used with the summarize_websites tool.
    """
    # Lazy %-style args avoid formatting when INFO is disabled.
    logger.info("Called tavily_search for: %s", query)

    try:
        # API key is read from the environment; TavilyToolSpec raises if missing.
        search_engine = TavilyToolSpec(api_key=os.getenv("TAVILY_API_KEY"))
        search_results = search_engine.search(query, max_results=max_results)

        # Flatten each result Document into a url + titled-content dict.
        return [
            {
                "url": document.metadata.get("url", ""),
                "content": f"Title: {document.metadata.get('title', '')}\nContent: {document.text}",
            }
            for document in search_results
        ]
    except Exception as e:
        # Boundary handler: surface the failure to the agent as data, not a crash.
        logger.error(f"Tavily search failed: {str(e)}")
        return [{"error": f"Search failed: {str(e)}"}]


def search_wikipedia(query: str, language: str = "en") -> str:
    """
    Search Wikipedia for specific information. This is a more efficient way to search Wikipedia than the tavily_search tool.  Need to be used with the summarize_websites tool.
    Args:
        query: The search query
        language: Wikipedia language code (default: "en")
    Returns:
        str: Wikipedia content summary
    """
    logger.info(f"Searching Wikipedia for: {query}")

    try:
        # Delegate the lookup to the llama_index Wikipedia tool spec.
        return WikipediaToolSpec().search_data(query, language=language)
    except Exception as e:
        # Boundary handler: report the failure as the tool's string result.
        logger.error(f"Wikipedia search failed: {str(e)}")
        return f"Wikipedia search failed: {str(e)}"
    
def transcribe_youtube_video(video_url: str) -> str:
    """Transcribe a YouTube video.

    Args:
        video_url: Full URL of the YouTube video.

    Returns:
        The transcript text, with chunks joined by newlines. Empty string if
        no transcript documents were returned.
    """
    reader = YoutubeTranscriptReader()
    # load_data expects a *list* of links (ytlinks); the previous version
    # passed a bare string, which the reader would iterate char-by-char.
    documents = reader.load_data(ytlinks=[video_url])
    # load_data returns Document objects; join their text so the declared
    # `-> str` return type actually holds.
    return "\n".join(doc.text for doc in documents)

def get_web_tools():
    """Return all available tools for the agent."""
    # Wrap each tool function with FunctionTool so the agent can invoke it.
    tool_functions = (
        summarize_websites,
        tavily_search,
        search_wikipedia,
        transcribe_youtube_video,
    )
    return [FunctionTool.from_defaults(fn) for fn in tool_functions]