import os
import time
from typing import TypedDict, List, Dict, Any, Optional

import pandas as pd
import requests
from langchain.agents import create_tool_calling_agent, AgentExecutor, initialize_agent
from langchain_community.document_loaders import ImageCaptionLoader
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from pypdf import PdfReader
from youtube_transcript_api import YouTubeTranscriptApi


# 1. Web Browsing
@tool
def web_search(query: str) -> str:
    """Search the web through DuckDuckGo and return the result text.

    Args:
        query: what you want to search
    """
    search = DuckDuckGoSearchRun()
    results = search.invoke(query)
    # The search tool hands back one already-formatted string; joining a str
    # with "\n" would insert a newline between every single character.
    if isinstance(results, str):
        return results
    return "\n".join(str(item) for item in results)


@tool
def visit_webpage(url: str) -> str:
    """Fetch the raw HTML content of a web page.

    Returns the response body, or an "[ERROR fetching ...]" message when the
    request fails, so the agent can react instead of crashing.

    Args:
        url: the webpage url
    """
    try:
        response = requests.get(url, timeout=5)
        return response.text
    except requests.RequestException as e:
        # Every error requests raises on purpose derives from RequestException
        # (bad URL, DNS failure, timeout, connection reset, ...).
        return f"[ERROR fetching {url}]: {str(e)}"


@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia and return a short excerpt of the best match.

    Args:
        query: what you want to wiki
    """
    # NOTE(review): only the top result is kept and its content is capped at
    # 100 characters - confirm this is enough for the questions being asked.
    api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=100)
    wikipediatool = WikipediaQueryRun(api_wrapper=api_wrapper)
    return wikipediatool.run({"query": query})


@tool
def youtube_transcript(video_url: str) -> str:
    """Fetch the transcript of a YouTube video as one string.

    Returns the transcript text, or an "Error fetching transcript" message
    when the transcript cannot be retrieved.

    Args:
        video_url: YouTube video url
    """
    try:
        if "v=" in video_url:
            # Standard watch URL: https://www.youtube.com/watch?v=<id>&t=...
            video_id = video_url.split("v=")[-1].split("&")[0]
        else:
            # Short/embed/shorts links (https://youtu.be/<id>?t=...) and bare
            # ids carry the id as the last path segment.
            video_id = video_url.rstrip("/").split("/")[-1].split("?")[0]
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(item["text"] for item in transcript)
    except Exception as e:
        # Tool boundary: report the failure as text so the agent can carry on.
        return f"Error fetching transcript: {str(e)}"


# 4.
# File Reading
@tool
def read_file(dir: str) -> str:
    """Read the content of the provided file.

    Excel workbooks (.xlsx) are returned as a text table, PDF files (.pdf) as
    the extracted text of every page joined by newlines, and any other file
    is read as plain text.

    Args:
        dir: the filepath
    """
    # `dir` shadows the builtin, but it is the argument name exposed to the
    # agent through the tool schema, so it is kept for compatibility.
    extension = os.path.splitext(dir)[1].lower().lstrip(".")
    if extension == "xlsx":
        dataframe = pd.read_excel(dir)
        return dataframe.to_string()
    if extension == "pdf":
        reader = PdfReader(dir)
        # Guard against pages whose text extraction yields nothing.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    # Explicit encoding keeps the result independent of the host locale;
    # undecodable bytes are replaced instead of aborting the read.
    with open(dir, encoding="utf-8", errors="replace") as f:
        return f.read()


# 5. Image Open
@tool
def image_caption(dir: str) -> str:
    """Understand the content of the provided image.

    Args:
        dir: the image url link
    """
    loader = ImageCaptionLoader(images=[dir])
    documents = loader.load()
    # One image was supplied, so the caption is read from the first document.
    return documents[0].page_content


# 2. Coding
# 3. Multi-Modality

# System prompt (answer-format rules, tool list and few-shot tool-use examples).
SYSTEM_PROMPT = """\
You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separared list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (eg. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to put in the list is a number or a string.

You have access to the following tools:
- web_search: web search the content of the query by passing the query as input
- visit_webpage: visit the given webpage url by passing the url as input
- wiki_search: wiki search the content of the query by passing the query as input if the question asks for wiki search it
- youtube_transcript: fetch the transcript of the Youtube video by passing the video url as input if the question asks for watching a Youtube video
- read_file: read the content of the attached file by passing the file directory as input
- image_caption: understand the visual content of the attached image by passing the image directory as input

HERE are some examples illustrating how and what tools to call.
---------------
TASK: Count how many birds in the provided Youtube video.
ACTION: Call youtube_transcript tool to extract video transcript. Use LLM to understand the retrived transcript.

TASK: How many Grammy Awards that Taylor Swift has won.
ACTION: Call the web_search tools with the query: 'how many Grammy Awards that Taylor Swift has won.' to extract the answer.

TASK: Count how many people in this image.
ACTION: Call the image_caption tool by passing the image directory as input. Then, use LLM to understand the image caption and answer the question.

TASK: How much the total expense in this spreadsheet?
ACTION: Call the read_file tool to extract the content of the provided spreadfile. Then, use LLM to extract the amount of every expense and sum them up.

TASK: How many All England Title that Lee Chong Wei won?
ACTION: Call wiki_search with the query: "Lee Chong Wei". Extract the relevant row of All England Title and count how many rows is there.
"""


class BasicAgent:
    """Gemini-backed tool-calling agent that answers one question per call.

    Instances are callable: ``BasicAgent()(question)`` returns the answer
    string. The Google API key is read from the ``GOOGLE_API_KEY``
    environment variable.
    """

    def __init__(self):
        """Build the chat model, the prompt and the agent executor.

        Raises:
            RuntimeError: if ``GOOGLE_API_KEY`` is not set in the environment.
        """
        # Credentials must never live in source control; the key that used to
        # be hard-coded here should be considered leaked and be revoked.
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError(
                "GOOGLE_API_KEY environment variable is not set."
            )

        # NOTE(review): max_tokens=128 caps every model reply; the prompt asks
        # the model to report its thoughts first, so confirm this is enough
        # room to reach the FINAL ANSWER line.
        self.model = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            temperature=0,
            max_tokens=128,
            timeout=None,
            max_retries=2,
            google_api_key=api_key,
        )

        # System prompt for few-shot prompting.
        self.sys_prompt = SYSTEM_PROMPT
        self.tools = [
            web_search,
            visit_webpage,
            wiki_search,
            youtube_transcript,
            read_file,
            image_caption,
        ]
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", self.sys_prompt),
            ("human", "{input}"),
            # Slot where the executor injects earlier tool calls and results.
            ("placeholder", "{agent_scratchpad}"),
        ])

        # initialize_agent() has no `system_prompt` argument, so the prompt
        # was never reaching the model. A tool-calling agent takes the prompt
        # explicitly.
        agent = create_tool_calling_agent(self.model, self.tools, self.prompt)
        self.agent = AgentExecutor(agent=agent, tools=self.tools, verbose=True)
        print("BasicAgent initialized.")

    @staticmethod
    def _extract_final_answer(text: str) -> str:
        """Return what follows the last "FINAL ANSWER:" marker, else all of *text*."""
        return text.rpartition("FINAL ANSWER:")[2].strip()

    def __call__(self, question: str) -> str:
        """Answer *question* with the agent and return the final answer text.

        Args:
            question: the question to answer.

        Returns:
            The text after the "FINAL ANSWER:" marker of the agent's reply, or
            the whole reply when the marker is absent.
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        # Fixed pause before every request - presumably to stay under the API
        # rate limit; confirm before shortening.
        time.sleep(15)
        response = self.agent.invoke({"input": f"Answer this question: {question}"})
        fixed_answer = self._extract_final_answer(str(response["output"]))
        print(f"Agent returning fixed answer: {fixed_answer}")
        return fixed_answer