Spaces:
Runtime error
Runtime error
| from smolagents import AzureOpenAIServerModel, CodeAgent, ToolCallingAgent, tool, DuckDuckGoSearchTool, WikipediaSearchTool, VisitWebpageTool, SpeechToTextTool | |
| import pandas as pd | |
| import os | |
| from requests.exceptions import HTTPError | |
| from dotenv import load_dotenv | |
| import requests | |
| from io import BytesIO | |
| from typing import IO | |
| from elevenlabs import ElevenLabs | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from langchain.docstore.document import Document | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.retrievers import BM25Retriever | |
| import pdfplumber | |
| #YouTube Transcriber | |
def _extract_youtube_video_id(video_url: str) -> str:
    """Pull the video ID out of a YouTube URL (or accept a bare ID); returns '' when none is found."""
    # Imported locally so this helper does not depend on the module's import block.
    from urllib.parse import parse_qs, urlparse

    candidate = (video_url or '').strip()
    if not candidate:
        return ''
    # urlparse only recognises the host when a scheme is present.
    if '://' not in candidate and '/' in candidate:
        candidate = 'https://' + candidate
    parsed = urlparse(candidate)
    host = (parsed.hostname or '').lower()
    segments = [part for part in parsed.path.split('/') if part]

    # Standard watch links: https://www.youtube.com/watch?v=<id>&t=42s
    query_ids = parse_qs(parsed.query).get('v')
    if query_ids:
        return query_ids[0]
    # Short links: https://youtu.be/<id>
    if host == 'youtu.be' or host.endswith('.youtu.be'):
        return segments[0] if segments else ''
    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
        return segments[1]
    # No URL structure at all: treat the input as a bare video ID.
    if not host and len(segments) == 1 and not parsed.query:
        return segments[0]
    return ''


def youtube_transcriber(video_url: str) -> list:
    """Takes web address of YouTube video and generates a transcript. Each utterance is a string in a chronologically ordered list.
    Watch links (with or without extra query parameters such as a timestamp), youtu.be short links and embed/shorts/live links are all accepted.
    Args:
        video_url (str): Web address of YouTube video.
    Returns:
        list: Transcript as chronologically ordered list of strings. None is returned if no video ID can be found or the transcript cannot be fetched.
    """
    video_id = _extract_youtube_video_id(video_url)
    if not video_id:
        # Bail out before touching the API: there is nothing meaningful to request.
        print(f'Could not find a YouTube video ID in: {video_url!r}')
        return None
    print(f'YouTube video ID: {video_id}')
    ytt_api = YouTubeTranscriptApi()
    try:
        raw = ytt_api.fetch(video_id)
    except Exception as exc:
        # Tool boundary: report the reason instead of silently swallowing it,
        # but keep the best-effort contract of returning None on failure.
        print('Transcript collection unsuccessful. Try again.')
        print(f'Reason: {exc!r}')
        return None
    transcript = [snip.text for snip in raw.snippets]
    print('Successfully acquired transcript')
    return transcript
| #Excel reader | |
def get_remote_file(url: str) -> IO:
    """This tool downloads a file using the requests package, which is often successful in downloading a file when other methods meet a HTTP Error 403: Forbidden.
    It returns IO which can be used as if it were a file in other functions which expect file data. The URL must be for the file itself, not a page.
    Args:
        url (str): Web address of file to download.
    Returns:
        IO: In-memory binary stream holding the downloaded bytes. None is returned (after printing the status code) when the server does not answer with status 200.
    """
    # A timeout keeps an unresponsive server from blocking the caller indefinitely;
    # requests waits forever when none is given.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to retrieve the file. Status code: {response.status_code}")
        return None
    # Wrap the payload so callers can treat it like an opened binary file.
    return BytesIO(response.content)
def excel_reader_io(file_data: IO) -> pd.DataFrame:
    """
    This tool reads xlsx data that is already held in memory and returns it as a pandas dataframe.
    Args:
        file_data: IO file data (a bytes stream) of an xlsx file to read in as a dataframe.
    """
    # openpyxl is requested explicitly as the parsing engine.
    frame = pd.read_excel(io=file_data, engine="openpyxl")
    return frame
def excel_reader_url(file_or_url: str) -> pd.DataFrame:
    """
    This tool loads an xlsx workbook from a local path or from a URL and returns it as a pandas dataframe.
    Args:
        file_or_url: Location of the xlsx file as a string, either a local file path or the URL of the xlsx file.
            If a file is forbidden to be accessed directly, another approach is to download the file data with get_remote_file as bytes and use that instead of a URL.
    """
    # openpyxl is requested explicitly as the parsing engine.
    reader_options = {"engine": "openpyxl"}
    return pd.read_excel(file_or_url, **reader_options)
| #mp3 transcription | |
def audio_transcription_tool(media_data: IO) -> str:
    """Creates a transcript from an audio or video file. The use of this tool consumes credits, so only use if SpeechToTextTool has not been successful.
    Args:
        media_data (IO): File data as bytes stream
    Returns:
        str: The transcribed text taken from the API response.
    """
    # The API key is read from the environment so it never lives in the source.
    client = ElevenLabs(api_key=os.environ.get("ELEVENLABS_API_KEY"))
    response = client.speech_to_text.convert(
        model_id="scribe_v1",
        file=media_data,
        tag_audio_events=False,
    )
    # Only the transcript text is handed back, not the full response object.
    return response.text
| #python code running | |
| #tables from webpage | |
def dataframes_from_website(url:str) -> list:
    """Collects every table found at a given url and returns them as a list of pandas dataframes. The first rows of each table are printed so they are added to the context.
    This tool is useful for working with tabulated data on Wikipedia pages etc. Use information gathered previously to identify the correct table from the list returned.
    Once the correct table has been identified from the returned list, sums, counts etc can be conducted efficiently on it.
    If a tool that visits a webpage identifies a table, it's worth running this tool in case an answer can be easily derived.
    Args:
        url (str): Web page which contains one or more tables
    Returns:
        list: List of pandas dataframes
    """
    tables = pd.read_html(url)
    # Print a short preview of each table, labelled with its position in the returned list.
    for position, table in enumerate(tables):
        preview = table.head()
        print(f"---First rows of df[{position}]---\n")
        print(preview, "\n")
    return tables
| #chess analysis | |
| #pdf reader | |
def text_from_pdf(pdf_file: str) -> str:
    """Reads a pdf file and outputs the text content as a string.
    Args:
        pdf_file (str): Filepath or URL of pdf to parse
    Returns:
        str: Content of pdf as string, with the pages joined by spaces and newlines replaced by spaces
    """
    source = pdf_file
    if pdf_file.lower().startswith(('http://', 'https://')):
        # A URL cannot be opened as a path, so download the bytes first and
        # hand pdfplumber an in-memory file instead.
        response = requests.get(pdf_file, timeout=30)
        response.raise_for_status()
        source = BytesIO(response.content)
    with pdfplumber.open(source) as pdf:
        # Guard against pages for which extract_text() yields nothing
        # (e.g. pages without a text layer) so the join cannot fail on None.
        content = ' '.join((page.extract_text() or '') for page in pdf.pages)
    return content.replace('\n', ' ')
| #vector store | |
| #string reverse | |
def string_reverser(text: str) -> str:
    """Returns the given string with its characters in reverse order. Worth trying when a prompt or string initially seems unintelligible.
    Args:
        text (str): String that cannot be understood.
    Returns:
        str: Reversed string
    """
    # A slice with step -1 walks the string from the last character to the first.
    backwards = slice(None, None, -1)
    return text[backwards]
# smolagents agents expect Tool instances in `tools`. The helpers defined above are
# plain functions, so each one is wrapped with smolagents' `tool` helper, which builds
# the Tool (name, description, inputs) from the function's signature and docstring.
custom_tools = [
    tool(func)
    for func in (
        get_remote_file,
        excel_reader_io,
        excel_reader_url,
        audio_transcription_tool,
        string_reverser,
        text_from_pdf,
        youtube_transcriber,
        dataframes_from_website,
    )
]
# Ready-made tools shipped with smolagents.
default_tools = [DuckDuckGoSearchTool(), WikipediaSearchTool(), VisitWebpageTool(), SpeechToTextTool()]
tools = custom_tools + default_tools

# Extra modules that code generated by the agent is allowed to import.
additionals = ["pandas", "numpy", "datetime", "json", "re", "math"]

# Azure OpenAI connection details are all read from environment variables.
model = AzureOpenAIServerModel(
    model_id=os.environ.get("AZURE_OPENAI_MODEL_MINI"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version=os.environ.get("OPENAI_API_VERSION"),
    #temperature=1.5,
    max_tokens=4096,
)

# Passed to CodeAgent as planning_interval.
planning_steps = 2
agent = CodeAgent(
    model=model,
    tools=tools,
    additional_authorized_imports=additionals,
    planning_interval=planning_steps,
)