Spaces:
Sleeping
Sleeping
| from smolagents import DuckDuckGoSearchTool, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool, PythonInterpreterTool, tool | |
| from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, pipeline | |
| from qwen_vl_utils import process_vision_info | |
| import torch | |
| from typing import List, Any, Optional | |
| from markdownify import markdownify | |
| from tavily import TavilyClient | |
| import os | |
| import uuid | |
| import json | |
| import traceback | |
| import requests | |
| import datetime | |
| import yt_dlp | |
| import pandas as pd | |
| import wikipedia as wiki | |
| from bs4 import BeautifulSoup | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from markdownify import markdownify as md | |
def video_analyzer(file_path: str, query: str) -> str:
    """
    An artificial intelligence tool that takes as input a text string containing
    the absolute path to a video file in MP4 format and a string with
    a detailed text query to analyze the video.

    Args:
        file_path: Absolute path to a video file in MP4 format.
        query: Detailed text query to analyze the video.

    Returns:
        str: Row of text with the results of video file analysis.

    Examples:
        >>> video_analyzer("/test/1.mp4", "Identify separate bird species. What is the highest number of bird species to be on camera simultaneously?")
        The video shows a group of Emperor penguins and a single Albatross. Therefore, the highest number of bird species to be on camera simultaneously is 2.
    """
    model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
    # The model and processor are loaded on every call.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id, torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(model_id)

    # The system-style preamble is prepended to the user's query as plain text.
    prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. " + query
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": f"file://{file_path}", "fps": 1.0},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device picked by device_map="auto" instead of assuming CUDA,
    # so the tool also works on machines without a GPU.
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]
| # https://wikipedia.readthedocs.io/en/latest/code.html | |
def wikipedia_available_titles(query: str) -> List[str]:
    """This instrument returns the titles of the articles available on wikipedia.

    Args:
        query: The query that will be used to search for articles on wikipedia.

    Returns:
        list: list of strings with available article titles. The list is empty
        when the search raised an exception (the exception is printed).
    """
    try:
        wiki.set_rate_limiting(rate_limit=True, min_wait=datetime.timedelta(milliseconds=100))
        return wiki.search(query)
    except Exception as e:
        print("Exception occurred: ", e, "with query: ", query)
        # Return an explicit empty result; previously the function fell through
        # to `return titles` with `titles` never assigned.
        return []
def wikipedia_summary(title: str) -> str:
    """Fetch the summary of a wikipedia article.

    Args:
        title: The title of the wikipedia article to summarize.

    Returns:
        str: The summary of the article, or an empty string when the lookup
        raised an exception (the exception is printed).
    """
    pause_between_requests = datetime.timedelta(milliseconds=100)
    try:
        wiki.set_rate_limiting(rate_limit=True, min_wait=pause_between_requests)
        return wiki.summary(title)
    except Exception as error:
        print("Exception occurred: ", error, "with title: ", title)
        return ""
def reverse_text(text: str) -> str:
    """Return the characters of a text string in reverse order.

    Args:
        text: The line of text to be reversed.

    Returns:
        str: Reversed line of text.

    Examples:
        >>> reverse_text("ecnetnes siht dnatsrednu uoy fI")
        If you understand this sentence
    """
    backwards = reversed(text)
    return "".join(backwards)
# Tavily API key, read once from the environment at import time (None when unset).
tavily_access_token = os.environ.get("TAVILY_ACCESS_TOKEN")
def tavily_search(request: str) -> str:
    """
    This is the ultimate tool for finding information on the internet.
    Don't use it to search YouTube! It's useless!

    Args:
        request: A string containing a query to search in the Internet.

    Returns:
        str: JSON string with execution results containing the following fields:
            - query: The search query to execute with Tavily.
            - answer: A short answer to the user's query, generated by an LLM. Included in the response only if include_answer is requested
            - images: List of query-related images. If include_image_descriptions is true, each item will have url and description.
            - results: A list of sorted search results, ranked by relevancy. Contains the following fields:
                - title: The title of the search result.
                - url: The URL of the search result.
                - content: A short description of the search result.
                - score: The relevance score of the search result.
                - raw_content: The cleaned and parsed HTML content of the search result. Only if include_raw_content is true.
    """
    client = TavilyClient(tavily_access_token)
    response = client.search(
        query=request,
        include_raw_content=False,
        max_results=3,
        search_depth='advanced',
    )
    # The signature and docstring promise a JSON string, so serialise the
    # client's response instead of returning the raw object.
    return json.dumps(response, ensure_ascii=False)
def tavily_extract_web_page(url: str) -> str:
    """
    This is the ultimate tool that allows you to retrieve the contents of a web page.
    In other words, to view the website. Don't use it to extract YouTube pages! It's useless!

    Args:
        url: The URL of the web page from which you want to retrieve information.

    Returns:
        str: The parsed and cleaned HTML content of the web page (the raw content
        extracted), or a short failure message when nothing could be extracted.
    """
    client = TavilyClient(tavily_access_token)
    response = client.extract([url], extract_depth="advanced")

    results = response.get("results") or []
    if not results:
        # An empty result list would otherwise surface as an opaque IndexError.
        # NOTE(review): "failed_results" / "error" follow the Tavily Extract
        # response format — confirm against the installed client version.
        failed = response.get("failed_results") or []
        reason = failed[0].get("error", "unknown error") if failed else "no content returned"
        return f"Failed to extract content from {url}: {reason}"

    return results[0]['raw_content']
def download_youtube_video_audio(url: str) -> tuple[bool, str, str]:
    """
    Downloads a YouTube video to the ./downloads directory. Video and audio are downloaded separately.
    The video is saved with an .mp4 file name and the audio with an .mp3 file name.

    Args:
        url: The URL of the YouTube video.

    Returns:
        A tuple of three values:
            bool: Execution result. True - success, False - error in the file download process.
            str: The absolute path to the downloaded video file (None on failure).
            str: The absolute path to the downloaded audio file (None on failure).
    """
    # Build the target paths before the try block so the cleanup code in the
    # exception handler can always refer to them.
    output_dir = "./downloads"
    abs_output_dir = os.path.abspath(output_dir)
    guid = str(uuid.uuid4())  # unique base name shared by both files
    video_path = os.path.join(abs_output_dir, f"{guid}.mp4")
    audio_path = os.path.join(abs_output_dir, f"{guid}.mp3")

    format_priority = (
        'bestvideo[height=360][ext=mp4]/'  # 1. Exactly 360p in MP4
        'bestvideo[height<360][ext=mp4]/'  # 2. Best quality below 360p in MP4
        'worstvideo[height>=360]'          # 3. Otherwise the lowest quality at 360p or above
    )
    video_options = {
        'format': format_priority,
        'outtmpl': video_path,
        'quiet': True,
        'no_warnings': True,
    }
    # Audio settings.
    # NOTE(review): no audio post-processor is configured here, so the file is
    # stored under the .mp3 name as downloaded — confirm downstream readers cope.
    audio_options = {
        'format': 'bestaudio/best[ext=mp3]',
        'outtmpl': audio_path,
        'quiet': True,
        'no_warnings': True,
    }

    try:
        # Create the folder if it does not exist.
        os.makedirs(abs_output_dir, exist_ok=True)
        # Download video, then audio.
        with yt_dlp.YoutubeDL(video_options) as ydl:
            ydl.download([url])
        with yt_dlp.YoutubeDL(audio_options) as ydl:
            ydl.download([url])
        return True, video_path, audio_path
    except Exception as e:
        # Report the failure instead of hiding it, then remove partial files.
        print("Exception occurred: ", e, "with url: ", url)
        for path in (video_path, audio_path):
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup: the file may never have been created.
                pass
        return False, None, None
def transcribe_audio_file(path: str) -> str:
    """
    The tool takes as input the absolute path to the mp3 file to be transcribed and returns the English text.

    Args:
        path: Absolute path to an audio file in mp3 format.

    Returns:
        str: A string of transcripts of an audio file in English.
        None is returned when the transcription raised an exception
        (the error and traceback are printed).
    """
    # Use the first GPU when available, otherwise fall back to the CPU.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    transcribe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
        chunk_length_s=30,
        batch_size=2,
        device=device,
    )
    try:
        result = transcribe(
            path,
            batch_size=8,
            generate_kwargs={"language": "english", "task": "transcribe"},
        )
        return result["text"]
    except Exception as e:
        # f-string so the actual error and path are shown, not the placeholders.
        print(f"ERROR: {e}, {path}")
        traceback.print_exc()
        return None
def get_excel_data(file_path: str) -> str:
    """
    The tool takes as input an absolute path to the Excel file whose contents are to be output and returns a string of text with the contents of the file.

    Args:
        file_path: Absolute path to an Excel file.

    Returns:
        str: A row with the contents of an Excel file
    """
    table = pd.read_excel(file_path)
    # to_string() renders every row and column; str() abbreviates large
    # tables, which would hide part of the file's contents.
    return table.to_string()
def multiply(a: int, b: int) -> int:
    """Return the product of two numbers.

    Args:
        a: first int
        b: second int
    """
    product = a * b
    return product
def add(a: int, b: int) -> int:
    """Return the sum of two numbers.

    Args:
        a: first int
        b: second int
    """
    total = a + b
    return total
def subtract(a: int, b: int) -> int:
    """Return the difference of two numbers (a minus b).

    Args:
        a: first int
        b: second int
    """
    difference = a - b
    return difference
def divide(a: int, b: int) -> float:
    """Divide two numbers.

    Args:
        a: first int
        b: second int

    Returns:
        The quotient a / b. True division is used, so the result is a float.

    Raises:
        ValueError: if b is zero.
    """
    if b == 0:
        raise ValueError("Cannot divide by zero.")
    return a / b
def modulus(a: int, b: int) -> int:
    """Return the remainder of dividing the first number by the second.

    Args:
        a: first int
        b: second int
    """
    remainder = a % b
    return remainder
# Tools exported by this module: helper functions defined in this file plus
# instances of tool classes imported from smolagents. Commented-out entries
# are currently disabled.
# NOTE(review): presumably consumed by an agent defined elsewhere — the
# consumer is not visible in this file.
available_tools = [
    reverse_text,
    multiply,
    add,
    subtract,
    divide,
    modulus,
    download_youtube_video_audio,
    transcribe_audio_file,
    get_excel_data,
    wikipedia_available_titles,
    wikipedia_summary,
    video_analyzer,
    FinalAnswerTool(),
    DuckDuckGoSearchTool(),
    tavily_search,
    tavily_extract_web_page,
    # VisitWebpageTool(),
    PythonInterpreterTool(),
    # SpeechToTextTool(),
]
if __name__ == "__main__":
    # Manual smoke test: run the video analyzer on one previously downloaded
    # clip and print the model's answer.
    sample_video = "/workspaces/Final_Assignment_Template/downloads/60cc887f-cb60-4fc6-88c8-a8bbc6a4659a.mp4"
    question = "Identify separate bird species. What is the highest number of bird species to be on camera simultaneously?"
    answer = video_analyzer(sample_video, question)
    print(answer)