# NOTE: scrape artifact ("Spaces: Sleeping" page residue) removed; the module begins below.
| # tools/youtube_video_tool.py | |
| import base64 | |
| import os | |
| import re | |
| import requests | |
| import subprocess | |
| import tempfile | |
| from io import BytesIO | |
| from tools.base_tool import BaseTool | |
| import av | |
| import yt_dlp | |
| from tools.speech_recognition_tool import SpeechRecognitionTool | |
class YouTubeVideoTool(BaseTool):
    """Tool that answers a question about a YouTube video's content.

    Pipeline (see ``forward``): fetch video metadata with yt-dlp, obtain
    captions (official/auto subtitles, else Whisper-style transcription of
    the audio), chunk the transcript, and iteratively query an LLM.
    """

    # Registry identifier for the tool framework.
    name = 'youtube_video'
    # Human-readable summary surfaced to the agent framework.
    description = 'Process a YouTube video and answer questions based on content.'

    def __init__(
        self,
        speech_tool: SpeechRecognitionTool = None,  # may be None; annotation is effectively Optional
        quality: int = 360,
        frame_interval: float = 2.0,
        chunk_duration: float = 2.0,
        debug: bool = False,
    ):
        """Configure the tool.

        :param speech_tool: fallback transcriber used when no subtitles
            exist; if None, ``_get_captions`` returns [] in that case.
        :param quality: max video height (pixels) requested from yt-dlp.
        :param frame_interval: seconds between sampled frames.
            NOTE(review): stored but not referenced anywhere in this file.
        :param chunk_duration: target chunk length in seconds.
            NOTE(review): stored but unused; ``_split_captions`` uses a
            fixed 3-caption chunk instead — confirm intent.
        :param debug: enable verbose/debug behavior (unused in this file).
        """
        self.speech_tool = speech_tool
        self.quality = quality
        self.frame_interval = frame_interval
        self.chunk_duration = chunk_duration
        self.debug = debug
| def forward(self, url: str, query: str) -> str: | |
| video = self._download_video_info(url) | |
| captions = self._get_captions(video) | |
| title, description = video['title'], video['description'] | |
| chunks = self._split_captions(captions) | |
| answer = "" | |
| for chunk in chunks: | |
| prompt = self._build_prompt(title, description, chunk, query, answer) | |
| response = self._mock_llm(prompt) # replace with real call to your LLM | |
| answer = response.strip() | |
| return answer | |
| def _download_video_info(self, url: str): | |
| opts = { | |
| 'quiet': True, | |
| 'skip_download': True, | |
| 'format': f'bestvideo[height<={self.quality}]+bestaudio/best', | |
| } | |
| with yt_dlp.YoutubeDL(opts) as ydl: | |
| return ydl.extract_info(url, download=False) | |
| def _get_captions(self, info: dict): | |
| lang = 'en' | |
| subs = info.get('subtitles', {}).get(lang) or info.get('automatic_captions', {}).get(lang) | |
| if subs: | |
| sub = next((s for s in subs if s['ext'] == 'vtt'), None) | |
| if sub: | |
| text = requests.get(sub['url']).text | |
| return self._parse_vtt(text) | |
| # fallback to Whisper-based transcription | |
| if self.speech_tool: | |
| audio_url = self._select_audio_format(info['formats']) | |
| audio = self._download_audio(audio_url) | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: | |
| f.write(audio.read()) | |
| f.flush() | |
| transcription = self.speech_tool.forward(audio=f.name, with_time_markers=True) | |
| return self._parse_whisper_transcription(transcription) | |
| return [] | |
| def _select_audio_format(self, formats): | |
| audio_only = [f for f in formats if f.get('vcodec') == 'none'] | |
| audio_only.sort(key=lambda f: f.get('abr', 0), reverse=True) | |
| return audio_only[0]['url'] | |
| def _download_audio(self, audio_url: str) -> BytesIO: | |
| cmd = ["ffmpeg", "-i", audio_url, "-f", "wav", "-ac", "1", "-ar", "16000", "-"] | |
| proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) | |
| return BytesIO(proc.stdout) | |
| def _parse_vtt(self, vtt_data: str): | |
| segments = [] | |
| entries = re.findall(r'(\d+:\d+:\d+\.\d+ --> \d+:\d+:\d+\.\d+)(.*?)\n(?=\n|\d)', vtt_data, re.DOTALL) | |
| for (time_range, text) in entries: | |
| clean_text = re.sub(r'<.*?>', '', text).strip().replace("\n", " ") | |
| segments.append({"text": clean_text}) | |
| return segments | |
| def _parse_whisper_transcription(self, text: str): | |
| pattern = re.compile(r'\[(\d+\.\d+)]\n(.+?)\n\[(\d+\.\d+)]') | |
| return [{"text": match[1]} for match in pattern.findall(text)] | |
| def _split_captions(self, captions): | |
| # Simple fixed-length chunking | |
| return [ | |
| {"text": " ".join([c["text"] for c in captions[i:i+3]])} | |
| for i in range(0, len(captions), 3) | |
| ] | |
| def _build_prompt(self, title, desc, chunk, query, prev): | |
| base = f""" | |
| Video Title: {title} | |
| Video Description: {desc} | |
| Transcript: | |
| {chunk['text']} | |
| """ | |
| if prev: | |
| base += f"\nPrevious answer: {prev}\n" | |
| base += f"Question: {query}" | |
| return base.strip() | |
| def _mock_llm(self, prompt: str): | |
| # Replace this with call to your real LLM | |
| return "I need to keep watching." |