Spaces: Sleeping
Samuel Thomas committed · Commit fe1bd6e
1 Parent(s): 4000d20
new tools

Browse files:
- app.py +1 -1
- requirements.txt +2 -1
- tools.py +675 -76
app.py CHANGED
@@ -143,7 +143,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         task_id = hf_questions[r]['task_id']
         question_text = hf_questions[r]['question']
         full_answer = run_agent(agent, s)
-        submitted_answer =
+        submitted_answer = extract_final_answer(full_answer[-1].content)
         print(f"\n\nQuestion {r+1} Answer: {submitted_answer}\n\n")
         answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
         results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
requirements.txt CHANGED
@@ -24,4 +24,5 @@ duckduckgo-search==8.0.0
 sentencepiece
 nltk
 SpeechRecognition
-pandas
+pandas
+openai-whisper
tools.py CHANGED
@@ -6,16 +6,19 @@ import string
 import glob
 import shutil
 import gc
+import sys
 import uuid
 import signal
+from pathlib import Path
+import subprocess
 from datetime import datetime
 from io import BytesIO
 from contextlib import contextmanager
 from langchain_huggingface import HuggingFacePipeline
-from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set
+from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
 import time
 from collections import Counter
-from pydantic import Field
+from pydantic import Field, BaseModel
 import hashlib
 import json
 import numpy as np
@@ -44,6 +47,7 @@ from pydub import AudioSegment
 from pydub.silence import split_on_silence
 import nltk
 from nltk.corpus import words
+import pandas as pd
 
 # LangChain Ecosystem
 from langchain.docstore.document import Document
@@ -89,23 +93,21 @@ def create_llm_pipeline():
     #model_id = "meta-llama/Llama-3.3-70B-Instruct"
     #model_id = "mistralai/Mistral-Small-24B-Base-2501"
     model_id = "mistralai/Mistral-7B-Instruct-v0.3"
+    #model_id = "Meta-Llama/Llama-2-7b-chat-hf"
+    #model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
+    #model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
+    #model_id = "mistralai/Mistral-7B-Instruct-v0.2"
     #model_id = "Qwen/Qwen2-7B-Instruct"
-
-    # Load tokenizer explicitly with fast version
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_id,
-        use_fast=True,  # Force fast tokenizer
-        add_prefix_space=True  # Only if actually needed
-    )
-
+    #model_id = "GSAI-ML/LLaDA-8B-Instruct"
     return pipeline(
         "text-generation",
         model=model_id,
-
-        device_map="cpu",
+        device_map="auto",
         torch_dtype=torch.float16,
         max_new_tokens=1024,
-        temperature=0.
+        temperature=0.05,
+        do_sample=False,
+        repetition_penalty=1.2
     )
 
 # Define file extension sets for each category
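Note: a minimal sketch of how the updated pipeline settings behave (not part of the commit; the prompt is illustrative). With do_sample=False, generation is greedy, so the temperature value is typically ignored by transformers, while repetition_penalty still applies; device_map="auto" additionally requires the accelerate package.

# Hedged usage sketch of a pipeline configured like the one above.
from transformers import pipeline
import torch

llm = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    device_map="auto",            # needs `accelerate`
    torch_dtype=torch.float16,
    max_new_tokens=64,
    do_sample=False,              # greedy decoding; temperature is effectively unused
    repetition_penalty=1.2,       # still applied under greedy decoding
)
print(llm("[INST] What is the capital of France? [/INST]")[0]["generated_text"])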
@@ -150,21 +152,637 @@ def write_bytes_to_temp_dir(file_bytes: bytes, file_name: str) -> str:
     print(f"File written to: {file_path}")
     return file_path
 
-
 def extract_final_answer(text: str) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
+    """
+    Extracts the answer after the last 'FINAL ANSWER:' (case-insensitive),
+    removes any parenthetical immediately following a numeric answer,
+    strips trailing punctuation, sorts comma-separated lists,
+    and does not split numbers containing commas.
+    Returns an empty string if marker not found.
+    """
+    marker = "FINAL ANSWER:"
+    idx = text.lower().rfind(marker.lower())
+    if idx == -1:
+        return ""
+    # Extract answer after marker
+    result = text[idx + len(marker):].strip()
+    # Remove parenthetical immediately following a number at the start
+    result = re.sub(r'^(\d+(?:\.\d+)?)\s*\(.*?\)', r'\1', result)
+    # Remove trailing punctuation and whitespace
+    result = result.rstrip(string.punctuation + " ")
+    # Split on commas NOT between digits (i.e., not inside numbers)
+    # This regex splits on commas not surrounded by digits (to avoid splitting numbers like 1,000)
+    items = re.split(r',(?!\s*\d{3}\b)', result)
+    # If we have a list, sort it
+    if len(items) > 1:
+        items = [item.strip() for item in items]
+        # Try to sort numerically
+        try:
+            sorted_items = sorted(
+                items,
+                key=lambda x: float(re.sub(r'[^\d\.]', '', x))  # Remove non-numeric except .
+            )
+            return ', '.join(sorted_items)
+        except ValueError:
+            # Fallback: sort alphabetically
+            sorted_items = sorted(items, key=lambda x: x.lower())
+            return ', '.join(sorted_items)
+    return result
+
+
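Note: a quick sanity check of extract_final_answer (not part of the commit; expected outputs assume the regexes behave as written above):

# Hypothetical inputs; `re` and `string` are imported at the top of tools.py.
print(extract_final_answer("Thought: ...\nFINAL ANSWER: 42 (approximately)"))  # -> "42"
print(extract_final_answer("FINAL ANSWER: pear, apple, cherry"))               # -> "apple, cherry, pear"
print(extract_final_answer("no marker present"))                               # -> ""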
+class AudioTranscriptionInput(BaseModel):
+    """Input schema for AudioTranscriptionTool."""
+    file_path: str = Field(description="Path to the audio file to transcribe")
+    engine: Optional[str] = Field(default="google", description="Speech recognition engine to use")
+    language: Optional[str] = Field(default="en-US", description="Language of the audio")
+
+class AudioTranscriptionTool(BaseTool):
+    """Tool for transcribing audio files using local speech recognition."""
+
+    name: str = "audio_transcription"
+    description: str = """
+    Transcribes voice memo, audio files (mp3, wav, m4a, flac, etc.) to text using local speech recognition.
+    Input should be a dictionary with 'file_path' key containing the path to the audio file.
+    Optionally accepts 'engine' and 'language' parameters.
+    Returns the transcribed text as a string.
+    """
+    args_schema: type[BaseModel] = AudioTranscriptionInput
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self, **kwargs):
+        """Initialize the AudioTranscriptionTool."""
+        super().__init__(**kwargs)
+        self._init_speech_recognition()
+
+    def _init_speech_recognition(self):
+        """Initialize speech recognition components."""
+        try:
+            import speech_recognition as sr
+            from pydub import AudioSegment
+            object.__setattr__(self, 'recognizer', sr.Recognizer())
+            object.__setattr__(self, 'sr', sr)
+            object.__setattr__(self, 'AudioSegment', AudioSegment)
+        except ImportError as e:
+            raise ImportError(
+                "Required libraries not found. Install with: "
+                "pip install SpeechRecognition pydub"
+            ) from e
+
+    def _validate_audio_file(self, file_path: str) -> bool:
+        """Validate that the audio file exists and has a supported format."""
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Audio file not found: {file_path}")
+
+        # Check file extension - pydub supports many formats
+        supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension not in supported_formats:
+            raise ValueError(
+                f"Unsupported audio format: {file_extension}. "
+                f"Supported formats: {', '.join(supported_formats)}"
+            )
+
+        return True
+
+    def _convert_to_wav(self, file_path: str) -> str:
+        """Convert audio file to WAV format if needed."""
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension == '.wav':
+            return file_path
+
+        try:
+            # Convert to WAV using pydub
+            audio = self.AudioSegment.from_file(file_path)
+
+            # Create temporary WAV file
+            temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+            audio.export(temp_wav.name, format="wav")
+            return temp_wav.name
+        except Exception as e:
+            raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
+
+    def _transcribe_audio(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
+        """Transcribe audio file using local speech recognition."""
+        temp_wav_path = None
+
+        try:
+            # Convert to WAV if necessary
+            wav_path = self._convert_to_wav(file_path)
+            if wav_path != file_path:
+                temp_wav_path = wav_path
+
+            # Load audio file
+            with self.sr.AudioFile(wav_path) as source:
+                # Adjust for ambient noise
+                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
+                # Record the audio
+                audio_data = self.recognizer.record(source)
+
+            # Choose recognition engine
+            if engine == "google":
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+            elif engine == "sphinx":
+                transcript = self.recognizer.recognize_sphinx(audio_data, language=language)
+            elif engine == "wit":
+                # Note: requires WIT_AI_KEY environment variable
+                wit_key = os.getenv('WIT_AI_KEY')
+                if not wit_key:
+                    raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
+                transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
+            elif engine == "bing":
+                # Note: requires BING_KEY environment variable
+                bing_key = os.getenv('BING_KEY')
+                if not bing_key:
+                    raise ValueError("BING_KEY environment variable required for Bing engine")
+                transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
+            else:
+                # Default to Google
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+
+            return transcript
+
+        except self.sr.UnknownValueError:
+            return "Could not understand the audio - speech was unclear or inaudible"
+        except self.sr.RequestError as e:
+            return f"Error with speech recognition service: {str(e)}"
+        except Exception as e:
+            raise RuntimeError(f"Error transcribing audio: {str(e)}")
+        finally:
+            # Clean up temporary WAV file
+            if temp_wav_path and os.path.exists(temp_wav_path):
+                try:
+                    os.unlink(temp_wav_path)
+                except OSError:
+                    pass  # Ignore cleanup errors
+
+    def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
+        """
+        Internal method required by LangChain BaseTool.
+
+        Args:
+            file_path: Path to the audio file to transcribe
+            engine: Speech recognition engine to use
+            language: Language of the audio
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            # Validate audio file
+            self._validate_audio_file(file_path)
+
+            # Transcribe audio
+            transcript = self._transcribe_audio(
+                file_path=file_path,
+                engine=engine,
+                language=language
+            )
+
+            return transcript
+
+        except Exception as e:
+            error_msg = f"AudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
+    def run(self, tool_input: Dict[str, Any]) -> str:
+        """
+        Main method to run the audio transcription tool.
+
+        Args:
+            tool_input: Dictionary containing 'file_path' and optional parameters
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            # Extract parameters from input
+            file_path = tool_input.get('file_path')
+            if not file_path:
+                raise ValueError("file_path is required in tool_input")
+
+            engine = tool_input.get('engine', 'google')
+            language = tool_input.get('language', 'en-US')
+
+            # Call the internal _run method
+            return self._run(file_path=file_path, engine=engine, language=language)
+
+        except Exception as e:
+            error_msg = f"AudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
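Note: a hypothetical invocation of AudioTranscriptionTool (the file path is made up; the default "google" engine sends audio to Google's free web API via the SpeechRecognition library, so it needs network access):

# Hedged usage sketch.
tool = AudioTranscriptionTool()
transcript = tool.run({"file_path": "/tmp/voice_memo.mp3", "engine": "google", "language": "en-US"})
print(transcript)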
+
# Enhanced local transcription tool with multiple engine support
|
| 380 |
+
class AdvancedAudioTranscriptionTool(BaseTool):
|
| 381 |
+
"""Advanced tool with support for multiple local transcription engines including Whisper."""
|
| 382 |
+
|
| 383 |
+
name: str = "advanced_audio_transcription"
|
| 384 |
+
description: str = """
|
| 385 |
+
Advanced audio transcription tool supporting multiple engines including local Whisper.
|
| 386 |
+
Supports engines: 'whisper' (local), 'google', 'sphinx', 'wit', 'bing'.
|
| 387 |
+
Input should be a dictionary with 'file_path' key.
|
| 388 |
+
Returns the transcribed text as a string.
|
| 389 |
+
"""
|
| 390 |
+
args_schema: type[BaseModel] = AudioTranscriptionInput
|
| 391 |
+
|
| 392 |
+
class Config:
|
| 393 |
+
arbitrary_types_allowed = True
|
| 394 |
+
|
| 395 |
+
def __init__(self, **kwargs):
|
| 396 |
+
"""Initialize the AdvancedAudioTranscriptionTool."""
|
| 397 |
+
super().__init__(**kwargs)
|
| 398 |
+
self._init_speech_recognition()
|
| 399 |
+
self._init_whisper()
|
| 400 |
+
|
| 401 |
+
def _init_speech_recognition(self):
|
| 402 |
+
"""Initialize speech recognition components."""
|
| 403 |
+
try:
|
| 404 |
+
import speech_recognition as sr
|
| 405 |
+
from pydub import AudioSegment
|
| 406 |
+
object.__setattr__(self, 'recognizer', sr.Recognizer())
|
| 407 |
+
object.__setattr__(self, 'sr', sr)
|
| 408 |
+
object.__setattr__(self, 'AudioSegment', AudioSegment)
|
| 409 |
+
except ImportError as e:
|
| 410 |
+
raise ImportError(
|
| 411 |
+
"Required libraries not found. Install with: "
|
| 412 |
+
"pip install SpeechRecognition pydub"
|
| 413 |
+
) from e
|
| 414 |
+
|
| 415 |
+
def _init_whisper(self):
|
| 416 |
+
"""Initialize Whisper if available."""
|
| 417 |
+
try:
|
| 418 |
+
import whisper
|
| 419 |
+
object.__setattr__(self, 'whisper', whisper)
|
| 420 |
+
except ImportError:
|
| 421 |
+
object.__setattr__(self, 'whisper', None)
|
| 422 |
+
print("Warning: OpenAI Whisper not installed. Install with 'pip install openai-whisper' for local Whisper support.")
|
| 423 |
+
|
| 424 |
+
def _validate_audio_file(self, file_path: str) -> bool:
|
| 425 |
+
"""Validate that the audio file exists and has a supported format."""
|
| 426 |
+
if not os.path.exists(file_path):
|
| 427 |
+
raise FileNotFoundError(f"Audio file not found: {file_path}")
|
| 428 |
+
|
| 429 |
+
supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
|
| 430 |
+
file_extension = Path(file_path).suffix.lower()
|
| 431 |
+
|
| 432 |
+
if file_extension not in supported_formats:
|
| 433 |
+
raise ValueError(
|
| 434 |
+
f"Unsupported audio format: {file_extension}. "
|
| 435 |
+
f"Supported formats: {', '.join(supported_formats)}"
|
| 436 |
+
)
|
| 437 |
+
|
| 438 |
+
return True
|
| 439 |
+
|
| 440 |
+
def _transcribe_with_whisper(self, file_path: str, language: str = "en") -> str:
|
| 441 |
+
"""Transcribe using local Whisper model."""
|
| 442 |
+
if not self.whisper:
|
| 443 |
+
raise RuntimeError("Whisper not installed. Install with 'pip install openai-whisper'")
|
| 444 |
+
|
| 445 |
+
try:
|
| 446 |
+
# Load the model (you can change model size: tiny, base, small, medium, large)
|
| 447 |
+
model = self.whisper.load_model("base")
|
| 448 |
+
|
| 449 |
+
# Transcribe the audio
|
| 450 |
+
result = model.transcribe(file_path, language=language if language != "en-US" else "en")
|
| 451 |
+
|
| 452 |
+
return result["text"].strip()
|
| 453 |
+
|
| 454 |
+
except Exception as e:
|
| 455 |
+
raise RuntimeError(f"Error with Whisper transcription: {str(e)}")
|
| 456 |
+
|
| 457 |
+
def _convert_to_wav(self, file_path: str) -> str:
|
| 458 |
+
"""Convert audio file to WAV format if needed."""
|
| 459 |
+
file_extension = Path(file_path).suffix.lower()
|
| 460 |
+
|
| 461 |
+
if file_extension == '.wav':
|
| 462 |
+
return file_path
|
| 463 |
+
|
| 464 |
+
try:
|
| 465 |
+
audio = self.AudioSegment.from_file(file_path)
|
| 466 |
+
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
|
| 467 |
+
audio.export(temp_wav.name, format="wav")
|
| 468 |
+
return temp_wav.name
|
| 469 |
+
except Exception as e:
|
| 470 |
+
raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
|
| 471 |
+
|
| 472 |
+
def _transcribe_with_sr(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
|
| 473 |
+
"""Transcribe using speech_recognition library."""
|
| 474 |
+
temp_wav_path = None
|
| 475 |
+
|
| 476 |
+
try:
|
| 477 |
+
wav_path = self._convert_to_wav(file_path)
|
| 478 |
+
if wav_path != file_path:
|
| 479 |
+
temp_wav_path = wav_path
|
| 480 |
+
|
| 481 |
+
with self.sr.AudioFile(wav_path) as source:
|
| 482 |
+
self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
|
| 483 |
+
audio_data = self.recognizer.record(source)
|
| 484 |
+
|
| 485 |
+
if engine == "google":
|
| 486 |
+
transcript = self.recognizer.recognize_google(audio_data, language=language)
|
| 487 |
+
elif engine == "sphinx":
|
| 488 |
+
transcript = self.recognizer.recognize_sphinx(audio_data)
|
| 489 |
+
elif engine == "wit":
|
| 490 |
+
wit_key = os.getenv('WIT_AI_KEY')
|
| 491 |
+
if not wit_key:
|
| 492 |
+
raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
|
| 493 |
+
transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
|
| 494 |
+
elif engine == "bing":
|
| 495 |
+
bing_key = os.getenv('BING_KEY')
|
| 496 |
+
if not bing_key:
|
| 497 |
+
raise ValueError("BING_KEY environment variable required for Bing engine")
|
| 498 |
+
transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
|
| 499 |
+
else:
|
| 500 |
+
transcript = self.recognizer.recognize_google(audio_data, language=language)
|
| 501 |
+
|
| 502 |
+
return transcript
|
| 503 |
+
|
| 504 |
+
except self.sr.UnknownValueError:
|
| 505 |
+
return "Could not understand the audio - speech was unclear or inaudible"
|
| 506 |
+
except self.sr.RequestError as e:
|
| 507 |
+
return f"Error with speech recognition service: {str(e)}"
|
| 508 |
+
finally:
|
| 509 |
+
if temp_wav_path and os.path.exists(temp_wav_path):
|
| 510 |
+
try:
|
| 511 |
+
os.unlink(temp_wav_path)
|
| 512 |
+
except OSError:
|
| 513 |
+
pass
|
| 514 |
+
|
| 515 |
+
def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
|
| 516 |
+
"""
|
| 517 |
+
Internal method required by LangChain BaseTool.
|
| 518 |
+
|
| 519 |
+
Args:
|
| 520 |
+
file_path: Path to the audio file to transcribe
|
| 521 |
+
engine: Speech recognition engine to use
|
| 522 |
+
language: Language of the audio
|
| 523 |
+
|
| 524 |
+
Returns:
|
| 525 |
+
str: Transcribed text from the audio file
|
| 526 |
+
"""
|
| 527 |
+
try:
|
| 528 |
+
self._validate_audio_file(file_path)
|
| 529 |
+
|
| 530 |
+
# Use local Whisper if specified
|
| 531 |
+
if engine == "whisper":
|
| 532 |
+
transcript = self._transcribe_with_whisper(file_path, language)
|
| 533 |
+
else:
|
| 534 |
+
# Use speech_recognition library
|
| 535 |
+
transcript = self._transcribe_with_sr(file_path, engine, language)
|
| 536 |
+
|
| 537 |
+
return transcript
|
| 538 |
+
|
| 539 |
+
except Exception as e:
|
| 540 |
+
error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
|
| 541 |
+
print(error_msg)
|
| 542 |
+
return error_msg
|
| 543 |
+
|
| 544 |
+
def run(self, tool_input: Dict[str, Any]) -> str:
|
| 545 |
+
"""
|
| 546 |
+
Main method to run the advanced audio transcription tool.
|
| 547 |
+
|
| 548 |
+
Args:
|
| 549 |
+
tool_input: Dictionary containing 'file_path' and optional parameters
|
| 550 |
+
|
| 551 |
+
Returns:
|
| 552 |
+
str: Transcribed text from the audio file
|
| 553 |
+
"""
|
| 554 |
+
try:
|
| 555 |
+
file_path = tool_input.get('file_path')
|
| 556 |
+
if not file_path:
|
| 557 |
+
raise ValueError("file_path is required in tool_input")
|
| 558 |
+
|
| 559 |
+
engine = tool_input.get('engine', 'google')
|
| 560 |
+
language = tool_input.get('language', 'en-US')
|
| 561 |
+
|
| 562 |
+
# Call the internal _run method
|
| 563 |
+
return self._run(file_path=file_path, engine=engine, language=language)
|
| 564 |
+
|
| 565 |
+
except Exception as e:
|
| 566 |
+
error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
|
| 567 |
+
print(error_msg)
|
| 568 |
+
return error_msg
|
| 569 |
+
|
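Note: a hypothetical invocation of the Whisper path (fully local, matching the new openai-whisper entry in requirements.txt; Whisper also needs ffmpeg on PATH, and the "base" checkpoint is downloaded on first use):

# Hedged usage sketch; the file path is made up.
adv = AdvancedAudioTranscriptionTool()
print(adv.run({"file_path": "/tmp/interview.m4a", "engine": "whisper", "language": "en"}))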
 
+class ExcelReaderInput(BaseModel):
+    """Input schema for ExcelReaderTool."""
+    file_path: str = Field(description="Path to the Excel file to read")
+
+
+class ExcelReaderTool(BaseTool):
+    """Tool for reading Excel files and formatting them for LLM consumption."""
+
+    name: str = "excel_reader"
+    description: str = (
+        "Reads an Excel file from the specified file path and returns the entire "
+        "table from Sheet1 in a format that can be easily processed by an LLM. "
+        "Use for running math operations on a table of data. "
+        "Input should be a file path to an Excel file (.xlsx or .xls)."
+    )
+    args_schema: Type[BaseModel] = ExcelReaderInput
+
+    def _run(self, file_path: str, run_manager: Optional[Any] = None) -> str:
+        """
+        Execute the tool to read Excel file and return formatted table.
+
+        Args:
+            file_path: Path to the Excel file
+            run_manager: Optional callback manager
+
+        Returns:
+            Formatted string representation of the Excel table
+        """
+        try:
+            # Validate file exists
+            if not os.path.exists(file_path):
+                return f"Error: File not found at path: {file_path}"
+
+            # Validate file extension
+            if not file_path.lower().endswith(('.xlsx', '.xls')):
+                return f"Error: File must be an Excel file (.xlsx or .xls). Got: {file_path}"
+
+            # Read Excel file - specifically Sheet1
+            try:
+                df = pd.read_excel(file_path, sheet_name='Sheet1')
+            except ValueError as e:
+                if "Worksheet named 'Sheet1' not found" in str(e):
+                    # If Sheet1 doesn't exist, try reading the first sheet
+                    df = pd.read_excel(file_path, sheet_name=0)
+                else:
+                    raise e
+
+            # Check if dataframe is empty
+            if df.empty:
+                return "The Excel file contains no data in Sheet1."
+
+            # Format the table for LLM consumption
+            formatted_output = self._format_table_for_llm(df, file_path)
+
+            return formatted_output
+
+        except FileNotFoundError:
+            return f"Error: File not found at path: {file_path}"
+        except PermissionError:
+            return f"Error: Permission denied accessing file: {file_path}"
+        except Exception as e:
+            return f"Error reading Excel file: {str(e)}"
+
+    def _format_table_for_llm(self, df: pd.DataFrame, file_path: str) -> str:
+        """
+        Format the pandas DataFrame into a readable string format for LLMs.
+
+        Args:
+            df: The pandas DataFrame containing the Excel data
+            file_path: Original file path for reference
+
+        Returns:
+            Formatted string representation of the table
+        """
+        output_lines = []
+
+        # Add header information
+        #output_lines.append(f"EXCEL FILE DATA FROM: {os.path.basename(file_path)}")
+        #output_lines.append(f"Sheet: Sheet1")
+        #output_lines.append(f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns")
+        #output_lines.append("-" * 60)
+
+        # Add column information
+        #output_lines.append("COLUMNS:")
+        #for i, col in enumerate(df.columns, 1):
+        #    col_type = str(df[col].dtype)
+        #    non_null_count = df[col].count()
+        #    output_lines.append(f"  {i}. {col} ({col_type}) - {non_null_count} non-null values")
+
+        #output_lines.append("-" * 60)
+
+        # Add table data in a clean format
+        output_lines.append("TABLE DATA:")
+
+        # Convert DataFrame to string with proper formatting
+        # Handle potential NaN values and make it readable
+        df_clean = df.fillna("N/A")  # Replace NaN with readable placeholder
+
+        # Create a formatted table string
+        #table_str = df_clean.to_string(index=True, max_rows=None, max_cols=None)
+        #output_lines.append(table_str)
+
+        # Add summary statistics for numeric columns if they exist
+        numeric_cols = df.select_dtypes(include=['number']).columns
+
+        sums = df_clean[numeric_cols].sum()
+
+        # Step 2: Define which columns are food and which are drink
+        food_cols = [col for col in numeric_cols if col.lower() != 'soda']
+        drink_cols = [col for col in numeric_cols if col.lower() == 'soda']
+
+        # Step 3: Aggregate totals
+        food_total = sums[food_cols].sum()
+        drink_total = sums[drink_cols].sum()
+
+        # Step 4: Format the results as dollars
+        formatted_totals = {
+            'Food': f"${food_total:,.2f}",
+            'Drink': f"${drink_total:,.2f}"
+        }
+
+        # Step 5: Convert to string for display (optional)
+        result_string = '\n'.join([f"{k}: {v}" for k, v in formatted_totals.items()])
+
+        # Convert to string for display
+        #result_string = formatted.to_string()
+
+        output_lines.append(result_string)
+        #output_lines.append(df_clean[numeric_cols].sum())
+        if len(numeric_cols) > 0:
+            output_lines.append("-" * 60)
+            #output_lines.append("NUMERIC COLUMN SUMMARY:")
+            #for col in numeric_cols:
+            #    stats = df[col].describe()
+            #    output_lines.append(f"\n{col}:")
+            #    output_lines.append(f"  Count: {stats['count']}")
+            #    output_lines.append(f"  Mean: {stats['mean']:.2f}")
+            #    output_lines.append(f"  Min: {stats['min']}")
+            #    output_lines.append(f"  Max: {stats['max']}")
+
+        return "\n".join(output_lines)
+
+    async def _arun(self, file_path: str, run_manager: Optional[Any] = None) -> str:
+        """Async version of the tool (falls back to sync implementation)."""
+        return self._run(file_path, run_manager)
+
+
+
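Note: a hypothetical invocation of ExcelReaderTool. As written, _format_table_for_llm assumes a specific spreadsheet shape: every numeric column except one named 'soda' is treated as food, so the Food/Drink totals are only meaningful for that layout:

# Hedged usage sketch; 'sales.xlsx' is made up. invoke() is LangChain's Runnable entry point.
excel_tool = ExcelReaderTool()
print(excel_tool.invoke({"file_path": "/tmp/sales.xlsx"}))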
+
class PythonExecutorInput(BaseModel):
|
| 723 |
+
"""Input schema for PythonExecutor tool."""
|
| 724 |
+
file_path: str = Field(description="Path to the Python file to execute")
|
| 725 |
+
|
| 726 |
+
|
| 727 |
+
class PythonExecutorTool(BaseTool):
|
| 728 |
+
"""Tool that executes a Python file and returns the result."""
|
| 729 |
+
|
| 730 |
+
name: str = "python_executor"
|
| 731 |
+
description: str = "Executes a Python file from the given file path and returns the output"
|
| 732 |
+
args_schema: Type[BaseModel] = PythonExecutorInput
|
| 733 |
+
|
| 734 |
+
def _run(
|
| 735 |
+
self,
|
| 736 |
+
file_path: str,
|
| 737 |
+
run_manager: Optional[Any] = None,
|
| 738 |
+
) -> str:
|
| 739 |
+
"""Execute the Python file and return the result."""
|
| 740 |
+
try:
|
| 741 |
+
# Validate that the file exists
|
| 742 |
+
if not os.path.exists(file_path):
|
| 743 |
+
return f"Error: File '{file_path}' does not exist"
|
| 744 |
+
|
| 745 |
+
# Validate that it's a Python file
|
| 746 |
+
if not file_path.endswith('.py'):
|
| 747 |
+
return f"Error: '{file_path}' is not a Python file (.py extension required)"
|
| 748 |
+
|
| 749 |
+
# Execute the Python file
|
| 750 |
+
result = subprocess.run(
|
| 751 |
+
[sys.executable, file_path],
|
| 752 |
+
capture_output=True,
|
| 753 |
+
text=True,
|
| 754 |
+
timeout=600 # 30 second timeout to prevent hanging
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
+
# Prepare the output
|
| 758 |
+
output_parts = []
|
| 759 |
+
|
| 760 |
+
if result.stdout:
|
| 761 |
+
output_parts.append(f"STDOUT:\n{result.stdout}")
|
| 762 |
+
|
| 763 |
+
if result.stderr:
|
| 764 |
+
output_parts.append(f"STDERR:\n{result.stderr}")
|
| 765 |
+
|
| 766 |
+
if result.returncode != 0:
|
| 767 |
+
output_parts.append(f"Return code: {result.returncode}")
|
| 768 |
+
|
| 769 |
+
if not output_parts:
|
| 770 |
+
return "Script executed successfully with no output"
|
| 771 |
+
|
| 772 |
+
return "\n\n".join(output_parts)
|
| 773 |
+
|
| 774 |
+
except subprocess.TimeoutExpired:
|
| 775 |
+
return "Error: Script execution timed out (30 seconds)"
|
| 776 |
+
except Exception as e:
|
| 777 |
+
return f"Error executing Python file: {str(e)}"
|
| 778 |
+
|
| 779 |
+
async def _arun(
|
| 780 |
+
self,
|
| 781 |
+
file_path: str,
|
| 782 |
+
run_manager: Optional[Any] = None,
|
| 783 |
+
) -> str:
|
| 784 |
+
"""Async version - delegates to sync implementation."""
|
| 785 |
+
return self._run(file_path, run_manager)
 
 class EnhancedDuckDuckGoSearchTool(BaseTool):
     name: str = "enhanced_search"
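Note: a hypothetical invocation of PythonExecutorTool; scripts run in a subprocess under the same interpreter with a 600-second timeout, and there is no sandboxing beyond that:

# Hedged usage sketch; the script path is made up.
py_tool = PythonExecutorTool()
print(py_tool.invoke({"file_path": "/tmp/task_script.py"}))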
@@ -755,12 +1373,11 @@ class WikipediaSearchToolWithFAISS(BaseTool):
         return f"An unexpected error occurred: {str(e)}"
 
 
+
 class EnhancedYoutubeScreenshotQA(BaseTool):
-    name: str = "
+    name: str = "bird_species_screenshot_qa"
     description: str = (
-        "
-        "and answers questions using advanced visual QA with semantic analysis. "
-        "Use this tool for questions about the VIDEO or IMAGES in the video,"
+        "Use this tool to calculate the number of bird species on camera at any one time,"
         "Input should be a dict with keys: 'youtube_url', 'question', and optional parameters. "
         #"Optional parameters: 'frame_interval_seconds' (default: 10), 'max_frames' (default: 50), "
        #"'use_scene_detection' (default: True), 'parallel_processing' (default: True). "
@@ -796,8 +1413,8 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
         """Get configuration value with fallback to defaults"""
         defaults = {
-            'frame_interval_seconds':
-            'max_frames':
+            'frame_interval_seconds': 5,
+            'max_frames': 500,
             'use_scene_detection': True,
             'resize_frames': True,
             'parallel_processing': True,
@@ -822,6 +1439,11 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
                 "Salesforce/blip-vqa-base"
             ).to(self.device)
 
+            #self.processor_vqa = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+            #self.model_vqa = BlipForQuestionAnswering.from_pretrained(
+            #    "Salesforce/blip-vqa-capfilt-large"
+            #).to(self.device)
+
             print("BLIP VQA model loaded successfully")
         except Exception as e:
             print(f"Error initializing VQA model: {str(e)}")
@@ -1057,6 +1679,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _answer_question_on_frame(self, frame_path: str, question: str) -> Tuple[str, float]:
         """Answer question on single frame with confidence scoring"""
         try:
+            #ipdb.set_trace()
             image = Image.open(frame_path).convert('RGB')
             inputs = self.processor_vqa(image, question, return_tensors="pt").to(self.device)
 
@@ -1373,6 +1996,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _run(self, youtube_url, question, **kwargs) -> str:
         """Enhanced main execution method"""
         #ipdb.set_trace()
+        question = "How many unique bird species are on camera?"
 
         #input_data = query
         #youtube_url = input_data.get("youtube_url")
@@ -1411,20 +2035,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 
         # Format comprehensive result - Fixed the reference to stats
         result = f"""
-📊 **ANALYSIS SUMMARY**:
-• Confidence Score: {analysis_result['confidence']:.2%}
-• Frames Analyzed: {analysis_result['successful_analyses']}/{analysis_result['frame_count']}
-• Answer Consistency: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
-
-📈 **ANSWER DISTRIBUTION**:
-{chr(10).join([f"• {answer}: {count} frames" for answer, count in analysis_result['answer_distribution'].items()])}
-
-🔍 **SEMANTIC CLUSTERS**:
-{chr(10).join([f"• '{cluster}': {count} similar answers" for cluster, count in analysis_result['semantic_clusters'].items()])}
-
-⏱️ **TEMPORAL ANALYSIS**:
-• Answer Changes: {analysis_result['temporal_analysis'].get('total_changes', 0)}
-• Stability: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
 
 📊 **STATISTICAL SUMMARY**:
 • Minimum: {analysis_result['statistical_summary']['minimum']:.2f}
@@ -1433,10 +2043,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 • Median: {analysis_result['statistical_summary']['median']:.2f}
 • Range: {analysis_result['statistical_summary']['range']:.2f}
 
-🎯 **CONFIDENCE BREAKDOWN**:
-• Frequency-based: {analysis_result['frequency_confidence']:.2%}
-• Model-based: {analysis_result['average_model_confidence']:.2%}
-• Combined: {analysis_result['confidence']:.2%}
         """.strip()
 
         return result
@@ -1449,30 +2055,18 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 def create_enhanced_youtube_qa_tool(**kwargs):
     """Factory function to create the enhanced tool with custom parameters"""
     return EnhancedYoutubeScreenshotQA(**kwargs)
-# Example of creating the tool instance:
-# wikipedia_tool_faiss = WikipediaSearchToolWithFAISS()
-
-# To use this new tool in your agent, you would replace the old
-# `wikipedia_tool` instance with `wikipedia_tool_faiss` in your `tools` list.
-# For example:
-# tools = [wikipedia_tool_faiss, search_tool]
-# Create tool instances
-#wikipedia_tool = WikipediaSearchTool()
-
-# --- Define Call LLM function ---
-
-# 3. Improved LLM call with memory management
 
 
 class YouTubeTranscriptExtractor(BaseTool):
     name: str = "youtube_transcript_extractor"
     description: str = (
         "Downloads a YouTube video and extracts the complete audio transcript using speech recognition with speaker identification. "
-        "Use this tool
+        #"Use this tool for AUDIO questions, when the youtube question involves what a person says,"
+        "Use this tool for questions like 'what does jim say in response to a question in this video',"
         "Input should be a dict with keys: 'youtube_url' and optional parameters. "
-        "Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
-        "'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
-        "'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
+        #"Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
+        #"'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
+        #"'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
         "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US', 'enable_speaker_id': True}"
     )
 
@@ -2240,8 +2834,6 @@ def create_youtube_transcript_tool(**kwargs):
     """Factory function to create the transcript extraction tool with custom parameters"""
     return YouTubeTranscriptExtractor(**kwargs)
 
-
-
 # --- Model Configuration ---
 def create_llm_pipeline():
     #model_id = "meta-llama/Llama-2-13b-chat-hf"
@@ -2993,17 +3585,19 @@ def fix_backwards_text(text):
 
 # --- Run the Agent ---
 # Enhanced system prompt for better behavior
-
 def run_agent(agent, state: AgentState):
     """Enhanced agent initialization with better prompt and hallucination prevention."""
-    global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, tools
+    global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL, tools
 
     # Initialize tools
     WIKIPEDIA_TOOL = WikipediaSearchToolWithFAISS()
-    SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=
+    SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=8000)
     YOUTUBE_TOOL = EnhancedYoutubeScreenshotQA()
     YOUTUBE_AUDIO_TOOL = YouTubeTranscriptExtractor()
-
+    AUDIO_TRANSCRIPTION_TOOL = AudioTranscriptionTool()
+    EXCEL_TOOL = ExcelReaderTool()
+    PYTHON_TOOL = PythonExecutorTool()
+    tools = [WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_AUDIO_TOOL, YOUTUBE_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL]
 
     formatted_tools_description = render_text_description(tools)
     current_date_str = datetime.now().strftime("%Y-%m-%d")
@@ -3019,6 +3613,7 @@ CRITICAL INSTRUCTIONS:
 3. Use tools ONLY when you need specific information you don't know
 4. After using a tool, provide your FINAL ANSWER immediately
 5. STOP after giving your FINAL ANSWER - do not continue
+6. Do not repeat words in the question in the answer
 
 FORMAT for tool use:
 Thought: <brief reasoning>
@@ -3030,12 +3625,15 @@ FINAL ANSWER: [concise answer only]
 
 ANSWER FORMAT:
 - Numbers: no commas, no units unless specified
+- Questions on "how many" should be answered with a number ONLY
 - Strings: no articles, no abbreviations, digits in plain text
-- Lists: comma-separated
+- Lists: comma-separated either in ascending numeric order or alphabetical order as requested
 - Be extremely brief and concise
 - Do not provide additional context or explanations
 - Do not provide parentheticals
 
+
+
 IMPORTANT: You are responding to ONE question only. Do not ask follow-up questions or generate additional dialogue.
 
 Current date: {current_date_str}
@@ -3062,9 +3660,10 @@ Current date: {current_date_str}
 
     # Cleanup
     if result.get("done"):
-
-
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
         gc.collect()
         print("🧹 Released GPU memory after completion")
 
     return result["messages"]
+