HF_Final_Assignment_Template

Sleeping

App Files Files Community

Samuel Thomas committed on Jun 1, 2025

Commit

0264a40

1 Parent(s): 7ab4cd0

youtube transcript correction

Browse files

Files changed (1) hide show

tools.py +215 -488

tools.py CHANGED Viewed

@@ -2124,42 +2124,58 @@ def create_enhanced_youtube_qa_tool(**kwargs):
     """Factory function to create the enhanced tool with custom parameters"""
     return EnhancedYoutubeScreenshotQA(**kwargs)
 class YouTubeTranscriptExtractor(BaseTool):
     name: str = "youtube_transcript_extractor"
     description: str = (
-        "Downloads a YouTube video and extracts the complete audio transcript using speech recognition with speaker identification. "
-        #"Use this tool for AUDIO questions, when the youtube question involves what a person says,"
-        "Use this tool for questions like 'what does jim say in response to a question in this video',"
         "Input should be a dict with keys: 'youtube_url' and optional parameters. "
-        #"Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
-        #"'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
-        #"'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
-        "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US', 'enable_speaker_id': True}"
     )
     # Define Pydantic fields for the attributes we need to set
     recognizer: Any = Field(default=None, exclude=True)
     class Config:
-        # Allow arbitrary types
         arbitrary_types_allowed = True
-        # Allow extra fields to be set
         extra = "allow"
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         # Initialize directories
-        cache_dir = '/tmp/youtube_transcript_cache/'
-        audio_dir = '/tmp/audio/'
-        chunks_dir = '/tmp/audio_chunks/'
         # Initialize speech recognizer
         self.recognizer = sr.Recognizer()
         # Create directories
-        for dir_path in [cache_dir, audio_dir, chunks_dir]:
             os.makedirs(dir_path, exist_ok=True)
     def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
@@ -2168,19 +2184,10 @@ class YouTubeTranscriptExtractor(BaseTool):
             'language': 'en-US',
             'chunk_length_ms': 30000,  # 30 seconds
             'silence_thresh': -40,     # dB
-            'use_enhanced_model': True,
             'audio_quality': 'best',
             'cache_enabled': True,
-            'parallel_processing': True,
-            'overlap_ms': 1000,        # 1 second overlap between chunks
             'min_silence_len': 500,    # minimum silence length to split on
-            'energy_threshold': 4000,  # recognizer energy threshold
-            'pause_threshold': 0.8,    # recognizer pause threshold
-            'enable_speaker_id': True, # enable speaker identification
-            'max_speakers': 5,         # maximum number of speakers to identify
-            'speaker_min_duration': 2.0, # minimum duration (seconds) for speaker segment
-            'speaker_confidence_threshold': 0.6, # confidence threshold for speaker assignment
-            'voice_activity_threshold': 0.01     # threshold for voice activity detection
         }
         if input_data and key in input_data:
@@ -2193,8 +2200,7 @@ class YouTubeTranscriptExtractor(BaseTool):
     def _get_cache_path(self, video_hash: str, cache_type: str) -> str:
         """Get cache file path"""
-        cache_dir = '/tmp/youtube_transcript_cache/'
-        return os.path.join(cache_dir, f"{video_hash}_{cache_type}")
     def _load_from_cache(self, cache_path: str, cache_enabled: bool = True) -> Optional[Any]:
         """Load data from cache"""
@@ -2217,12 +2223,24 @@ class YouTubeTranscriptExtractor(BaseTool):
         except Exception as e:
             print(f"Error saving cache: {str(e)}")
     def download_youtube_audio(self, url: str, video_hash: str, input_data: Dict[str, Any] = None) -> Optional[str]:
         """Download YouTube video as audio file"""
-        audio_dir = '/tmp/audio/'
         audio_quality = self._get_config('audio_quality', 'best', input_data)
         output_filename = f'{video_hash}.wav'
-        output_path = os.path.join(audio_dir, output_filename)
         # Check cache
         cache_enabled = self._get_config('cache_enabled', True, input_data)
@@ -2231,147 +2249,97 @@ class YouTubeTranscriptExtractor(BaseTool):
             return output_path
         # Clean directory
-        self._clean_directory(audio_dir)
         try:
-            # First download as mp4/webm
-            temp_video_path = os.path.join(audio_dir, f'{video_hash}_temp.%(ext)s')
             ydl_opts = {
-                'format': 'bestaudio/best' if audio_quality == 'best' else 'worstaudio/worst',
-                'outtmpl': temp_video_path,
-                'quiet': True,
-                'extractaudio': True,
-                'audioformat': 'wav',
-                'audioquality': '192K' if audio_quality == 'best' else '64K',
             }
             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                 ydl.download([url])
-            # Find the downloaded file
-            temp_files = glob.glob(os.path.join(audio_dir, f'{video_hash}_temp.*'))
-            if not temp_files:
-                print("No temporary audio file found")
-                return None
-            temp_file = temp_files[0]
-            # Convert to WAV if not already
-            if not temp_file.endswith('.wav'):
-                try:
-                    audio = AudioSegment.from_file(temp_file)
-                    audio.export(output_path, format="wav")
-                    os.remove(temp_file)  # Clean up temp file
-                except Exception as e:
-                    print(f"Error converting audio: {str(e)}")
-                    # Try to rename if it's already the right format
-                    if os.path.exists(temp_file):
-                        os.rename(temp_file, output_path)
-            else:
-                os.rename(temp_file, output_path)
             if os.path.exists(output_path):
-                print(f"Audio extracted successfully: {output_path}")
                 return output_path
             else:
-                print("Audio extraction completed but file not found")
                 return None
         except Exception as e:
             print(f"Error downloading YouTube audio: {str(e)}")
             return None
-    def _clean_directory(self, directory: str):
-        """Clean directory contents"""
-        if os.path.exists(directory):
-            for filename in os.listdir(directory):
-                file_path = os.path.join(directory, filename)
-                try:
-                    if os.path.isfile(file_path) or os.path.islink(file_path):
-                        os.unlink(file_path)
-                    elif os.path.isdir(file_path):
-                        shutil.rmtree(file_path)
-                except Exception as e:
-                    print(f'Failed to delete {file_path}. Reason: {e}')
-    def _extract_voice_features(self, audio_path: str) -> Optional[np.ndarray]:
-        """Extract voice features for speaker identification using librosa"""
-        try:
-            # Load audio with librosa
-            y, sr = librosa.load(audio_path, sr=None)
-            # Extract MFCC features (commonly used for speaker identification)
-            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-            # Extract additional features
-            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
-            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
-            zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
-            # Combine features and take mean across time
-            features = np.concatenate([
-                np.mean(mfccs, axis=1),
-                np.mean(spectral_centroids),
-                np.mean(spectral_rolloff),
-                np.mean(zero_crossing_rate)
-            ])
-            return features
-        except Exception as e:
-            print(f"Error extracting voice features from {audio_path}: {str(e)}")
-            return None
-    def _detect_voice_activity(self, audio_path: str, input_data: Dict[str, Any] = None) -> List[Tuple[float, float]]:
-        """Detect voice activity in audio chunk"""
-        try:
-            y, sr = librosa.load(audio_path, sr=None)
-            # Simple voice activity detection based on energy
-            frame_length = int(0.025 * sr)  # 25ms frames
-            hop_length = int(0.010 * sr)    # 10ms hop
-            # Calculate short-time energy
-            energy = []
-            for i in range(0, len(y) - frame_length, hop_length):
-                frame = y[i:i + frame_length]
-                energy.append(np.sum(frame ** 2))
-            energy = np.array(energy)
-            threshold = self._get_config('voice_activity_threshold', 0.01, input_data)
-            # Find voice segments
-            voice_frames = energy > (np.max(energy) * threshold)
-            # Convert frame indices to time segments
-            voice_segments = []
-            in_voice = False
-            start_time = 0
-            for i, is_voice in enumerate(voice_frames):
-                time_sec = i * hop_length / sr
-                if is_voice and not in_voice:
-                    start_time = time_sec
-                    in_voice = True
-                elif not is_voice and in_voice:
-                    voice_segments.append((start_time, time_sec))
-                    in_voice = False
-            # Close last segment if needed
-            if in_voice:
-                voice_segments.append((start_time, len(y) / sr))
-            return voice_segments
-        except Exception as e:
-            print(f"Error in voice activity detection: {str(e)}")
-            return [(0, librosa.get_duration(filename=audio_path))]
     def _split_audio_intelligent(self, audio_path: str, input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
-        """Split audio into chunks intelligently based on silence and voice activity"""
-        chunks_dir = '/tmp/audio_chunks/'
-        self._clean_directory(chunks_dir)
         try:
             # Load audio
@@ -2402,40 +2370,46 @@ class YouTubeTranscriptExtractor(BaseTool):
             # Save chunks and create metadata
             chunk_data = []
             for i, chunk in enumerate(chunks):
                 if len(chunk) < 1000:  # Skip very short chunks
                     continue
-                chunk_filename = os.path.join(chunks_dir, f"chunk_{i:04d}.wav")
                 chunk.export(chunk_filename, format="wav")
-                # Calculate timing information
-                start_time = sum(len(chunks[j]) for j in range(i)) / 1000.0  # in seconds
                 duration = len(chunk) / 1000.0  # in seconds
                 chunk_info = {
                     'filename': chunk_filename,
                     'index': i,
-                    'start_time': start_time,
                     'duration': duration,
-                    'end_time': start_time + duration
                 }
                 chunk_data.append(chunk_info)
             print(f"Split audio into {len(chunk_data)} chunks")
             return chunk_data
         except Exception as e:
             print(f"Error splitting audio: {str(e)}")
-            # Fallback: return original file
-            return [{
-                'filename': audio_path,
-                'index': 0,
-                'start_time': 0,
-                'duration': len(AudioSegment.from_wav(audio_path)) / 1000.0,
-                'end_time': len(AudioSegment.from_wav(audio_path)) / 1000.0
-            }]
     def _transcribe_audio_chunk(self, chunk_info: Dict[str, Any], input_data: Dict[str, Any] = None) -> Dict[str, Any]:
         """Transcribe a single audio chunk"""
@@ -2443,375 +2417,115 @@ class YouTubeTranscriptExtractor(BaseTool):
         try:
             language = self._get_config('language', 'en-US', input_data)
-            # Configure recognizer
-            self.recognizer.energy_threshold = self._get_config('energy_threshold', 4000, input_data)
-            self.recognizer.pause_threshold = self._get_config('pause_threshold', 0.8, input_data)
             with sr.AudioFile(chunk_path) as source:
                 # Adjust for ambient noise
                 self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
                 audio_data = self.recognizer.record(source)
-            # Try Google Speech Recognition first (most accurate)
             try:
                 text = self.recognizer.recognize_google(audio_data, language=language)
-                result = {
                     'text': text,
-                    'confidence': 1.0,  # Google doesn't provide confidence
-                    'method': 'google',
-                    'chunk': os.path.basename(chunk_path),
                     'start_time': chunk_info['start_time'],
                     'end_time': chunk_info['end_time'],
                     'duration': chunk_info['duration'],
-                    'index': chunk_info['index']
                 }
-                # Extract voice features if speaker ID is enabled
-                if self._get_config('enable_speaker_id', True, input_data):
-                    features = self._extract_voice_features(chunk_path)
-                    result['voice_features'] = features.tolist() if features is not None else None
-                return result
             except sr.UnknownValueError:
-                # Try alternative recognition methods
                 try:
-                    # Try with alternative language detection
                     text = self.recognizer.recognize_google(audio_data)
-                    result = {
                         'text': text,
-                        'confidence': 0.8,  # Lower confidence for language mismatch
-                        'method': 'google_auto',
-                        'chunk': os.path.basename(chunk_path),
                         'start_time': chunk_info['start_time'],
                         'end_time': chunk_info['end_time'],
                         'duration': chunk_info['duration'],
-                        'index': chunk_info['index']
                     }
-                    if self._get_config('enable_speaker_id', True, input_data):
-                        features = self._extract_voice_features(chunk_path)
-                        result['voice_features'] = features.tolist() if features is not None else None
-                    return result
                 except sr.UnknownValueError:
                     return {
                         'text': '[INAUDIBLE]',
                         'confidence': 0.0,
-                        'method': 'failed',
-                        'chunk': os.path.basename(chunk_path),
                         'start_time': chunk_info['start_time'],
                         'end_time': chunk_info['end_time'],
                         'duration': chunk_info['duration'],
                         'index': chunk_info['index'],
-                        'voice_features': None
                     }
             except sr.RequestError as e:
-                print(f"Google Speech Recognition error: {e}")
                 return {
-                    'text': '[RECOGNITION_ERROR]',
                     'confidence': 0.0,
-                    'method': 'error',
-                    'chunk': os.path.basename(chunk_path),
                     'start_time': chunk_info['start_time'],
                     'end_time': chunk_info['end_time'],
                     'duration': chunk_info['duration'],
                     'index': chunk_info['index'],
-                    'error': str(e),
-                    'voice_features': None
                 }
         except Exception as e:
-            print(f"Error transcribing chunk {chunk_path}: {str(e)}")
             return {
-                'text': '[ERROR]',
                 'confidence': 0.0,
-                'method': 'error',
-                'chunk': os.path.basename(chunk_path),
                 'start_time': chunk_info.get('start_time', 0),
                 'end_time': chunk_info.get('end_time', 0),
                 'duration': chunk_info.get('duration', 0),
                 'index': chunk_info.get('index', 0),
-                'error': str(e),
-                'voice_features': None
             }
-    def _identify_speakers(self, transcript_results: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
-        """Identify speakers using voice features clustering"""
-        enable_speaker_id = self._get_config('enable_speaker_id', True, input_data)
-        if not enable_speaker_id:
-            # Add default speaker tags
-            for result in transcript_results:
-                result['speaker_id'] = 'SPEAKER_1'
-                result['speaker_confidence'] = 1.0
-            return transcript_results
-        try:
-            # Filter results with valid voice features and text
-            valid_results = []
-            features_list = []
-            for result in transcript_results:
-                if (result.get('voice_features') is not None and
-                    result['text'] not in ['[INAUDIBLE]', '[RECOGNITION_ERROR]', '[ERROR]', '[PROCESSING_ERROR]']):
-                    valid_results.append(result)
-                    features_list.append(result['voice_features'])
-            if len(features_list) < 2:
-                # Not enough data for clustering
-                for result in transcript_results:
-                    result['speaker_id'] = 'SPEAKER_1'
-                    result['speaker_confidence'] = 1.0
-                return transcript_results
-            # Normalize features
-            features_array = np.array(features_list)
-            scaler = StandardScaler()
-            normalized_features = scaler.fit_transform(features_array)
-            # Determine optimal number of speakers
-            max_speakers = min(self._get_config('max_speakers', 5, input_data), len(features_list))
-            # Use elbow method to find optimal clusters (simplified)
-            best_k = 1
-            if len(features_list) > 1:
-                best_score = float('inf')
-                for k in range(1, min(max_speakers + 1, len(features_list) + 1)):
-                    try:
-                        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
-                        labels = kmeans.fit_predict(normalized_features)
-                        if k > 1:
-                            score = kmeans.inertia_
-                            if score < best_score:
-                                best_score = score
-                                best_k = k
-                    except:
-                        continue
-                # Don't use too many clusters for short audio
-                if len(features_list) < 10:
-                    best_k = min(best_k, 2)
-            # Perform final clustering
-            kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
-            speaker_labels = kmeans.fit_predict(normalized_features)
-            # Calculate speaker assignment confidence
-            distances = kmeans.transform(normalized_features)
-            confidences = []
-            for i, label in enumerate(speaker_labels):
-                # Confidence based on distance to assigned cluster vs. nearest other cluster
-                dist_to_assigned = distances[i][label]
-                other_distances = np.delete(distances[i], label)
-                if len(other_distances) > 0:
-                    dist_to_nearest_other = np.min(other_distances)
-                    confidence = max(0.1, min(1.0, dist_to_nearest_other / (dist_to_assigned + 1e-6)))
-                else:
-                    confidence = 1.0
-                confidences.append(confidence)
-            # Assign speaker IDs back to results
-            valid_idx = 0
-            speaker_duration = {}  # Track duration per speaker
-            for result in transcript_results:
-                if (result.get('voice_features') is not None and
-                    result['text'] not in ['[INAUDIBLE]', '[RECOGNITION_ERROR]', '[ERROR]', '[PROCESSING_ERROR]']):
-                    speaker_label = speaker_labels[valid_idx]
-                    confidence = confidences[valid_idx]
-                    # Filter by confidence threshold
-                    conf_threshold = self._get_config('speaker_confidence_threshold', 0.6, input_data)
-                    if confidence < conf_threshold:
-                        speaker_id = 'SPEAKER_UNKNOWN'
-                    else:
-                        speaker_id = f'SPEAKER_{speaker_label + 1}'
-                    result['speaker_id'] = speaker_id
-                    result['speaker_confidence'] = confidence
-                    # Track speaker duration
-                    if speaker_id in speaker_duration:
-                        speaker_duration[speaker_id] += result['duration']
-                    else:
-                        speaker_duration[speaker_id] = result['duration']
-                    valid_idx += 1
-                else:
-                    # Handle invalid results
-                    result['speaker_id'] = 'SPEAKER_UNKNOWN'
-                    result['speaker_confidence'] = 0.0
-            # Filter out speakers with insufficient duration
-            min_duration = self._get_config('speaker_min_duration', 2.0, input_data)
-            speakers_to_merge = [s for s, d in speaker_duration.items() if d < min_duration and s != 'SPEAKER_UNKNOWN']
-            # Merge low-duration speakers into SPEAKER_UNKNOWN
-            for result in transcript_results:
-                if result['speaker_id'] in speakers_to_merge:
-                    result['speaker_id'] = 'SPEAKER_UNKNOWN'
-                    result['speaker_confidence'] = 0.3
-            print(f"Identified {best_k} speakers based on voice characteristics")
-            return transcript_results
-        except Exception as e:
-            print(f"Error in speaker identification: {str(e)}")
-            # Fallback: assign all to single speaker
-            for result in transcript_results:
-                result['speaker_id'] = 'SPEAKER_1'
-                result['speaker_confidence'] = 1.0
-            return transcript_results
     def _transcribe_chunks_parallel(self, chunk_data: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
         """Transcribe audio chunks in parallel"""
         results = []
-        parallel_processing = self._get_config('parallel_processing', True, input_data)
-        if parallel_processing:
-            # Use fewer workers for speech recognition to avoid API limits
-            max_workers = min(3, len(chunk_data))
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                future_to_chunk = {
-                    executor.submit(self._transcribe_audio_chunk, chunk_info, input_data): chunk_info
-                    for chunk_info in chunk_data
-                }
-                for future in as_completed(future_to_chunk):
-                    chunk_info = future_to_chunk[future]
-                    try:
-                        result = future.result()
-                        results.append(result)
-                        print(f"Transcribed {result['chunk']}: {result['text'][:50]}..." if len(result['text']) > 50 else f"Transcribed {result['chunk']}: {result['text']}")
-                    except Exception as e:
-                        print(f"Error processing {chunk_info['filename']}: {str(e)}")
-                        results.append({
-                            'text': '[PROCESSING_ERROR]',
-                            'confidence': 0.0,
-                            'method': 'error',
-                            'chunk': os.path.basename(chunk_info['filename']),
-                            'start_time': chunk_info.get('start_time', 0),
-                            'end_time': chunk_info.get('end_time', 0),
-                            'duration': chunk_info.get('duration', 0),
-                            'index': chunk_info.get('index', 0),
-                            'error': str(e),
-                            'voice_features': None
-                        })
-        else:
-            for chunk_info in chunk_data:
-                result = self._transcribe_audio_chunk(chunk_info, input_data)
-                results.append(result)
-                print(f"Transcribed {result['chunk']}: {result['text'][:50]}..." if len(result['text']) > 50 else f"Transcribed {result['chunk']}: {result['text']}")
         # Sort results by chunk index to maintain order
         results.sort(key=lambda x: x['index'])
         return results
-    def _post_process_transcript(self, transcript_results: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> Dict[str, Any]:
-        """Post-process and analyze transcript results with speaker information"""
-        enable_speaker_id = self._get_config('enable_speaker_id', True, input_data)
-        # Identify speakers if enabled
-        if enable_speaker_id:
-            transcript_results = self._identify_speakers(transcript_results, input_data)
-        # Combine text with speaker tags
-        full_text_parts = []
-        speaker_tagged_text = []
-        successful_chunks = 0
-        total_confidence = 0.0
-        method_counts = {}
-        speaker_stats = {}
-        current_speaker = None
-        current_speaker_text = []
-        for result in transcript_results:
-            text = result['text']
-            speaker = result.get('speaker_id', 'SPEAKER_1')
-            start_time = result.get('start_time', 0)
-            if text not in ['[INAUDIBLE]', '[RECOGNITION_ERROR]', '[ERROR]', '[PROCESSING_ERROR]']:
-                full_text_parts.append(text)
-                successful_chunks += 1
-                total_confidence += result['confidence']
-                # Handle speaker transitions
-                if enable_speaker_id:
-                    if current_speaker != speaker:
-                        # Save previous speaker's text
-                        if current_speaker and current_speaker_text:
-                            combined_text = ' '.join(current_speaker_text)
-                            speaker_tagged_text.append(f"[{current_speaker}]: {combined_text}")
-                        # Start new speaker
-                        current_speaker = speaker
-                        current_speaker_text = [text]
-                    else:
-                        # Continue with same speaker
-                        current_speaker_text.append(text)
-                else:
-                    speaker_tagged_text.append(text)
-                # Update speaker statistics
-                if speaker in speaker_stats:
-                    speaker_stats[speaker]['duration'] += result.get('duration', 0)
-                    speaker_stats[speaker]['word_count'] += len(text.split())
-                    speaker_stats[speaker]['segments'] += 1
-                else:
-                    speaker_stats[speaker] = {
-                        'duration': result.get('duration', 0),
-                        'word_count': len(text.split()),
-                        'segments': 1,
-                        'confidence': result.get('speaker_confidence', 1.0)
-                    }
-            method = result['method']
-            method_counts[method] = method_counts.get(method, 0) + 1
-        # Add final speaker text
-        if enable_speaker_id and current_speaker and current_speaker_text:
-            combined_text = ' '.join(current_speaker_text)
-            speaker_tagged_text.append(f"[{current_speaker}]: {combined_text}")
-        # Combine texts
-        combined_text = ' '.join(full_text_parts)
-        speaker_formatted_text = combined_text
-        # Calculate statistics
-        word_count = len(combined_text.split()) if combined_text else 0
-        char_count = len(combined_text)
-        avg_confidence = total_confidence / max(1, successful_chunks)
-        success_rate = successful_chunks / len(transcript_results) if transcript_results else 0
-        # Estimate speaking duration (rough approximation: 150 words per minute)
-        estimated_duration_minutes = word_count / 150 if word_count > 0 else 0
-        return {
-            'full_transcript': combined_text,
-            'speaker_tagged_transcript': speaker_formatted_text,
-            'word_count': word_count,
-            'character_count': char_count,
-            'chunk_count': len(transcript_results),
-            'successful_chunks': successful_chunks,
-            'success_rate': success_rate,
-            'average_confidence': avg_confidence,
-            'method_distribution': method_counts,
-            'estimated_duration_minutes': estimated_duration_minutes,
-            'speaker_identification_enabled': enable_speaker_id,
-            'speaker_statistics': speaker_stats,
-            'total_speakers': len([s for s in speaker_stats.keys() if s != 'SPEAKER_UNKNOWN']),
-            'detailed_results': transcript_results
-        }
     def extract_transcript(self, audio_path: str, video_hash: str, input_data: Dict[str, Any] = None) -> Dict[str, Any]:
         """Extract complete transcript from audio file"""
         cache_enabled = self._get_config('cache_enabled', True, input_data)
-        enable_speaker_id = self._get_config('enable_speaker_id', True, input_data)
-        cache_suffix = "transcript_with_speakers.json" if enable_speaker_id else "transcript.json"
-        cache_path = self._get_cache_path(video_hash, cache_suffix)
         # Check cache
         cached_transcript = self._load_from_cache(cache_path, cache_enabled)
@@ -2828,7 +2542,6 @@ class YouTubeTranscriptExtractor(BaseTool):
                 return {
                     'error': 'Failed to split audio into chunks',
                     'full_transcript': '',
-                    'speaker_tagged_transcript': '',
                     'success_rate': 0.0
                 }
@@ -2836,17 +2549,31 @@ class YouTubeTranscriptExtractor(BaseTool):
             print(f"Transcribing {len(chunk_data)} audio chunks...")
             transcript_results = self._transcribe_chunks_parallel(chunk_data, input_data)
-            # Step 3: Post-process and combine results
-            print("Post-processing transcript and identifying speakers...")
-            final_result = self._post_process_transcript(transcript_results, input_data)
-            # Add timestamp
-            final_result['extraction_timestamp'] = time.time()
-            final_result['extraction_date'] = time.strftime('%Y-%m-%d %H:%M:%S')
             # Cache results
             self._save_to_cache(cache_path, final_result, cache_enabled)
             return final_result
         except Exception as e:
@@ -2854,7 +2581,6 @@ class YouTubeTranscriptExtractor(BaseTool):
             return {
                 'error': str(e),
                 'full_transcript': '',
-                'speaker_tagged_transcript': '',
                 'success_rate': 0.0
             }
@@ -2876,7 +2602,7 @@ class YouTubeTranscriptExtractor(BaseTool):
             print(f"Downloading YouTube audio from {youtube_url}...")
             audio_path = self.download_youtube_audio(youtube_url, video_hash, input_data)
             if not audio_path or not os.path.exists(audio_path):
-                return "Error: Failed to download the YouTube audio."
             # Step 2: Extract transcript
             print("Extracting audio transcript...")
@@ -2885,18 +2611,19 @@ class YouTubeTranscriptExtractor(BaseTool):
             if transcript_result.get("error"):
                 return f"Error: {transcript_result['error']}"
-            # Choose the appropriate transcript
-            main_transcript = transcript_result.get('full_transcript')
-            #ipdb.set_trace()
-            print(f"Transcript extracted: {main_transcript[:50]}..." if len(main_transcript) > 50 else f"Transcript extracted: {main_transcript}")
             return "TRANSCRIPT: " + main_transcript
         except Exception as e:
             return f"Error during transcript extraction: {str(e)}"
 # Factory function to create the tool
 def create_youtube_transcript_tool(**kwargs):
     """Factory function to create the transcript extraction tool with custom parameters"""

     """Factory function to create the enhanced tool with custom parameters"""
     return EnhancedYoutubeScreenshotQA(**kwargs)
+import os
+import json
+import hashlib
+import time
+import shutil
+import glob
+from typing import Dict, Any, List, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import yt_dlp
+import speech_recognition as sr
+from pydantic import Field
+from pydantic.v1 import BaseModel
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+class BaseTool(BaseModel):
+    name: str
+    description: str
 class YouTubeTranscriptExtractor(BaseTool):
     name: str = "youtube_transcript_extractor"
     description: str = (
+        "Downloads a YouTube video and extracts the complete audio transcript using speech recognition. "
+        "Use this tool for questions about what people say in YouTube videos. "
         "Input should be a dict with keys: 'youtube_url' and optional parameters. "
+        "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US'}"
     )
     # Define Pydantic fields for the attributes we need to set
     recognizer: Any = Field(default=None, exclude=True)
     class Config:
         arbitrary_types_allowed = True
         extra = "allow"
     def __init__(self, **kwargs):
         """Set up the working directories and the speech recognizer.

         All keyword arguments are forwarded unchanged to the base class
         initializer.
         """
         super().__init__(**kwargs)

         # Fixed scratch locations: cached results, downloaded audio and
         # per-chunk audio files.
         self.cache_dir = '/tmp/youtube_transcript_cache/'
         self.audio_dir = '/tmp/audio/'
         self.chunks_dir = '/tmp/audio_chunks/'

         # Tune the recognizer first, then attach it to the instance.
         recognizer = sr.Recognizer()
         recognizer.energy_threshold = 4000
         recognizer.pause_threshold = 0.8
         self.recognizer = recognizer

         # Make sure every working directory exists before first use.
         for directory in (self.cache_dir, self.audio_dir, self.chunks_dir):
             os.makedirs(directory, exist_ok=True)
     def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
             'language': 'en-US',
             'chunk_length_ms': 30000,  # 30 seconds
             'silence_thresh': -40,     # dB
             'audio_quality': 'best',
             'cache_enabled': True,
             'min_silence_len': 500,    # minimum silence length to split on
+            'overlap_ms': 1000,        # 1 second overlap between chunks
         }
         if input_data and key in input_data:
     def _get_cache_path(self, video_hash: str, cache_type: str) -> str:
         """Get cache file path"""
+        return os.path.join(self.cache_dir, f"{video_hash}_{cache_type}")
     def _load_from_cache(self, cache_path: str, cache_enabled: bool = True) -> Optional[Any]:
         """Load data from cache"""
         except Exception as e:
             print(f"Error saving cache: {str(e)}")
+    def _clean_directory(self, directory: str):
+        """Clean directory contents"""
+        if os.path.exists(directory):
+            for filename in os.listdir(directory):
+                file_path = os.path.join(directory, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    print(f'Failed to delete {file_path}. Reason: {e}')
     def download_youtube_audio(self, url: str, video_hash: str, input_data: Dict[str, Any] = None) -> Optional[str]:
         """Download YouTube video as audio file"""
         audio_quality = self._get_config('audio_quality', 'best', input_data)
         output_filename = f'{video_hash}.wav'
+        output_path = os.path.join(self.audio_dir, output_filename)
         # Check cache
         cache_enabled = self._get_config('cache_enabled', True, input_data)
             return output_path
         # Clean directory
+        self._clean_directory(self.audio_dir)
         try:
+            # Updated yt-dlp configuration for better compatibility
             ydl_opts = {
+                'format': 'bestaudio[ext=m4a]/bestaudio/best',
+                'outtmpl': os.path.join(self.audio_dir, f'{video_hash}.%(ext)s'),
+                'quiet': False,  # Set to False for debugging
+                'no_warnings': False,
+                'extract_flat': False,
+                'writethumbnail': False,
+                'writeinfojson': False,
+                'postprocessors': [{
+                    'key': 'FFmpegExtractAudio',
+                    'preferredcodec': 'wav',
+                    'preferredquality': '192' if audio_quality == 'best' else '128',
+                }],
+                # Add user agent and headers to avoid blocking
+                'http_headers': {
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                },
+                # Add cookie handling
+                'cookiefile': None,
+                'nocheckcertificate': True,
             }
             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                print(f"Downloading audio from: {url}")
                 ydl.download([url])
+            # Check if the output file exists
             if os.path.exists(output_path):
+                print(f"Audio downloaded successfully: {output_path}")
                 return output_path
             else:
+                # Look for any downloaded file with the video hash
+                possible_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}.*'))
+                if possible_files:
+                    # Convert to WAV if needed
+                    source_file = possible_files[0]
+                    if not source_file.endswith('.wav'):
+                        try:
+                            audio = AudioSegment.from_file(source_file)
+                            audio.export(output_path, format="wav")
+                            os.remove(source_file)  # Clean up original
+                            print(f"Audio converted to WAV: {output_path}")
+                            return output_path
+                        except Exception as e:
+                            print(f"Error converting audio: {str(e)}")
+                            return None
+                    else:
+                        return source_file
+                print("No audio file found after download")
                 return None
         except Exception as e:
             print(f"Error downloading YouTube audio: {str(e)}")
+            # Try alternative format as fallback
+            try:
+                print("Trying alternative download method...")
+                fallback_opts = {
+                    'format': 'worst[ext=mp4]',
+                    'outtmpl': os.path.join(self.audio_dir, f'{video_hash}_fallback.%(ext)s'),
+                    'quiet': False,
+                }
+                with yt_dlp.YoutubeDL(fallback_opts) as ydl:
+                    ydl.download([url])
+                # Look for fallback file and convert
+                fallback_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}_fallback.*'))
+                if fallback_files:
+                    source_file = fallback_files[0]
+                    try:
+                        audio = AudioSegment.from_file(source_file)
+                        audio.export(output_path, format="wav")
+                        os.remove(source_file)
+                        print(f"Fallback audio converted: {output_path}")
+                        return output_path
+                    except Exception as conv_e:
+                        print(f"Error converting fallback audio: {str(conv_e)}")
+            except Exception as fallback_e:
+                print(f"Fallback download also failed: {str(fallback_e)}")
             return None
     def _split_audio_intelligent(self, audio_path: str, input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+        """Split audio into chunks intelligently based on silence"""
+        self._clean_directory(self.chunks_dir)
         try:
             # Load audio
             # Save chunks and create metadata
             chunk_data = []
+            current_time = 0
             for i, chunk in enumerate(chunks):
                 if len(chunk) < 1000:  # Skip very short chunks
                     continue
+                chunk_filename = os.path.join(self.chunks_dir, f"chunk_{i:04d}.wav")
                 chunk.export(chunk_filename, format="wav")
                 duration = len(chunk) / 1000.0  # in seconds
                 chunk_info = {
                     'filename': chunk_filename,
                     'index': i,
+                    'start_time': current_time,
                     'duration': duration,
+                    'end_time': current_time + duration
                 }
                 chunk_data.append(chunk_info)
+                current_time += duration
             print(f"Split audio into {len(chunk_data)} chunks")
             return chunk_data
         except Exception as e:
             print(f"Error splitting audio: {str(e)}")
+            # Fallback: return original file as single chunk
+            try:
+                audio = AudioSegment.from_wav(audio_path)
+                duration = len(audio) / 1000.0
+                return [{
+                    'filename': audio_path,
+                    'index': 0,
+                    'start_time': 0,
+                    'duration': duration,
+                    'end_time': duration
+                }]
+            except:
+                return []
     def _transcribe_audio_chunk(self, chunk_info: Dict[str, Any], input_data: Dict[str, Any] = None) -> Dict[str, Any]:
         """Transcribe a single audio chunk"""
         try:
             language = self._get_config('language', 'en-US', input_data)
             with sr.AudioFile(chunk_path) as source:
                 # Adjust for ambient noise
                 self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
                 audio_data = self.recognizer.record(source)
+            # Try Google Speech Recognition
             try:
                 text = self.recognizer.recognize_google(audio_data, language=language)
+                return {
                     'text': text,
+                    'confidence': 1.0,
                     'start_time': chunk_info['start_time'],
                     'end_time': chunk_info['end_time'],
                     'duration': chunk_info['duration'],
+                    'index': chunk_info['index'],
+                    'success': True
                 }
             except sr.UnknownValueError:
+                # Try without language specification
                 try:
                     text = self.recognizer.recognize_google(audio_data)
+                    return {
                         'text': text,
+                        'confidence': 0.8,
                         'start_time': chunk_info['start_time'],
                         'end_time': chunk_info['end_time'],
                         'duration': chunk_info['duration'],
+                        'index': chunk_info['index'],
+                        'success': True
                     }
                 except sr.UnknownValueError:
                     return {
                         'text': '[INAUDIBLE]',
                         'confidence': 0.0,
                         'start_time': chunk_info['start_time'],
                         'end_time': chunk_info['end_time'],
                         'duration': chunk_info['duration'],
                         'index': chunk_info['index'],
+                        'success': False
                     }
             except sr.RequestError as e:
                 return {
+                    'text': f'[RECOGNITION_ERROR: {str(e)}]',
                     'confidence': 0.0,
                     'start_time': chunk_info['start_time'],
                     'end_time': chunk_info['end_time'],
                     'duration': chunk_info['duration'],
                     'index': chunk_info['index'],
+                    'success': False,
+                    'error': str(e)
                 }
         except Exception as e:
             return {
+                'text': f'[ERROR: {str(e)}]',
                 'confidence': 0.0,
                 'start_time': chunk_info.get('start_time', 0),
                 'end_time': chunk_info.get('end_time', 0),
                 'duration': chunk_info.get('duration', 0),
                 'index': chunk_info.get('index', 0),
+                'success': False,
+                'error': str(e)
             }
     def _transcribe_chunks_parallel(self, chunk_data: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
         """Transcribe audio chunks in parallel"""
         results = []
+        # Use fewer workers to avoid API rate limits
+        max_workers = min(3, len(chunk_data))
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_chunk = {
+                executor.submit(self._transcribe_audio_chunk, chunk_info, input_data): chunk_info
+                for chunk_info in chunk_data
+            }
+            for future in as_completed(future_to_chunk):
+                chunk_info = future_to_chunk[future]
+                try:
+                    result = future.result()
+                    results.append(result)
+                    if result['success']:
+                        preview = result['text'][:50] + "..." if len(result['text']) > 50 else result['text']
+                        print(f"Transcribed chunk {result['index']}: {preview}")
+                    else:
+                        print(f"Failed to transcribe chunk {result['index']}: {result['text']}")
+                except Exception as e:
+                    print(f"Error processing chunk {chunk_info.get('index', '?')}: {str(e)}")
+                    results.append({
+                        'text': f'[PROCESSING_ERROR: {str(e)}]',
+                        'confidence': 0.0,
+                        'start_time': chunk_info.get('start_time', 0),
+                        'end_time': chunk_info.get('end_time', 0),
+                        'duration': chunk_info.get('duration', 0),
+                        'index': chunk_info.get('index', 0),
+                        'success': False,
+                        'error': str(e)
+                    })
         # Sort results by chunk index to maintain order
         results.sort(key=lambda x: x['index'])
         return results
     def extract_transcript(self, audio_path: str, video_hash: str, input_data: Dict[str, Any] = None) -> Dict[str, Any]:
         """Extract complete transcript from audio file"""
         cache_enabled = self._get_config('cache_enabled', True, input_data)
+        cache_path = self._get_cache_path(video_hash, "transcript.json")
         # Check cache
         cached_transcript = self._load_from_cache(cache_path, cache_enabled)
                 return {
                     'error': 'Failed to split audio into chunks',
                     'full_transcript': '',
                     'success_rate': 0.0
                 }
             print(f"Transcribing {len(chunk_data)} audio chunks...")
             transcript_results = self._transcribe_chunks_parallel(chunk_data, input_data)
+            # Step 3: Combine results
+            successful_results = [r for r in transcript_results if r['success']]
+            full_text = ' '.join([r['text'] for r in successful_results])
+            # Calculate statistics
+            total_chunks = len(transcript_results)
+            successful_chunks = len(successful_results)
+            success_rate = successful_chunks / total_chunks if total_chunks > 0 else 0
+            word_count = len(full_text.split()) if full_text else 0
+            final_result = {
+                'full_transcript': full_text,
+                'word_count': word_count,
+                'total_chunks': total_chunks,
+                'successful_chunks': successful_chunks,
+                'success_rate': success_rate,
+                'extraction_timestamp': time.time(),
+                'extraction_date': time.strftime('%Y-%m-%d %H:%M:%S'),
+                'detailed_results': transcript_results
+            }
             # Cache results
             self._save_to_cache(cache_path, final_result, cache_enabled)
+            print(f"Transcript extraction completed. Success rate: {success_rate:.1%}")
             return final_result
         except Exception as e:
             return {
                 'error': str(e),
                 'full_transcript': '',
                 'success_rate': 0.0
             }
             print(f"Downloading YouTube audio from {youtube_url}...")
             audio_path = self.download_youtube_audio(youtube_url, video_hash, input_data)
             if not audio_path or not os.path.exists(audio_path):
+                return "Error: Failed to download the YouTube audio. Please check the URL and try again."
             # Step 2: Extract transcript
             print("Extracting audio transcript...")
             if transcript_result.get("error"):
                 return f"Error: {transcript_result['error']}"
+            main_transcript = transcript_result.get('full_transcript', '')
+            if not main_transcript:
+                return "Error: No transcript could be extracted from the audio."
+            print(f"Transcript extracted successfully. Word count: {transcript_result.get('word_count', 0)}")
+            print(f"Success rate: {transcript_result.get('success_rate', 0):.1%}")
             return "TRANSCRIPT: " + main_transcript
         except Exception as e:
             return f"Error during transcript extraction: {str(e)}"
 # Factory function to create the tool
 def create_youtube_transcript_tool(**kwargs):
     """Factory function to create the transcript extraction tool with custom parameters"""