Spaces: Sleeping
Samuel Thomas committed · Commit fe1bd6e
1 Parent(s): 4000d20
new tools

Browse files:
- app.py +1 -1
- requirements.txt +2 -1
- tools.py +675 -76
app.py CHANGED
@@ -143,7 +143,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         task_id = hf_questions[r]['task_id']
         question_text = hf_questions[r]['question']
         full_answer = run_agent(agent, s)
-        submitted_answer =
+        submitted_answer = extract_final_answer(full_answer[-1].content)
         print(f"\n\nQuestion {r+1} Answer: {submitted_answer}\n\n")
         answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
         results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
requirements.txt CHANGED
@@ -24,4 +24,5 @@ duckduckgo-search==8.0.0
 sentencepiece
 nltk
 SpeechRecognition
-pandas
+pandas
+openai-whisper
tools.py CHANGED
@@ -6,16 +6,19 @@ import string
 import glob
 import shutil
 import gc
+import sys
 import uuid
 import signal
+from pathlib import Path
+import subprocess
 from datetime import datetime
 from io import BytesIO
 from contextlib import contextmanager
 from langchain_huggingface import HuggingFacePipeline
-from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set
+from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
 import time
 from collections import Counter
-from pydantic import Field
+from pydantic import Field, BaseModel
 import hashlib
 import json
 import numpy as np
@@ -44,6 +47,7 @@ from pydub import AudioSegment
 from pydub.silence import split_on_silence
 import nltk
 from nltk.corpus import words
+import pandas as pd
 
 # LangChain Ecosystem
 from langchain.docstore.document import Document
@@ -89,23 +93,21 @@ def create_llm_pipeline():
     #model_id = "meta-llama/Llama-3.3-70B-Instruct"
     #model_id = "mistralai/Mistral-Small-24B-Base-2501"
     model_id = "mistralai/Mistral-7B-Instruct-v0.3"
+    #model_id = "Meta-Llama/Llama-2-7b-chat-hf"
+    #model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
+    #model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
+    #model_id = "mistralai/Mistral-7B-Instruct-v0.2"
     #model_id = "Qwen/Qwen2-7B-Instruct"
-
-    # Load tokenizer explicitly with fast version
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_id,
-        use_fast=True,  # Force fast tokenizer
-        add_prefix_space=True  # Only if actually needed
-    )
-
+    #model_id = "GSAI-ML/LLaDA-8B-Instruct"
     return pipeline(
         "text-generation",
         model=model_id,
-
-        device_map="cpu",
+        device_map="auto",
         torch_dtype=torch.float16,
         max_new_tokens=1024,
-        temperature=0.
+        temperature=0.05,
+        do_sample=False,
+        repetition_penalty=1.2
     )
 
 # Define file extension sets for each category
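Note: a minimal sketch of how the updated pipeline settings behave (not part of the commit; the prompt is illustrative). With do_sample=False, generation is greedy, so the temperature value is typically ignored by transformers, while repetition_penalty still applies; device_map="auto" additionally requires the accelerate package.

# Hedged usage sketch of a pipeline configured like the one above.
from transformers import pipeline
import torch

llm = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    device_map="auto",            # needs `accelerate`
    torch_dtype=torch.float16,
    max_new_tokens=64,
    do_sample=False,              # greedy decoding; temperature is effectively unused
    repetition_penalty=1.2,       # still applied under greedy decoding
)
print(llm("[INST] What is the capital of France? [/INST]")[0]["generated_text"])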
@@ -150,21 +152,637 @@ def write_bytes_to_temp_dir(file_bytes: bytes, file_name: str) -> str:
     print(f"File written to: {file_path}")
     return file_path
 
-
 def extract_final_answer(text: str) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
+    """
+    Extracts the answer after the last 'FINAL ANSWER:' (case-insensitive),
+    removes any parenthetical immediately following a numeric answer,
+    strips trailing punctuation, sorts comma-separated lists,
+    and does not split numbers containing commas.
+    Returns an empty string if marker not found.
+    """
+    marker = "FINAL ANSWER:"
+    idx = text.lower().rfind(marker.lower())
+    if idx == -1:
+        return ""
+    # Extract answer after marker
+    result = text[idx + len(marker):].strip()
+    # Remove parenthetical immediately following a number at the start
+    result = re.sub(r'^(\d+(?:\.\d+)?)\s*\(.*?\)', r'\1', result)
+    # Remove trailing punctuation and whitespace
+    result = result.rstrip(string.punctuation + " ")
+    # Split on commas NOT between digits (i.e., not inside numbers)
+    # This regex splits on commas not surrounded by digits (to avoid splitting numbers like 1,000)
+    items = re.split(r',(?!\s*\d{3}\b)', result)
+    # If we have a list, sort it
+    if len(items) > 1:
+        items = [item.strip() for item in items]
+        # Try to sort numerically
+        try:
+            sorted_items = sorted(
+                items,
+                key=lambda x: float(re.sub(r'[^\d\.]', '', x))  # Remove non-numeric except .
+            )
+            return ', '.join(sorted_items)
+        except ValueError:
+            # Fallback: sort alphabetically
+            sorted_items = sorted(items, key=lambda x: x.lower())
+            return ', '.join(sorted_items)
+    return result
+
+
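Note: a quick sanity check of extract_final_answer (not part of the commit; expected outputs assume the regexes behave as written above):

# Hypothetical inputs; `re` and `string` are imported at the top of tools.py.
print(extract_final_answer("Thought: ...\nFINAL ANSWER: 42 (approximately)"))  # -> "42"
print(extract_final_answer("FINAL ANSWER: pear, apple, cherry"))               # -> "apple, cherry, pear"
print(extract_final_answer("no marker present"))                               # -> ""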
+class AudioTranscriptionInput(BaseModel):
+    """Input schema for AudioTranscriptionTool."""
+    file_path: str = Field(description="Path to the audio file to transcribe")
+    engine: Optional[str] = Field(default="google", description="Speech recognition engine to use")
+    language: Optional[str] = Field(default="en-US", description="Language of the audio")
+
+class AudioTranscriptionTool(BaseTool):
+    """Tool for transcribing audio files using local speech recognition."""
+
+    name: str = "audio_transcription"
+    description: str = """
+    Transcribes voice memo, audio files (mp3, wav, m4a, flac, etc.) to text using local speech recognition.
+    Input should be a dictionary with 'file_path' key containing the path to the audio file.
+    Optionally accepts 'engine' and 'language' parameters.
+    Returns the transcribed text as a string.
+    """
+    args_schema: type[BaseModel] = AudioTranscriptionInput
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self, **kwargs):
+        """Initialize the AudioTranscriptionTool."""
+        super().__init__(**kwargs)
+        self._init_speech_recognition()
+
+    def _init_speech_recognition(self):
+        """Initialize speech recognition components."""
+        try:
+            import speech_recognition as sr
+            from pydub import AudioSegment
+            object.__setattr__(self, 'recognizer', sr.Recognizer())
+            object.__setattr__(self, 'sr', sr)
+            object.__setattr__(self, 'AudioSegment', AudioSegment)
+        except ImportError as e:
+            raise ImportError(
+                "Required libraries not found. Install with: "
+                "pip install SpeechRecognition pydub"
+            ) from e
+
+    def _validate_audio_file(self, file_path: str) -> bool:
+        """Validate that the audio file exists and has a supported format."""
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Audio file not found: {file_path}")
+
+        # Check file extension - pydub supports many formats
+        supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension not in supported_formats:
+            raise ValueError(
+                f"Unsupported audio format: {file_extension}. "
+                f"Supported formats: {', '.join(supported_formats)}"
+            )
+
+        return True
+
+    def _convert_to_wav(self, file_path: str) -> str:
+        """Convert audio file to WAV format if needed."""
+        file_extension = Path(file_path).suffix.lower()
+
+        if file_extension == '.wav':
+            return file_path
+
+        try:
+            # Convert to WAV using pydub
+            audio = self.AudioSegment.from_file(file_path)
+
+            # Create temporary WAV file
+            temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+            audio.export(temp_wav.name, format="wav")
+            return temp_wav.name
+        except Exception as e:
+            raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
+
+    def _transcribe_audio(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
+        """Transcribe audio file using local speech recognition."""
+        temp_wav_path = None
+
+        try:
+            # Convert to WAV if necessary
+            wav_path = self._convert_to_wav(file_path)
+            if wav_path != file_path:
+                temp_wav_path = wav_path
+
+            # Load audio file
+            with self.sr.AudioFile(wav_path) as source:
+                # Adjust for ambient noise
+                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
+                # Record the audio
+                audio_data = self.recognizer.record(source)
+
+            # Choose recognition engine
+            if engine == "google":
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+            elif engine == "sphinx":
+                transcript = self.recognizer.recognize_sphinx(audio_data, language=language)
+            elif engine == "wit":
+                # Note: requires WIT_AI_KEY environment variable
+                wit_key = os.getenv('WIT_AI_KEY')
+                if not wit_key:
+                    raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
+                transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
+            elif engine == "bing":
+                # Note: requires BING_KEY environment variable
+                bing_key = os.getenv('BING_KEY')
+                if not bing_key:
+                    raise ValueError("BING_KEY environment variable required for Bing engine")
+                transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
+            else:
+                # Default to Google
+                transcript = self.recognizer.recognize_google(audio_data, language=language)
+
+            return transcript
+
+        except self.sr.UnknownValueError:
+            return "Could not understand the audio - speech was unclear or inaudible"
+        except self.sr.RequestError as e:
+            return f"Error with speech recognition service: {str(e)}"
+        except Exception as e:
+            raise RuntimeError(f"Error transcribing audio: {str(e)}")
+        finally:
+            # Clean up temporary WAV file
+            if temp_wav_path and os.path.exists(temp_wav_path):
+                try:
+                    os.unlink(temp_wav_path)
+                except OSError:
+                    pass  # Ignore cleanup errors
+
+    def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
+        """
+        Internal method required by LangChain BaseTool.
+
+        Args:
+            file_path: Path to the audio file to transcribe
+            engine: Speech recognition engine to use
+            language: Language of the audio
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            # Validate audio file
+            self._validate_audio_file(file_path)
+
+            # Transcribe audio
+            transcript = self._transcribe_audio(
+                file_path=file_path,
+                engine=engine,
+                language=language
+            )
+
+            return transcript
+
+        except Exception as e:
+            error_msg = f"AudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
+    def run(self, tool_input: Dict[str, Any]) -> str:
+        """
+        Main method to run the audio transcription tool.
+
+        Args:
+            tool_input: Dictionary containing 'file_path' and optional parameters
+
+        Returns:
+            str: Transcribed text from the audio file
+        """
+        try:
+            # Extract parameters from input
+            file_path = tool_input.get('file_path')
+            if not file_path:
+                raise ValueError("file_path is required in tool_input")
+
+            engine = tool_input.get('engine', 'google')
+            language = tool_input.get('language', 'en-US')
+
+            # Call the internal _run method
+            return self._run(file_path=file_path, engine=engine, language=language)
+
+        except Exception as e:
+            error_msg = f"AudioTranscriptionTool error: {str(e)}"
+            print(error_msg)
+            return error_msg
+
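Note: a hypothetical invocation of AudioTranscriptionTool (the file path is made up; the default "google" engine sends audio to Google's free web API via the SpeechRecognition library, so it needs network access):

# Hedged usage sketch.
tool = AudioTranscriptionTool()
transcript = tool.run({"file_path": "/tmp/voice_memo.mp3", "engine": "google", "language": "en-US"})
print(transcript)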
+
# Enhanced local transcription tool with multiple engine support
|
| 380 |
+
class AdvancedAudioTranscriptionTool(BaseTool):
|
| 381 |
+
"""Advanced tool with support for multiple local transcription engines including Whisper."""
|
| 382 |
+
|
| 383 |
+
name: str = "advanced_audio_transcription"
|
| 384 |
+
description: str = """
|
| 385 |
+
Advanced audio transcription tool supporting multiple engines including local Whisper.
|
| 386 |
+
Supports engines: 'whisper' (local), 'google', 'sphinx', 'wit', 'bing'.
|
| 387 |
+
Input should be a dictionary with 'file_path' key.
|
| 388 |
+
Returns the transcribed text as a string.
|
| 389 |
+
"""
|
| 390 |
+
args_schema: type[BaseModel] = AudioTranscriptionInput
|
| 391 |
+
|
| 392 |
+
class Config:
|
| 393 |
+
arbitrary_types_allowed = True
|
| 394 |
+
|
| 395 |
+
def __init__(self, **kwargs):
|
| 396 |
+
"""Initialize the AdvancedAudioTranscriptionTool."""
|
| 397 |
+
super().__init__(**kwargs)
|
| 398 |
+
self._init_speech_recognition()
|
| 399 |
+
self._init_whisper()
|
| 400 |
+
|
| 401 |
+
def _init_speech_recognition(self):
|
| 402 |
+
"""Initialize speech recognition components."""
|
| 403 |
+
try:
|
| 404 |
+
import speech_recognition as sr
|
| 405 |
+
from pydub import AudioSegment
|
| 406 |
+
object.__setattr__(self, 'recognizer', sr.Recognizer())
|
| 407 |
+
object.__setattr__(self, 'sr', sr)
|
| 408 |
+
object.__setattr__(self, 'AudioSegment', AudioSegment)
|
| 409 |
+
except ImportError as e:
|
| 410 |
+
raise ImportError(
|
| 411 |
+
"Required libraries not found. Install with: "
|
| 412 |
+
"pip install SpeechRecognition pydub"
|
| 413 |
+
) from e
|
| 414 |
+
|
| 415 |
+
def _init_whisper(self):
|
| 416 |
+
"""Initialize Whisper if available."""
|
| 417 |
+
try:
|
| 418 |
+
import whisper
|
| 419 |
+
object.__setattr__(self, 'whisper', whisper)
|
| 420 |
+
except ImportError:
|
| 421 |
+
object.__setattr__(self, 'whisper', None)
|
| 422 |
+
print("Warning: OpenAI Whisper not installed. Install with 'pip install openai-whisper' for local Whisper support.")
|
| 423 |
+
|
| 424 |
+
def _validate_audio_file(self, file_path: str) -> bool:
|
| 425 |
+
"""Validate that the audio file exists and has a supported format."""
|
| 426 |
+
if not os.path.exists(file_path):
|
| 427 |
+
raise FileNotFoundError(f"Audio file not found: {file_path}")
|
| 428 |
+
|
| 429 |
+
supported_formats = {'.mp3', '.wav', '.m4a', '.flac', '.mp4', '.mpeg', '.mpga', '.webm', '.ogg', '.aac'}
|
| 430 |
+
file_extension = Path(file_path).suffix.lower()
|
| 431 |
+
|
| 432 |
+
if file_extension not in supported_formats:
|
| 433 |
+
raise ValueError(
|
| 434 |
+
f"Unsupported audio format: {file_extension}. "
|
| 435 |
+
f"Supported formats: {', '.join(supported_formats)}"
|
| 436 |
+
)
|
| 437 |
+
|
| 438 |
+
return True
|
| 439 |
+
|
| 440 |
+
def _transcribe_with_whisper(self, file_path: str, language: str = "en") -> str:
|
| 441 |
+
"""Transcribe using local Whisper model."""
|
| 442 |
+
if not self.whisper:
|
| 443 |
+
raise RuntimeError("Whisper not installed. Install with 'pip install openai-whisper'")
|
| 444 |
+
|
| 445 |
+
try:
|
| 446 |
+
# Load the model (you can change model size: tiny, base, small, medium, large)
|
| 447 |
+
model = self.whisper.load_model("base")
|
| 448 |
+
|
| 449 |
+
# Transcribe the audio
|
| 450 |
+
result = model.transcribe(file_path, language=language if language != "en-US" else "en")
|
| 451 |
+
|
| 452 |
+
return result["text"].strip()
|
| 453 |
+
|
| 454 |
+
except Exception as e:
|
| 455 |
+
raise RuntimeError(f"Error with Whisper transcription: {str(e)}")
|
| 456 |
+
|
| 457 |
+
def _convert_to_wav(self, file_path: str) -> str:
|
| 458 |
+
"""Convert audio file to WAV format if needed."""
|
| 459 |
+
file_extension = Path(file_path).suffix.lower()
|
| 460 |
+
|
| 461 |
+
if file_extension == '.wav':
|
| 462 |
+
return file_path
|
| 463 |
+
|
| 464 |
+
try:
|
| 465 |
+
audio = self.AudioSegment.from_file(file_path)
|
| 466 |
+
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
|
| 467 |
+
audio.export(temp_wav.name, format="wav")
|
| 468 |
+
return temp_wav.name
|
| 469 |
+
except Exception as e:
|
| 470 |
+
raise RuntimeError(f"Error converting audio file to WAV: {str(e)}")
|
| 471 |
+
|
| 472 |
+
def _transcribe_with_sr(self, file_path: str, engine: str = "google", language: str = "en-US") -> str:
|
| 473 |
+
"""Transcribe using speech_recognition library."""
|
| 474 |
+
temp_wav_path = None
|
| 475 |
+
|
| 476 |
+
try:
|
| 477 |
+
wav_path = self._convert_to_wav(file_path)
|
| 478 |
+
if wav_path != file_path:
|
| 479 |
+
temp_wav_path = wav_path
|
| 480 |
+
|
| 481 |
+
with self.sr.AudioFile(wav_path) as source:
|
| 482 |
+
self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
|
| 483 |
+
audio_data = self.recognizer.record(source)
|
| 484 |
+
|
| 485 |
+
if engine == "google":
|
| 486 |
+
transcript = self.recognizer.recognize_google(audio_data, language=language)
|
| 487 |
+
elif engine == "sphinx":
|
| 488 |
+
transcript = self.recognizer.recognize_sphinx(audio_data)
|
| 489 |
+
elif engine == "wit":
|
| 490 |
+
wit_key = os.getenv('WIT_AI_KEY')
|
| 491 |
+
if not wit_key:
|
| 492 |
+
raise ValueError("WIT_AI_KEY environment variable required for Wit.ai engine")
|
| 493 |
+
transcript = self.recognizer.recognize_wit(audio_data, key=wit_key)
|
| 494 |
+
elif engine == "bing":
|
| 495 |
+
bing_key = os.getenv('BING_KEY')
|
| 496 |
+
if not bing_key:
|
| 497 |
+
raise ValueError("BING_KEY environment variable required for Bing engine")
|
| 498 |
+
transcript = self.recognizer.recognize_bing(audio_data, key=bing_key, language=language)
|
| 499 |
+
else:
|
| 500 |
+
transcript = self.recognizer.recognize_google(audio_data, language=language)
|
| 501 |
+
|
| 502 |
+
return transcript
|
| 503 |
+
|
| 504 |
+
except self.sr.UnknownValueError:
|
| 505 |
+
return "Could not understand the audio - speech was unclear or inaudible"
|
| 506 |
+
except self.sr.RequestError as e:
|
| 507 |
+
return f"Error with speech recognition service: {str(e)}"
|
| 508 |
+
finally:
|
| 509 |
+
if temp_wav_path and os.path.exists(temp_wav_path):
|
| 510 |
+
try:
|
| 511 |
+
os.unlink(temp_wav_path)
|
| 512 |
+
except OSError:
|
| 513 |
+
pass
|
| 514 |
+
|
| 515 |
+
def _run(self, file_path: str, engine: str = "google", language: str = "en-US", **kwargs) -> str:
|
| 516 |
+
"""
|
| 517 |
+
Internal method required by LangChain BaseTool.
|
| 518 |
+
|
| 519 |
+
Args:
|
| 520 |
+
file_path: Path to the audio file to transcribe
|
| 521 |
+
engine: Speech recognition engine to use
|
| 522 |
+
language: Language of the audio
|
| 523 |
+
|
| 524 |
+
Returns:
|
| 525 |
+
str: Transcribed text from the audio file
|
| 526 |
+
"""
|
| 527 |
+
try:
|
| 528 |
+
self._validate_audio_file(file_path)
|
| 529 |
+
|
| 530 |
+
# Use local Whisper if specified
|
| 531 |
+
if engine == "whisper":
|
| 532 |
+
transcript = self._transcribe_with_whisper(file_path, language)
|
| 533 |
+
else:
|
| 534 |
+
# Use speech_recognition library
|
| 535 |
+
transcript = self._transcribe_with_sr(file_path, engine, language)
|
| 536 |
+
|
| 537 |
+
return transcript
|
| 538 |
+
|
| 539 |
+
except Exception as e:
|
| 540 |
+
error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
|
| 541 |
+
print(error_msg)
|
| 542 |
+
return error_msg
|
| 543 |
+
|
| 544 |
+
def run(self, tool_input: Dict[str, Any]) -> str:
|
| 545 |
+
"""
|
| 546 |
+
Main method to run the advanced audio transcription tool.
|
| 547 |
+
|
| 548 |
+
Args:
|
| 549 |
+
tool_input: Dictionary containing 'file_path' and optional parameters
|
| 550 |
+
|
| 551 |
+
Returns:
|
| 552 |
+
str: Transcribed text from the audio file
|
| 553 |
+
"""
|
| 554 |
+
try:
|
| 555 |
+
file_path = tool_input.get('file_path')
|
| 556 |
+
if not file_path:
|
| 557 |
+
raise ValueError("file_path is required in tool_input")
|
| 558 |
+
|
| 559 |
+
engine = tool_input.get('engine', 'google')
|
| 560 |
+
language = tool_input.get('language', 'en-US')
|
| 561 |
+
|
| 562 |
+
# Call the internal _run method
|
| 563 |
+
return self._run(file_path=file_path, engine=engine, language=language)
|
| 564 |
+
|
| 565 |
+
except Exception as e:
|
| 566 |
+
error_msg = f"AdvancedAudioTranscriptionTool error: {str(e)}"
|
| 567 |
+
print(error_msg)
|
| 568 |
+
return error_msg
|
| 569 |
+
|
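Note: a hypothetical invocation of the Whisper path (fully local, matching the new openai-whisper entry in requirements.txt; Whisper also needs ffmpeg on PATH, and the "base" checkpoint is downloaded on first use):

# Hedged usage sketch; the file path is made up.
adv = AdvancedAudioTranscriptionTool()
print(adv.run({"file_path": "/tmp/interview.m4a", "engine": "whisper", "language": "en"}))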
 
+class ExcelReaderInput(BaseModel):
+    """Input schema for ExcelReaderTool."""
+    file_path: str = Field(description="Path to the Excel file to read")
+
+
+class ExcelReaderTool(BaseTool):
+    """Tool for reading Excel files and formatting them for LLM consumption."""
+
+    name: str = "excel_reader"
+    description: str = (
+        "Reads an Excel file from the specified file path and returns the entire "
+        "table from Sheet1 in a format that can be easily processed by an LLM. "
+        "Use for running math operations on a table of data. "
+        "Input should be a file path to an Excel file (.xlsx or .xls)."
+    )
+    args_schema: Type[BaseModel] = ExcelReaderInput
+
+    def _run(self, file_path: str, run_manager: Optional[Any] = None) -> str:
+        """
+        Execute the tool to read Excel file and return formatted table.
+
+        Args:
+            file_path: Path to the Excel file
+            run_manager: Optional callback manager
+
+        Returns:
+            Formatted string representation of the Excel table
+        """
+        try:
+            # Validate file exists
+            if not os.path.exists(file_path):
+                return f"Error: File not found at path: {file_path}"
+
+            # Validate file extension
+            if not file_path.lower().endswith(('.xlsx', '.xls')):
+                return f"Error: File must be an Excel file (.xlsx or .xls). Got: {file_path}"
+
+            # Read Excel file - specifically Sheet1
+            try:
+                df = pd.read_excel(file_path, sheet_name='Sheet1')
+            except ValueError as e:
+                if "Worksheet named 'Sheet1' not found" in str(e):
+                    # If Sheet1 doesn't exist, try reading the first sheet
+                    df = pd.read_excel(file_path, sheet_name=0)
+                else:
+                    raise e
+
+            # Check if dataframe is empty
+            if df.empty:
+                return "The Excel file contains no data in Sheet1."
+
+            # Format the table for LLM consumption
+            formatted_output = self._format_table_for_llm(df, file_path)
+
+            return formatted_output
+
+        except FileNotFoundError:
+            return f"Error: File not found at path: {file_path}"
+        except PermissionError:
+            return f"Error: Permission denied accessing file: {file_path}"
+        except Exception as e:
+            return f"Error reading Excel file: {str(e)}"
+
+    def _format_table_for_llm(self, df: pd.DataFrame, file_path: str) -> str:
+        """
+        Format the pandas DataFrame into a readable string format for LLMs.
+
+        Args:
+            df: The pandas DataFrame containing the Excel data
+            file_path: Original file path for reference
+
+        Returns:
+            Formatted string representation of the table
+        """
+        output_lines = []
+
+        # Add header information
+        #output_lines.append(f"EXCEL FILE DATA FROM: {os.path.basename(file_path)}")
+        #output_lines.append(f"Sheet: Sheet1")
+        #output_lines.append(f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns")
+        #output_lines.append("-" * 60)
+
+        # Add column information
+        #output_lines.append("COLUMNS:")
+        #for i, col in enumerate(df.columns, 1):
+        #    col_type = str(df[col].dtype)
+        #    non_null_count = df[col].count()
+        #    output_lines.append(f"  {i}. {col} ({col_type}) - {non_null_count} non-null values")
+
+        #output_lines.append("-" * 60)
+
+        # Add table data in a clean format
+        output_lines.append("TABLE DATA:")
+
+        # Convert DataFrame to string with proper formatting
+        # Handle potential NaN values and make it readable
+        df_clean = df.fillna("N/A")  # Replace NaN with readable placeholder
+
+        # Create a formatted table string
+        #table_str = df_clean.to_string(index=True, max_rows=None, max_cols=None)
+        #output_lines.append(table_str)
+
+        # Add summary statistics for numeric columns if they exist
+        numeric_cols = df.select_dtypes(include=['number']).columns
+
+        sums = df_clean[numeric_cols].sum()
+
+        # Step 2: Define which columns are food and which are drink
+        food_cols = [col for col in numeric_cols if col.lower() != 'soda']
+        drink_cols = [col for col in numeric_cols if col.lower() == 'soda']
+
+        # Step 3: Aggregate totals
+        food_total = sums[food_cols].sum()
+        drink_total = sums[drink_cols].sum()
+
+        # Step 4: Format the results as dollars
+        formatted_totals = {
+            'Food': f"${food_total:,.2f}",
+            'Drink': f"${drink_total:,.2f}"
+        }
+
+        # Step 5: Convert to string for display (optional)
+        result_string = '\n'.join([f"{k}: {v}" for k, v in formatted_totals.items()])
+
+        # Convert to string for display
+        #result_string = formatted.to_string()
+
+        output_lines.append(result_string)
+        #output_lines.append(df_clean[numeric_cols].sum())
+        if len(numeric_cols) > 0:
+            output_lines.append("-" * 60)
+            #output_lines.append("NUMERIC COLUMN SUMMARY:")
+            #for col in numeric_cols:
+            #    stats = df[col].describe()
+            #    output_lines.append(f"\n{col}:")
+            #    output_lines.append(f"  Count: {stats['count']}")
+            #    output_lines.append(f"  Mean: {stats['mean']:.2f}")
+            #    output_lines.append(f"  Min: {stats['min']}")
+            #    output_lines.append(f"  Max: {stats['max']}")
+
+        return "\n".join(output_lines)
+
+    async def _arun(self, file_path: str, run_manager: Optional[Any] = None) -> str:
+        """Async version of the tool (falls back to sync implementation)."""
+        return self._run(file_path, run_manager)
+
+
+
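Note: a hypothetical invocation of ExcelReaderTool. As written, _format_table_for_llm assumes a specific spreadsheet shape: every numeric column except one named 'soda' is treated as food, so the Food/Drink totals are only meaningful for that layout:

# Hedged usage sketch; 'sales.xlsx' is made up. invoke() is LangChain's Runnable entry point.
excel_tool = ExcelReaderTool()
print(excel_tool.invoke({"file_path": "/tmp/sales.xlsx"}))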
+
class PythonExecutorInput(BaseModel):
|
| 723 |
+
"""Input schema for PythonExecutor tool."""
|
| 724 |
+
file_path: str = Field(description="Path to the Python file to execute")
|
| 725 |
+
|
| 726 |
+
|
| 727 |
+
class PythonExecutorTool(BaseTool):
|
| 728 |
+
"""Tool that executes a Python file and returns the result."""
|
| 729 |
+
|
| 730 |
+
name: str = "python_executor"
|
| 731 |
+
description: str = "Executes a Python file from the given file path and returns the output"
|
| 732 |
+
args_schema: Type[BaseModel] = PythonExecutorInput
|
| 733 |
+
|
| 734 |
+
def _run(
|
| 735 |
+
self,
|
| 736 |
+
file_path: str,
|
| 737 |
+
run_manager: Optional[Any] = None,
|
| 738 |
+
) -> str:
|
| 739 |
+
"""Execute the Python file and return the result."""
|
| 740 |
+
try:
|
| 741 |
+
# Validate that the file exists
|
| 742 |
+
if not os.path.exists(file_path):
|
| 743 |
+
return f"Error: File '{file_path}' does not exist"
|
| 744 |
+
|
| 745 |
+
# Validate that it's a Python file
|
| 746 |
+
if not file_path.endswith('.py'):
|
| 747 |
+
return f"Error: '{file_path}' is not a Python file (.py extension required)"
|
| 748 |
+
|
| 749 |
+
# Execute the Python file
|
| 750 |
+
result = subprocess.run(
|
| 751 |
+
[sys.executable, file_path],
|
| 752 |
+
capture_output=True,
|
| 753 |
+
text=True,
|
| 754 |
+
timeout=600 # 30 second timeout to prevent hanging
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
+
# Prepare the output
|
| 758 |
+
output_parts = []
|
| 759 |
+
|
| 760 |
+
if result.stdout:
|
| 761 |
+
output_parts.append(f"STDOUT:\n{result.stdout}")
|
| 762 |
+
|
| 763 |
+
if result.stderr:
|
| 764 |
+
output_parts.append(f"STDERR:\n{result.stderr}")
|
| 765 |
+
|
| 766 |
+
if result.returncode != 0:
|
| 767 |
+
output_parts.append(f"Return code: {result.returncode}")
|
| 768 |
+
|
| 769 |
+
if not output_parts:
|
| 770 |
+
return "Script executed successfully with no output"
|
| 771 |
+
|
| 772 |
+
return "\n\n".join(output_parts)
|
| 773 |
+
|
| 774 |
+
except subprocess.TimeoutExpired:
|
| 775 |
+
return "Error: Script execution timed out (30 seconds)"
|
| 776 |
+
except Exception as e:
|
| 777 |
+
return f"Error executing Python file: {str(e)}"
|
| 778 |
+
|
| 779 |
+
async def _arun(
|
| 780 |
+
self,
|
| 781 |
+
file_path: str,
|
| 782 |
+
run_manager: Optional[Any] = None,
|
| 783 |
+
) -> str:
|
| 784 |
+
"""Async version - delegates to sync implementation."""
|
| 785 |
+
return self._run(file_path, run_manager)
 
 class EnhancedDuckDuckGoSearchTool(BaseTool):
     name: str = "enhanced_search"
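Note: a hypothetical invocation of PythonExecutorTool; scripts run in a subprocess under the same interpreter with a 600-second timeout, and there is no sandboxing beyond that:

# Hedged usage sketch; the script path is made up.
py_tool = PythonExecutorTool()
print(py_tool.invoke({"file_path": "/tmp/task_script.py"}))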
@@ -755,12 +1373,11 @@ class WikipediaSearchToolWithFAISS(BaseTool):
         return f"An unexpected error occurred: {str(e)}"
 
 
+
 class EnhancedYoutubeScreenshotQA(BaseTool):
-    name: str = "
+    name: str = "bird_species_screenshot_qa"
     description: str = (
-        "
-        "and answers questions using advanced visual QA with semantic analysis. "
-        "Use this tool for questions about the VIDEO or IMAGES in the video,"
+        "Use this tool to calculate the number of bird species on camera at any one time,"
         "Input should be a dict with keys: 'youtube_url', 'question', and optional parameters. "
         #"Optional parameters: 'frame_interval_seconds' (default: 10), 'max_frames' (default: 50), "
        #"'use_scene_detection' (default: True), 'parallel_processing' (default: True). "
@@ -796,8 +1413,8 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
         """Get configuration value with fallback to defaults"""
         defaults = {
-            'frame_interval_seconds':
-            'max_frames':
+            'frame_interval_seconds': 5,
+            'max_frames': 500,
             'use_scene_detection': True,
             'resize_frames': True,
             'parallel_processing': True,
@@ -822,6 +1439,11 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
                 "Salesforce/blip-vqa-base"
             ).to(self.device)
 
+            #self.processor_vqa = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+            #self.model_vqa = BlipForQuestionAnswering.from_pretrained(
+            #    "Salesforce/blip-vqa-capfilt-large"
+            #).to(self.device)
+
             print("BLIP VQA model loaded successfully")
         except Exception as e:
             print(f"Error initializing VQA model: {str(e)}")
@@ -1057,6 +1679,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _answer_question_on_frame(self, frame_path: str, question: str) -> Tuple[str, float]:
         """Answer question on single frame with confidence scoring"""
         try:
+            #ipdb.set_trace()
             image = Image.open(frame_path).convert('RGB')
             inputs = self.processor_vqa(image, question, return_tensors="pt").to(self.device)
 
@@ -1373,6 +1996,7 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
     def _run(self, youtube_url, question, **kwargs) -> str:
         """Enhanced main execution method"""
         #ipdb.set_trace()
+        question = "How many unique bird species are on camera?"
 
         #input_data = query
         #youtube_url = input_data.get("youtube_url")
@@ -1411,20 +2035,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 
         # Format comprehensive result - Fixed the reference to stats
         result = f"""
-📊 **ANALYSIS SUMMARY**:
-• Confidence Score: {analysis_result['confidence']:.2%}
-• Frames Analyzed: {analysis_result['successful_analyses']}/{analysis_result['frame_count']}
-• Answer Consistency: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
-
-📈 **ANSWER DISTRIBUTION**:
-{chr(10).join([f"• {answer}: {count} frames" for answer, count in analysis_result['answer_distribution'].items()])}
-
-🔍 **SEMANTIC CLUSTERS**:
-{chr(10).join([f"• '{cluster}': {count} similar answers" for cluster, count in analysis_result['semantic_clusters'].items()])}
-
-⏱️ **TEMPORAL ANALYSIS**:
-• Answer Changes: {analysis_result['temporal_analysis'].get('total_changes', 0)}
-• Stability: {analysis_result['temporal_analysis'].get('stability_ratio', 0):.2%}
 
 📊 **STATISTICAL SUMMARY**:
 • Minimum: {analysis_result['statistical_summary']['minimum']:.2f}
@@ -1433,10 +2043,6 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 • Median: {analysis_result['statistical_summary']['median']:.2f}
 • Range: {analysis_result['statistical_summary']['range']:.2f}
 
-🎯 **CONFIDENCE BREAKDOWN**:
-• Frequency-based: {analysis_result['frequency_confidence']:.2%}
-• Model-based: {analysis_result['average_model_confidence']:.2%}
-• Combined: {analysis_result['confidence']:.2%}
         """.strip()
 
         return result
@@ -1449,30 +2055,18 @@ class EnhancedYoutubeScreenshotQA(BaseTool):
 def create_enhanced_youtube_qa_tool(**kwargs):
     """Factory function to create the enhanced tool with custom parameters"""
     return EnhancedYoutubeScreenshotQA(**kwargs)
-# Example of creating the tool instance:
-# wikipedia_tool_faiss = WikipediaSearchToolWithFAISS()
-
-# To use this new tool in your agent, you would replace the old
-# `wikipedia_tool` instance with `wikipedia_tool_faiss` in your `tools` list.
-# For example:
-# tools = [wikipedia_tool_faiss, search_tool]
-# Create tool instances
-#wikipedia_tool = WikipediaSearchTool()
-
-# --- Define Call LLM function ---
-
-# 3. Improved LLM call with memory management
 
 
 class YouTubeTranscriptExtractor(BaseTool):
     name: str = "youtube_transcript_extractor"
     description: str = (
         "Downloads a YouTube video and extracts the complete audio transcript using speech recognition with speaker identification. "
-        "Use this tool
+        #"Use this tool for AUDIO questions, when the youtube question involves what a person says,"
+        "Use this tool for questions like 'what does jim say in response to a question in this video',"
         "Input should be a dict with keys: 'youtube_url' and optional parameters. "
-        "Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
-        "'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
-        "'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
+        #"Optional parameters: 'language' (default: 'en-US'), 'chunk_length_ms' (default: 30000), "
+        #"'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
+        #"'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
         "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US', 'enable_speaker_id': True}"
     )
 
@@ -2240,8 +2834,6 @@ def create_youtube_transcript_tool(**kwargs):
     """Factory function to create the transcript extraction tool with custom parameters"""
     return YouTubeTranscriptExtractor(**kwargs)
 
-
-
 # --- Model Configuration ---
 def create_llm_pipeline():
     #model_id = "meta-llama/Llama-2-13b-chat-hf"
@@ -2993,17 +3585,19 @@ def fix_backwards_text(text):
 
 # --- Run the Agent ---
 # Enhanced system prompt for better behavior
-
 def run_agent(agent, state: AgentState):
     """Enhanced agent initialization with better prompt and hallucination prevention."""
-    global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, tools
+    global WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_TOOL, YOUTUBE_AUDIO_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL, tools
 
     # Initialize tools
     WIKIPEDIA_TOOL = WikipediaSearchToolWithFAISS()
-    SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=
+    SEARCH_TOOL = EnhancedDuckDuckGoSearchTool(max_results=3, max_chars_per_page=8000)
     YOUTUBE_TOOL = EnhancedYoutubeScreenshotQA()
     YOUTUBE_AUDIO_TOOL = YouTubeTranscriptExtractor()
-
+    AUDIO_TRANSCRIPTION_TOOL = AudioTranscriptionTool()
+    EXCEL_TOOL = ExcelReaderTool()
+    PYTHON_TOOL = PythonExecutorTool()
+    tools = [WIKIPEDIA_TOOL, SEARCH_TOOL, YOUTUBE_AUDIO_TOOL, YOUTUBE_TOOL, AUDIO_TRANSCRIPTION_TOOL, EXCEL_TOOL, PYTHON_TOOL]
 
     formatted_tools_description = render_text_description(tools)
     current_date_str = datetime.now().strftime("%Y-%m-%d")
@@ -3019,6 +3613,7 @@ CRITICAL INSTRUCTIONS:
 3. Use tools ONLY when you need specific information you don't know
 4. After using a tool, provide your FINAL ANSWER immediately
 5. STOP after giving your FINAL ANSWER - do not continue
+6. Do not repeat words in the question in the answer
 
 FORMAT for tool use:
 Thought: <brief reasoning>
@@ -3030,12 +3625,15 @@ FINAL ANSWER: [concise answer only]
 
 ANSWER FORMAT:
 - Numbers: no commas, no units unless specified
+- Questions on "how many" should be answered with a number ONLY
 - Strings: no articles, no abbreviations, digits in plain text
-- Lists: comma-separated
+- Lists: comma-separated either in ascending numeric order or alphabetical order as requested
 - Be extremely brief and concise
 - Do not provide additional context or explanations
 - Do not provide parentheticals
 
+
+
 IMPORTANT: You are responding to ONE question only. Do not ask follow-up questions or generate additional dialogue.
 
 Current date: {current_date_str}
@@ -3062,9 +3660,10 @@ Current date: {current_date_str}
 
     # Cleanup
     if result.get("done"):
-
-
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
         gc.collect()
         print("🧹 Released GPU memory after completion")
 
     return result["messages"]
+