Commit
·
e7d7ac8
1
Parent(s):
86c6b45
....
Browse files
app.py
CHANGED
|
@@ -10,6 +10,7 @@ from typing import Optional, Dict, Any, List, Tuple, Set
|
|
| 10 |
from functools import lru_cache
|
| 11 |
import copy
|
| 12 |
import re
|
|
|
|
| 13 |
from abc import ABC, abstractmethod
|
| 14 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 15 |
from concurrent.futures import TimeoutError as FuturesTimeoutError
|
|
@@ -75,6 +76,18 @@ except ImportError:
|
|
| 75 |
pdfplumber = None
|
| 76 |
print("WARNING: pdfplumber library not found. PDF file processing will be unavailable. Install with: pip install pdfplumber")
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
logging.basicConfig(
|
| 79 |
level=logging.INFO,
|
| 80 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
@@ -96,10 +109,21 @@ MAX_FILE_SIZE = 5 * 1024 * 1024
|
|
| 96 |
CSV_SAMPLE_ROWS = 3
|
| 97 |
MAX_FILE_CONTEXT_LENGTH = 10000
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
asr_pipeline_instance: Optional[Any] = None
|
| 100 |
ASR_MODEL_NAME = "openai/whisper-tiny"
|
| 101 |
ASR_PROCESSING_TIMEOUT_SECONDS = 240
|
| 102 |
|
|
|
|
| 103 |
DEFAULT_RAG_CONFIG = {
|
| 104 |
'search': {
|
| 105 |
'tavily_quota': int(os.getenv("TAVILY_QUOTA", "1000")),
|
|
@@ -130,6 +154,48 @@ DEFAULT_RAG_CONFIG = {
|
|
| 130 |
'results': {'total_limit': 3, 'enrich_count': 2 }
|
| 131 |
}
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
class FileProcessor:
|
| 134 |
@staticmethod
|
| 135 |
def _get_asr_pipeline():
|
|
@@ -540,12 +606,12 @@ class CompositeSearchClient:
|
|
| 540 |
self._def_max_r = self._search_config.get("default_max_results", 3)
|
| 541 |
def _init_providers(self, config_dict: Dict) -> List[SearchProvider]:
|
| 542 |
providers: List[SearchProvider] = []
|
| 543 |
-
if TAVILY_API_KEY and TavilyClient:
|
| 544 |
-
tavily_prov = TavilyProvider(config_dict)
|
| 545 |
-
if tavily_prov.available(): providers.append(tavily_prov)
|
| 546 |
if GOOGLE_CUSTOM_SEARCH_API_KEY and GOOGLE_CUSTOM_SEARCH_CSE_ID:
|
| 547 |
google_prov = GoogleProvider(config_dict)
|
| 548 |
if google_prov.available(): providers.append(google_prov)
|
|
|
|
|
|
|
|
|
|
| 549 |
if DDGS:
|
| 550 |
ddgs_prov = DuckDuckGoProvider(config_dict)
|
| 551 |
if ddgs_prov.available(): providers.append(ddgs_prov)
|
|
|
|
| 10 |
from functools import lru_cache
|
| 11 |
import copy
|
| 12 |
import re
|
| 13 |
+
from PIL import Image
|
| 14 |
from abc import ABC, abstractmethod
|
| 15 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 16 |
from concurrent.futures import TimeoutError as FuturesTimeoutError
|
|
|
|
| 76 |
pdfplumber = None
|
| 77 |
print("WARNING: pdfplumber library not found. PDF file processing will be unavailable. Install with: pip install pdfplumber")
|
| 78 |
|
| 79 |
+
try:
|
| 80 |
+
import yt_dlp
|
| 81 |
+
except ImportError:
|
| 82 |
+
yt_dlp = None
|
| 83 |
+
print("WARNING: yt-dlp library not found. Video URL processing will be unavailable. Install with: pip install yt-dlp")
|
| 84 |
+
|
| 85 |
+
try:
|
| 86 |
+
import cv2
|
| 87 |
+
except ImportError:
|
| 88 |
+
cv2 = None
|
| 89 |
+
print("WARNING: opencv-python library not found. Video processing will be unavailable. Install with: pip install opencv-python")
|
| 90 |
+
|
| 91 |
logging.basicConfig(
|
| 92 |
level=logging.INFO,
|
| 93 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
|
|
| 109 |
CSV_SAMPLE_ROWS = 3
|
| 110 |
MAX_FILE_CONTEXT_LENGTH = 10000
|
| 111 |
|
| 112 |
+
# Global instances for video analysis pipelines
|
| 113 |
+
video_object_detector_pipeline: Optional[Any] = None
|
| 114 |
+
video_species_classifier_pipeline: Optional[Any] = None
|
| 115 |
+
VIDEO_ANALYSIS_DEVICE: int = -1 # Device index handed to the video pipelines: -1 = CPU, 0 = CUDA. Starts at -1; the video pipeline getters re-probe torch.cuda while it is still -1 and switch it to 0 when CUDA is available.
|
| 116 |
+
VIDEO_ANALYSIS_OBJECT_MODEL = "facebook/detr-resnet-50"
|
| 117 |
+
VIDEO_ANALYSIS_SPECIES_MODEL = "openai/clip-vit-base-patch32" # CLIP for zero-shot
|
| 118 |
+
VIDEO_MAX_FRAMES_TO_PROCESS = 120 # Max frames to analyze (e.g., 2 mins at 1fps)
|
| 119 |
+
VIDEO_CONFIDENCE_THRESHOLD_BIRD = 0.6 # Confidence for 'bird' detection
|
| 120 |
+
VIDEO_CONFIDENCE_THRESHOLD_SPECIES = 0.25 # Confidence for species classification via CLIP
|
| 121 |
+
|
| 122 |
asr_pipeline_instance: Optional[Any] = None
|
| 123 |
ASR_MODEL_NAME = "openai/whisper-tiny"
|
| 124 |
ASR_PROCESSING_TIMEOUT_SECONDS = 240
|
| 125 |
|
| 126 |
+
|
| 127 |
DEFAULT_RAG_CONFIG = {
|
| 128 |
'search': {
|
| 129 |
'tavily_quota': int(os.getenv("TAVILY_QUOTA", "1000")),
|
|
|
|
| 154 |
'results': {'total_limit': 3, 'enrich_count': 2 }
|
| 155 |
}
|
| 156 |
|
| 157 |
+
def _get_video_object_detector():
    """Return the shared object-detection pipeline, building it on first use.

    The pipeline is cached in the module-level ``video_object_detector_pipeline``
    so later calls reuse it. While ``VIDEO_ANALYSIS_DEVICE`` is still ``-1`` the
    function probes ``torch.cuda`` and records the chosen device (``0`` for
    CUDA, ``-1`` for CPU) before constructing the pipeline.

    Returns:
        The cached pipeline object, or ``None`` when ``hf_transformers_pipeline``
        or ``torch`` is falsy, or when initialization raises (the error is
        logged with its traceback).
    """
    global video_object_detector_pipeline, VIDEO_ANALYSIS_DEVICE

    # Fast path: a previous call already built the pipeline.
    if video_object_detector_pipeline is not None:
        return video_object_detector_pipeline

    # Without both optional dependencies there is nothing to build.
    if not (hf_transformers_pipeline and torch):
        return None

    try:
        # -1 doubles as "not yet decided", so probe CUDA only in that state.
        if VIDEO_ANALYSIS_DEVICE == -1:
            cuda_ready = torch.cuda.is_available()
            VIDEO_ANALYSIS_DEVICE = 0 if cuda_ready else -1
            if cuda_ready:
                gaia_logger.info("CUDA available for video analysis, will use GPU.")
            else:
                gaia_logger.info("CUDA not available for video analysis, will use CPU.")

        detector_kwargs = {
            "model": VIDEO_ANALYSIS_OBJECT_MODEL,
            "device": VIDEO_ANALYSIS_DEVICE,
        }
        video_object_detector_pipeline = hf_transformers_pipeline(
            "object-detection", **detector_kwargs
        )
        gaia_logger.info(f"Video Object Detection pipeline ('{VIDEO_ANALYSIS_OBJECT_MODEL}') initialized on {'cuda' if VIDEO_ANALYSIS_DEVICE==0 else 'cpu'}.")
    except Exception as e:
        # Best-effort: report the failure and let callers handle a missing detector.
        gaia_logger.error(f"Failed to initialize Video Object Detection pipeline: {e}", exc_info=True)
        return None

    return video_object_detector_pipeline
|
| 179 |
+
|
| 180 |
+
def _get_video_species_classifier():
    """Return the shared zero-shot image-classification pipeline, building it on first use.

    The pipeline is cached in the module-level
    ``video_species_classifier_pipeline``. While ``VIDEO_ANALYSIS_DEVICE`` is
    still ``-1`` (for example when the object detector has not been requested
    yet) the function probes ``torch.cuda`` and stores ``0`` for CUDA or ``-1``
    for CPU before constructing the pipeline.

    Returns:
        The cached pipeline object, or ``None`` when ``hf_transformers_pipeline``
        or ``torch`` is falsy, or when initialization raises (the error is
        logged with its traceback).
    """
    global video_species_classifier_pipeline, VIDEO_ANALYSIS_DEVICE

    # Fast path: a previous call already built the pipeline.
    if video_species_classifier_pipeline is not None:
        return video_species_classifier_pipeline

    # Without both optional dependencies there is nothing to build.
    if not (hf_transformers_pipeline and torch):
        return None

    try:
        # -1 doubles as "not yet decided", so probe CUDA only in that state.
        if VIDEO_ANALYSIS_DEVICE == -1:
            VIDEO_ANALYSIS_DEVICE = 0 if torch.cuda.is_available() else -1

        classifier_kwargs = {
            "model": VIDEO_ANALYSIS_SPECIES_MODEL,
            "device": VIDEO_ANALYSIS_DEVICE,
        }
        video_species_classifier_pipeline = hf_transformers_pipeline(
            "zero-shot-image-classification", **classifier_kwargs
        )
        gaia_logger.info(f"Video Species Classification pipeline ('{VIDEO_ANALYSIS_SPECIES_MODEL}') initialized on {'cuda' if VIDEO_ANALYSIS_DEVICE==0 else 'cpu'}.")
    except Exception as e:
        # Best-effort: report the failure and let callers handle a missing classifier.
        gaia_logger.error(f"Failed to initialize Video Species Classification pipeline: {e}", exc_info=True)
        return None

    return video_species_classifier_pipeline
|
| 198 |
+
|
| 199 |
class FileProcessor:
|
| 200 |
@staticmethod
|
| 201 |
def _get_asr_pipeline():
|
|
|
|
| 606 |
self._def_max_r = self._search_config.get("default_max_results", 3)
|
| 607 |
def _init_providers(self, config_dict: Dict) -> List[SearchProvider]:
|
| 608 |
providers: List[SearchProvider] = []
|
|
|
|
|
|
|
|
|
|
| 609 |
if GOOGLE_CUSTOM_SEARCH_API_KEY and GOOGLE_CUSTOM_SEARCH_CSE_ID:
|
| 610 |
google_prov = GoogleProvider(config_dict)
|
| 611 |
if google_prov.available(): providers.append(google_prov)
|
| 612 |
+
if TAVILY_API_KEY and TavilyClient:
|
| 613 |
+
tavily_prov = TavilyProvider(config_dict)
|
| 614 |
+
if tavily_prov.available(): providers.append(tavily_prov)
|
| 615 |
if DDGS:
|
| 616 |
ddgs_prov = DuckDuckGoProvider(config_dict)
|
| 617 |
if ddgs_prov.available(): providers.append(ddgs_prov)
|