Final_Assignment_Template

Sleeping

App Files Files Community

ChillThrills committed on May 13, 2025

Commit

64b2383

1 Parent(s): d2da2aa

refactor code structure for enhanced maintainability and clarity

Browse files

Files changed (1) hide show

app.py +323 -455

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ import copy
 import re
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from collections import defaultdict
 try:
@@ -63,7 +64,7 @@ except ImportError:
     print("WARNING: librosa library not found. Audio processing may be impaired. Install with: pip install librosa")
 try:
-    import openpyxl # Engine for pandas to read .xlsx
 except ImportError:
     openpyxl = None
     print("WARNING: openpyxl library not found. .xlsx file processing might fail. Install with: pip install openpyxl")
@@ -73,7 +74,6 @@ try:
 except ImportError:
     pdfplumber = None
     print("WARNING: pdfplumber library not found. PDF file processing will be unavailable. Install with: pip install pdfplumber")
-# --- End of New Imports ---
 logging.basicConfig(
     level=logging.INFO,
@@ -90,15 +90,15 @@ GOOGLE_GEMINI_API_KEY = os.getenv("GOOGLE_GEMINI_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 AGENT_DEFAULT_TIMEOUT = 15
-MAX_CONTEXT_LENGTH_LLM = 30000
-MAX_FILE_SIZE = 5 * 1024 * 1024 # 5MB
 CSV_SAMPLE_ROWS = 3
-MAX_FILE_CONTEXT_LENGTH = 10000 # Max characters for file context summary
-# Global variable for ASR pipeline (initialized on first use)
 asr_pipeline_instance: Optional[Any] = None
-ASR_MODEL_NAME = "openai/whisper-tiny" # Smaller model for resource efficiency
 DEFAULT_RAG_CONFIG = {
     'search': {
@@ -108,7 +108,8 @@ DEFAULT_RAG_CONFIG = {
         'google_cse_id': GOOGLE_CUSTOM_SEARCH_CSE_ID,
         'tavily_api_key': TAVILY_API_KEY,
         'default_max_results': 3, 'retry_attempts': 2, 'retry_delay': 2,
-        'google_timeout': 8, 'tavily_depth': "basic"
     },
     'processing': {
         'trusted_sources': {'wikipedia.org': 0.8, 'reuters.com': 0.75, 'apnews.com': 0.75},
@@ -135,9 +136,7 @@ class FileProcessor:
         global asr_pipeline_instance
         if asr_pipeline_instance is None and hf_transformers_pipeline and torch:
             try:
-                # device = 0 if torch.cuda.is_available() else -1 # For GPU if available
-                # Simpler for HF Spaces CPU instances:
-                device = -1 # CPU
                 asr_pipeline_instance = hf_transformers_pipeline(
                     "automatic-speech-recognition",
                     model=ASR_MODEL_NAME,
@@ -167,34 +166,26 @@ class FileProcessor:
         try:
             if len(content) > MAX_FILE_SIZE:
-                gaia_logger.warning(f"File '{filename_str}' exceeds max size {MAX_FILE_SIZE} bytes.")
                 return f"Error: File '{filename_str}' exceeds maximum allowed size ({MAX_FILE_SIZE // (1024*1024)}MB)."
             if 'csv' in content_type_str or filename_str.endswith('.csv'):
-                gaia_logger.info(f"Processing CSV file: {filename_str}")
                 return FileProcessor._process_csv(content, filename_str)
             elif 'json' in content_type_str or filename_str.endswith('.json'):
-                gaia_logger.info(f"Processing JSON file: {filename_str}")
                 return FileProcessor._process_json(content, filename_str)
             elif ('excel' in content_type_str or 'spreadsheetml' in content_type_str or \
-                  filename_str.endswith(('.xlsx', '.xls'))) and openpyxl: # Check for openpyxl
-                gaia_logger.info(f"Processing Excel file: {filename_str}")
                 return FileProcessor._process_excel(content, filename_str)
-            elif ('pdf' in content_type_str or filename_str.endswith('.pdf')) and pdfplumber: # Check for pdfplumber
-                gaia_logger.info(f"Processing PDF file: {filename_str}")
                 return FileProcessor._process_pdf(content, filename_str)
             elif ('audio' in content_type_str or \
                   filename_str.endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a'))) and \
-                  hf_transformers_pipeline and librosa: # Check for ASR libs
-                gaia_logger.info(f"Processing Audio file: {filename_str}")
                 return FileProcessor._process_audio(content, filename_str)
             elif 'text/plain' in content_type_str or \
                  ('text/' in content_type_str and not any(sub in content_type_str for sub in ['html', 'xml'])) or \
                  filename_str.endswith(('.txt', '.md', '.py', '.js', '.c', '.cpp', '.java', '.html', '.xml', '.log')):
-                gaia_logger.info(f"Processing Text-like file: {filename_str} (Content-Type: {content_type_str})")
                 return FileProcessor._process_text(content, filename_str)
             else:
-                gaia_logger.info(f"Handling unknown/binary file type: {filename_str} (Content-Type: {content_type_str})")
                 return FileProcessor._handle_unknown_type(content, filename_str)
         except Exception as e:
             gaia_logger.error(f"File processing error for '{filename_str}': {str(e)}", exc_info=True)
@@ -210,9 +201,10 @@ class FileProcessor:
     @staticmethod
     def _process_csv(content: bytes, filename: str) -> str:
         try:
             encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
-            df = None
             for enc in encodings_to_try:
                 try:
                     df = pd.read_csv(io.BytesIO(content), encoding=enc)
@@ -221,19 +213,30 @@ class FileProcessor:
                 except Exception: continue
             if df is None: return f"Error: Could not decode CSV '{filename}'."
-            num_rows, num_cols = len(df), len(df.columns)
-            cols_str = ', '.join(df.columns)
-            sample_str = df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)
             summary = (
-                f"CSV Document Summary: '{filename}' ({num_rows} rows, {num_cols} columns):\n"
-                f"Columns: {cols_str}\nFirst {min(CSV_SAMPLE_ROWS, num_rows)} sample rows:\n{sample_str}"
             )
             return FileProcessor._truncate_text(summary, filename, "CSV")
-        except Exception as e:
             return f"Error processing CSV '{filename}': {str(e)}"
     @staticmethod
     def _process_json(content: bytes, filename: str) -> str:
         try:
             decoded_content = content.decode('utf-8', errors='replace')
             data = json.loads(decoded_content)
@@ -248,6 +251,7 @@ class FileProcessor:
     @staticmethod
     def _process_text(content: bytes, filename: str) -> str:
         try:
             text = None
             encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
@@ -265,33 +269,59 @@ class FileProcessor:
     @staticmethod
     def _process_excel(content: bytes, filename: str) -> str:
         if not openpyxl: return f"Error: Excel processing skipped for '{filename}', openpyxl library not available."
         try:
-            # Reading all sheets and summarizing; can be adjusted for first sheet or specific sheets
             xls = pd.ExcelFile(io.BytesIO(content), engine='openpyxl')
             summary_parts = [f"Excel Document Summary: '{filename}'"]
             for sheet_name in xls.sheet_names:
                 df = xls.parse(sheet_name)
-                num_rows, num_cols = len(df), len(df.columns)
-                cols_str = ', '.join(df.columns)
-                sample_str = df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)
                 sheet_summary = (
-                    f"\n---\nSheet: '{sheet_name}' ({num_rows} rows, {num_cols} columns):\n"
-                    f"Columns: {cols_str}\nFirst {min(CSV_SAMPLE_ROWS, num_rows)} sample rows:\n{sample_str}"
                 )
                 summary_parts.append(sheet_summary)
-                # Check length to avoid overly long summaries from many sheets
-                if sum(len(p) for p in summary_parts) > MAX_FILE_CONTEXT_LENGTH * 0.8: # Soft limit before final truncate
                     summary_parts.append("\n... (further sheets omitted due to length)")
                     break
             full_summary = "".join(summary_parts)
             return FileProcessor._truncate_text(full_summary, filename, "Excel")
-        except Exception as e:
-            gaia_logger.error(f"Excel processing error for '{filename}': {str(e)}", exc_info=True)
             return f"Error processing Excel file '{filename}': {str(e)}"
     @staticmethod
     def _process_pdf(content: bytes, filename: str) -> str:
         if not pdfplumber: return f"Error: PDF processing skipped for '{filename}', pdfplumber library not available."
         text_content = ""
         try:
@@ -301,46 +331,67 @@ class FileProcessor:
                         page_text = page.extract_text()
                         if page_text:
                             text_content += page_text + "\n"
-                        if len(text_content) > MAX_FILE_CONTEXT_LENGTH * 1.2: # Allow slight overage before hard truncate
-                            gaia_logger.info(f"PDF '{filename}' text extraction stopped early due to length at page {i+1}.")
                             break
             if not text_content:
                 return f"PDF Document: '{filename}'. No text could be extracted or PDF is empty."
             summary = f"PDF Document: '{filename}':\n{text_content}"
             return FileProcessor._truncate_text(summary, filename, "PDF")
         except Exception as e:
-            gaia_logger.error(f"PDF processing error for '{filename}': {str(e)}", exc_info=True)
             return f"Error processing PDF file '{filename}': {str(e)}"
     @staticmethod
     def _process_audio(content: bytes, filename: str) -> str:
-        asr_pipeline = FileProcessor._get_asr_pipeline()
-        if not asr_pipeline:
             return f"Error: Audio processing skipped for '{filename}', ASR pipeline not available."
         if not librosa:
             return f"Error: Audio processing skipped for '{filename}', librosa library not available."
         try:
             with io.BytesIO(content) as audio_buffer:
                 y, sr = librosa.load(audio_buffer, sr=16000, mono=True)
-            gaia_logger.info(f"Transcribing audio file: {filename} ({len(y)/sr:.2f} seconds)")
             start_time = time.time()
-            # Added generate_kwargs to hint language and task - adjust 'en' if other languages are primary
-            transcription_result = asr_pipeline(y, generate_kwargs={"task": "transcribe", "language": "en"})
-            end_time = time.time()
-            gaia_logger.info(f"Audio transcription for '{filename}' took {end_time - start_time:.2f} seconds.")
-            transcribed_text = transcription_result.get("text", "") if isinstance(transcription_result, dict) else str(transcription_result)
             if not transcribed_text.strip():
-                return f"Audio Document: '{filename}'. Transcription result was empty."
             summary = f"Audio Document (Transcription): '{filename}':\n{transcribed_text}"
             return FileProcessor._truncate_text(summary, filename, "Audio Transcription")
         except Exception as e:
             gaia_logger.error(f"Audio processing/transcription error for '{filename}': {str(e)}", exc_info=True)
             return f"Error processing Audio file '{filename}': {str(e)}"
     @staticmethod
     def _handle_unknown_type(content: bytes, filename: str) -> str:
         gaia_logger.warning(f"Attempting to handle unknown file type for '{filename}' as text snippet.")
@@ -351,7 +402,7 @@ class FileProcessor:
         except Exception:
             return f"File with Unknown Content Type: '{filename}'. Content is likely binary and cannot be displayed as text."
-class CacheManager:
     def __init__(self, ttl: int = 300, max_size: int = 100, name: str = "Cache"):
         self.ttl = ttl; self.max_size = max_size
         self._cache: Dict[Any, Any] = {}; self._timestamps: Dict[Any, float] = {}
@@ -361,13 +412,10 @@ class CacheManager:
         if key in self._cache and (time.time() - self._timestamps.get(key, 0) < self.ttl):
             try:
                 self._access_order.remove(key); self._access_order.append(key)
-                gaia_logger.debug(f"[{self.name}] Cache hit: {str(key)[:100]}...")
                 return copy.deepcopy(self._cache[key])
             except (ValueError, TypeError) as e:
-                gaia_logger.debug(f"[{self.name}] Error accessing {str(key)[:100]}: {e}")
                 self.delete(key); return None
         elif key in self._cache:
-            gaia_logger.debug(f"[{self.name}] Cache expired: {str(key)[:100]}...")
             self.delete(key)
         return None
     def set(self, key: Any, value: Any):
@@ -375,12 +423,10 @@ class CacheManager:
         while len(self._cache) >= self.max_size and self._access_order:
             old_key = self._access_order.pop(0)
             if old_key in self._cache:
-                gaia_logger.debug(f"[{self.name}] Evicting: {str(old_key)[:100]}...")
                 del self._cache[old_key]; del self._timestamps[old_key]
         try: self._cache[key] = copy.deepcopy(value)
         except TypeError: self._cache[key] = value
         self._timestamps[key] = time.time(); self._access_order.append(key)
-        gaia_logger.debug(f"[{self.name}] Cache set: {str(key)[:100]}. Size: {len(self)}")
     def delete(self, key: Any):
         if key in self._cache:
             try:
@@ -391,28 +437,20 @@ class CacheManager:
     def __len__(self): return len(self._cache)
     def __contains__(self, key): return key in self._cache and (time.time()-self._timestamps.get(key,0)<self.ttl)
-class SearchProvider(ABC):
     def __init__(self, config_dict: Dict):
         self.provider_config = config_dict.get('search', {})
-        self._enabled = False  # Initial default
         self._quota_used = 0
-        # This call to self.provider_name will invoke the subclass's implementation.
         raw_quota = self.provider_config.get(f'{self.provider_name.lower()}_quota', float('inf'))
         self._quota_limit = float(raw_quota) if raw_quota is not None else float('inf')
     @property
     @abstractmethod
-    def provider_name(self) -> str:
-        pass
     @abstractmethod
-    def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
-        pass
     def search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
-        if not self._enabled:
-            gaia_logger.debug(f"[{self.provider_name}] Skip: Not enabled.")
-            return None
         if self._quota_limit != float('inf') and self._quota_used >= self._quota_limit:
             gaia_logger.warning(f"[{self.provider_name}] Skip: Quota ({self._quota_used}/{int(self._quota_limit)})")
             return None
@@ -422,371 +460,230 @@ class SearchProvider(ABC):
             usage_str = f"({self._quota_used}/{int(self._quota_limit)}) "
         gaia_logger.info(f"[{self.provider_name}] {usage_str}Search: '{query[:70]}...'")
         return self._perform_search(query, max_results)
-    def available(self) -> bool:
-        return self._enabled
class GoogleProvider(SearchProvider):
    """Search provider backed by the Google Custom Search JSON API.

    Reads ``google_api_key``, ``google_cse_id`` and ``google_timeout`` from the
    provider config and enables itself only when both the API key and the CSE
    id are present.
    """

    # The Custom Search JSON API rejects ``num`` values outside 1..10.
    _MAX_RESULTS_PER_REQUEST = 10

    @property
    def provider_name(self) -> str:
        return "Google"

    def __init__(self, config_dict: Dict):
        super().__init__(config_dict)
        self._api_key = self.provider_config.get("google_api_key")
        self._cse_id = self.provider_config.get("google_cse_id")
        self._timeout = self.provider_config.get("google_timeout", 8)
        if self._api_key and self._cse_id:
            self._enabled = True
            gaia_logger.info(f"✓ {self.provider_name} API configured.")
        else:
            # Explicitly keep the provider disabled when credentials are missing.
            self._enabled = False
            gaia_logger.warning(f"✗ {self.provider_name} API key/CSE ID missing in RAG config.")

    def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
        """Run one Custom Search request.

        Returns a list of ``{'href', 'title', 'body'}`` dicts, an empty list
        when the API returned no items, or ``None`` when the request failed
        (timeout, HTTP/transport error, or any other exception).
        """
        try:
            # Clamp to the range the API accepts; an out-of-range ``num``
            # would otherwise turn into an HTTP error and a lost search.
            num_results = max(1, min(int(max_results), self._MAX_RESULTS_PER_REQUEST))
            params = {
                'key': self._api_key,
                'cx': self._cse_id,
                'q': query,
                'num': num_results,
                'safe': 'active',
            }
            response = requests.get(
                "https://www.googleapis.com/customsearch/v1",
                params=params,
                timeout=self._timeout,
            )
            response.raise_for_status()
            data = response.json()
            items = data.get('items', [])
            if not items:
                gaia_logger.info(f"[{self.provider_name}] No results for '{query[:70]}'")
                return []
            return [{
                'href': i.get('link'),
                'title': i.get('title', ''),
                'body': i.get('snippet', ''),
            } for i in items]
        except requests.exceptions.Timeout:
            gaia_logger.warning(f"[{self.provider_name}] Timeout: '{query[:70]}'")
            return None
        except requests.exceptions.RequestException as e:
            gaia_logger.warning(f"[{self.provider_name}] RequestEx: '{query[:70]}': {e}")
            return None
        except Exception as e:
            gaia_logger.error(f"[{self.provider_name}] Error: '{query[:70]}': {e}", exc_info=True)
            return None
class TavilyProvider(SearchProvider):
    """Search provider that delegates to a ``TavilyClient`` instance.

    The provider is enabled only when the Tavily library is importable, an API
    key is present in the provider config, and the client can be constructed.
    """

    @property
    def provider_name(self) -> str:
        return "Tavily"

    def __init__(self, config_dict: Dict):
        super().__init__(config_dict)
        self._api_key = self.provider_config.get("tavily_api_key")
        self._search_depth = self.provider_config.get("tavily_depth", "basic")

        # A missing library takes precedence over a missing API key.
        if not TavilyClient:
            self._enabled = False
            gaia_logger.warning(f"✗ {self.provider_name}: TavilyClient lib missing.")
            return
        if not self._api_key:
            self._enabled = False
            gaia_logger.warning(f"✗ {self.provider_name}: API key missing in RAG config.")
            return

        try:
            self._client = TavilyClient(api_key=self._api_key)
            self._enabled = True
            gaia_logger.info(f"✓ {self.provider_name} API initialized.")
        except Exception as init_err:
            self._enabled = False
            gaia_logger.warning(f"✗ {self.provider_name} init fail: {init_err}", exc_info=False)

    def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
        """Query Tavily; return normalized hits, ``[]`` for no hits, ``None`` on failure."""
        # Defensive guard for direct calls on a disabled provider.
        if not self._enabled:
            return None
        try:
            response = self._client.search(
                query=query,
                max_results=max_results,
                search_depth=self._search_depth,
            )
            hits = response.get('results', [])
            if not hits:
                gaia_logger.info(f"[{self.provider_name}] No results: '{query[:70]}'")
                return []
            normalized = []
            for hit in hits:
                normalized.append({
                    'href': hit.get('url'),
                    'title': hit.get('title', ''),
                    'body': hit.get('content', ''),
                })
            return normalized
        except Exception as search_err:
            gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {search_err}")
            return None
class DuckDuckGoProvider(SearchProvider):
    """Search provider that uses a ``DDGS`` client; it needs no API key."""

    @property
    def provider_name(self) -> str:
        return "DuckDuckGo"

    def __init__(self, config_dict: Dict):
        super().__init__(config_dict)

        if not DDGS:
            self._enabled = False
            gaia_logger.warning(f"✗ {self.provider_name}: DDGS lib missing.")
            return

        try:
            self._client = DDGS(timeout=10)
            self._enabled = True
            gaia_logger.info(f"✓ {self.provider_name} Search initialized.")
        except Exception as init_err:
            self._enabled = False
            gaia_logger.warning(f"✗ {self.provider_name} init fail: {init_err}", exc_info=False)

    def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
        """Run a text search; return normalized hits, ``[]`` for no hits, ``None`` on failure."""
        # Defensive guard for direct calls on a disabled provider.
        if not self._enabled:
            return None
        try:
            raw_hits = self._client.text(query, region='wt-wt', max_results=max_results)
            # Slice again after materializing so at most max_results are kept.
            hits = list(raw_hits)[:max_results]
            if not hits:
                gaia_logger.info(f"[{self.provider_name}] No results: '{query[:70]}'")
                return []
            normalized = []
            for hit in hits:
                normalized.append({
                    'href': hit.get('href'),
                    'title': hit.get('title', ''),
                    'body': hit.get('body', ''),
                })
            return normalized
        except Exception as search_err:
            gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {search_err}")
            return None
class CompositeSearchClient:
    """Tries the configured search providers in order, with retries and a result cache."""

    def __init__(self, config_dict: Dict):
        self.config = config_dict
        self._search_config = config_dict.get('search', {})
        self.providers = self._init_providers(config_dict)
        self.cache = CacheManager(
            ttl=config_dict.get('caching', {}).get('search_cache_ttl', 300),
            max_size=config_dict.get('caching', {}).get('search_cache_size', 50),
            name="SearchClientCache"
        )
        self._retry_att = self._search_config.get("retry_attempts", 2)
        self._retry_del = self._search_config.get("retry_delay", 2)
        self._def_max_r = self._search_config.get("default_max_results", 3)

    def _init_providers(self, config_dict: Dict) -> List[SearchProvider]:
        """Instantiate the providers whose prerequisites are met, in priority order.

        Credentials are looked up in ``config_dict['search']`` -- the same place
        the provider classes read them from -- rather than in module-level
        environment globals, so keys supplied through a custom config are
        honoured. A provider is kept only if it reports itself available.
        """
        search_cfg = config_dict.get('search', {})
        providers: List[SearchProvider] = []
        if search_cfg.get('tavily_api_key') and TavilyClient:
            tavily_prov = TavilyProvider(config_dict)
            if tavily_prov.available():
                providers.append(tavily_prov)
        if search_cfg.get('google_api_key') and search_cfg.get('google_cse_id'):
            google_prov = GoogleProvider(config_dict)
            if google_prov.available():
                providers.append(google_prov)
        if DDGS:
            ddgs_prov = DuckDuckGoProvider(config_dict)
            if ddgs_prov.available():
                providers.append(ddgs_prov)
        if not providers:
            gaia_logger.error("RAG: No search providers initialized!")
        else:
            gaia_logger.info(f"RAG Providers: {[p.provider_name for p in providers]}")
        return providers

    def search(self, query: str, max_results: Optional[int] = None, force_refresh: bool = False) -> List[Dict]:
        """Return results for ``query`` from the cache or the first provider that answers.

        Each provider gets ``retry_attempts + 1`` tries, sleeping ``retry_delay``
        seconds between failed tries. The first non-``None`` answer (including
        an empty list) is cached and returned. If every provider fails, an
        empty list is cached and returned.
        """
        q, actual_r = query.strip(), max_results if max_results is not None else self._def_max_r
        if not q:
            return []
        cache_key = (q, actual_r)
        if not force_refresh and (cached := self.cache.get(cache_key)) is not None:
            return cached
        for prov in self.providers:
            for attempt in range(self._retry_att + 1):
                if not prov.available():
                    break
                try:
                    results = prov.search(q, actual_r)
                    if results is not None:
                        self.cache.set(cache_key, results)
                        return results
                    gaia_logger.warning(f"[{prov.provider_name}] search None: '{q[:50]}' (att {attempt+1})")
                    if attempt < self._retry_att:
                        time.sleep(self._retry_del)
                except Exception as e:
                    gaia_logger.error(f"[{prov.provider_name}] Ex during search '{q[:50]}': {e}", exc_info=True)
                    if attempt < self._retry_att:
                        time.sleep(self._retry_del)
        gaia_logger.error(f"RAG: All providers failed for query: '{q[:50]}'.")
        # NOTE(review): the failure is cached too, so a transient outage keeps
        # returning [] for this query until the cache entry expires.
        self.cache.set(cache_key, [])
        return []
class GaiaQueryBuilder:
    """Turns a raw question into the staged query map used by the RAG pipeline."""

    def __init__(self, base_query: str, config_dict: Dict):
        self.base_query = base_query.strip()
        # The config is stored but not read by this class.
        self.config = config_dict
        gaia_logger.debug(f"GaiaQueryBuilder init: '{self.base_query[:100]}'")

    def get_queries(self) -> Dict[str, List[Tuple[str, str]]]:
        """Return ``{'primary': [(query, 'GENERAL')]}``, or an empty primary list for a blank query."""
        primary: List[Tuple[str, str]] = []
        if self.base_query:
            primary.append((self.base_query, 'GENERAL'))
        queries = {'primary': primary}
        gaia_logger.debug(f"RAG Generated queries: {queries}")
        return queries
class ResultProcessor:
    """De-duplicates raw search hits by URL and attaches heuristic scores.

    Each processed result gets ``source_quality`` (from the trusted-source
    table), ``temporal_relevance`` (from recency keywords / date pattern) and a
    weighted ``combined_score``.
    """

    def __init__(self, config_dict: Dict):
        self.proc_config = config_dict.get('processing', {})
        self.trusted_sources = self.proc_config.get('trusted_sources', {})
        self.seen_urls: Set[str] = set()
        # Prefer the pattern from the config that was actually passed in; fall
        # back to the default config's pattern, then to a bare four-digit year.
        fallback_pattern = DEFAULT_RAG_CONFIG['processing'].get('date_pattern', r'\b\d{4}\b')
        self.date_pattern = self.proc_config.get('date_pattern', fallback_pattern)
        gaia_logger.debug("RAG ResultProcessor initialized.")

    def process_batch(self, results: List[Dict], query_tag: str, initial_cat: str='GENERAL') -> List[Dict]:
        """Return scored copies of the results whose URL has not been seen before.

        Results without an ``href`` are dropped. URLs are compared after
        normalization and remembered in ``self.seen_urls`` across calls.
        """
        processed: List[Dict] = []
        if not results:
            return processed
        for r in results:
            url = r.get('href')
            if not url:
                continue
            normalized = self._normalize_url(url)
            if normalized in self.seen_urls:
                continue
            self.seen_urls.add(normalized)
            res_data = {
                'title': r.get('title',''),
                'body': r.get('body',''),
                'href': url,
                'query_tag': query_tag,
                'category': initial_cat,
                'source_quality': 0.5,
                'temporal_relevance': 0.1,
                'combined_score': 0.0
            }
            self._score_result(res_data)
            processed.append(res_data)
        gaia_logger.debug(f"[RAG Proc] Batch: {len(processed)} new results from '{query_tag}'")
        return processed

    def _normalize_url(self, url: str) -> str:
        """Strip the scheme, a leading ``www.`` and trailing slashes; ``""`` for a falsy URL."""
        return re.sub(r'^https?://(?:www\.)?', '', str(url)).rstrip('/') if url else ""

    def _source_quality_for(self, url: str) -> float:
        """Look up the trust score for the URL's host, defaulting to 0.5.

        A trusted domain matches the host itself and any of its subdomains
        (``en.wikipedia.org`` matches ``wikipedia.org``). When several trusted
        domains match, the longest (most specific) one wins.
        """
        domain_match = re.search(r'https?://(?:www\.)?([^/:?#]+)', url or "")
        if not domain_match:
            return 0.5
        host = domain_match.group(1).lower()
        best_len, quality = -1, 0.5
        for domain, score in self.trusted_sources.items():
            candidate = str(domain).lower()
            if (host == candidate or host.endswith('.' + candidate)) and len(candidate) > best_len:
                best_len, quality = len(candidate), score
        return quality

    def _score_result(self, result: Dict):
        """Fill ``source_quality``, ``temporal_relevance`` and ``combined_score`` in place."""
        url, body, title = result.get('href', ''), result.get('body', ''), result.get('title', '')
        source_q = self._source_quality_for(url)
        result['source_quality'] = source_q
        temporal_r = 0.1
        text_combo = (str(title) + ' ' + str(body)).lower()
        if any(k in text_combo for k in ['today', 'current', 'latest']) or re.search(r'\b\d+\s+hours?\s+ago', text_combo):
            temporal_r = 0.9
        elif re.search(self.date_pattern, text_combo):
            temporal_r = 0.5
        result['temporal_relevance'] = temporal_r
        # Source quality is weighted slightly above recency.
        result['combined_score'] = (source_q * 0.6 + temporal_r * 0.4)
class ContentEnricher:
    """Replaces a result's snippet with main text scraped from the result's page.

    Enrichment is active only when the config enables it and ``BeautifulSoup``
    is available. When disabled, ``__init__`` returns before the remaining
    attributes (timeouts, limits, cache) are set.
    """

    def __init__(self, config_dict: Dict):
        self.enrich_config = config_dict.get('enrichment', {})
        self._enabled = self.enrich_config.get('enabled', False) and bool(BeautifulSoup)
        if not self._enabled:
            gaia_logger.warning("RAG ContentEnricher disabled (BeautifulSoup missing or config).")
            return

        cfg = self.enrich_config
        self._timeout = cfg.get('timeout', 10)
        self._max_w = cfg.get('workers', 3)
        self._min_l = cfg.get('min_text_length', 200)
        self._max_l = cfg.get('max_text_length', 8000)
        self._skip_ext = tuple(cfg.get('skip_extensions', []))

        caching_cfg = config_dict.get('caching', {})
        self.cache = CacheManager(
            ttl=caching_cfg.get('enrich_cache_ttl', 600),
            max_size=caching_cfg.get('enrich_cache_size', 25),
            name="EnrichCache",
        )
        gaia_logger.info(f"RAG ContentEnricher Initialized. Enabled: {self._enabled}")

    def enrich_batch(self, results: List[Dict], force_refresh: bool = False) -> List[Dict]:
        """Enrich every result concurrently; results come back in completion order.

        Returns the input unchanged when enrichment is disabled or the input is empty.
        """
        if not (self._enabled and results):
            return results
        with ThreadPoolExecutor(max_workers=self._max_w) as pool:
            pending = [pool.submit(self._fetch_single, item, force_refresh) for item in results]
            return [done.result() for done in as_completed(pending)]

    def _fetch_single(self, result: Dict, force_refresh: bool) -> Dict:
        """Fetch one page and update ``result`` in place with the extracted text.

        Status is recorded on the result itself: ``enriched``,
        ``enrichment_failed`` (reason or exception class name) and
        ``enrichment_skipped_type`` (``no_url``, ``extension``, ``non-html``).
        """
        url = result.get('href')
        for flag, initial in (('enriched', False),
                              ('enrichment_failed', None),
                              ('enrichment_skipped_type', None)):
            result.setdefault(flag, initial)

        if not url:
            result['enrichment_skipped_type'] = 'no_url'
            return result

        if not force_refresh:
            cached = self.cache.get(url)
            # A falsy cache entry is treated as a miss and the page is refetched.
            if cached:
                result.update(cached)
                gaia_logger.debug(f"[Enrich] Cache hit: {url}")
                return result

        if url.lower().endswith(self._skip_ext):
            result['enrichment_skipped_type'] = 'extension'
            return result

        try:
            headers = {'User-Agent': 'Mozilla/5.0 GaiaRAGAgent/1.0'}
            response = requests.get(url, headers=headers, timeout=self._timeout, allow_redirects=True)
            response.raise_for_status()

            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                result['enrichment_skipped_type'] = 'non-html'
                return result

            soup = BeautifulSoup(response.text, 'lxml')
            # Drop page chrome and non-text elements before extracting text.
            noise_selectors = ["script", "style", "nav", "header", "footer", "aside", "form", "iframe", "img", "svg", ".ad", ".advertisement"]
            for selector in noise_selectors:
                for node in soup.select(selector):
                    node.decompose()

            main_el = soup.select_one('article, main, [role="main"], .entry-content, .post-content, #content, #main') or soup.body
            text = main_el.get_text(separator='\n', strip=True) if main_el else ""
            # Collapse runs of blank lines into a single paragraph break.
            text = re.sub(r'(\s*\n\s*){2,}', '\n\n', text).strip()

            if len(text) < self._min_l:
                result['enrichment_failed'] = 'too_short'
            else:
                suffix = "..." if len(text) > self._max_l else ""
                result['body'] = text[:self._max_l] + suffix
                result['enriched'] = True
                self.cache.set(url, {'body': result['body'], 'enriched': True})
                gaia_logger.info(f"[Enrich] OK: {url} ({len(result['body'])} chars).")
        except Exception as fetch_err:
            result['enrichment_failed'] = type(fetch_err).__name__
            gaia_logger.warning(f"[Enrich] Fail: {url}: {fetch_err}", exc_info=False)
        return result
class GeneralRAGPipeline:
    """Search, score, optionally enrich, and cache web results for a query."""

    def __init__(self, config_dict: Optional[Dict] = None):
        self.config = DEFAULT_RAG_CONFIG if config_dict is None else config_dict
        self.search_client = CompositeSearchClient(self.config)

        enrich_cfg = self.config.get('enrichment', {})
        if enrich_cfg.get('enabled', False) and BeautifulSoup:
            self.enricher = ContentEnricher(self.config)
        else:
            self.enricher = None
        if not self.enricher:
            gaia_logger.info("RAG Content Enrichment is disabled (no BeautifulSoup or config).")

        caching_cfg = self.config.get('caching', {})
        self.pipeline_cache = CacheManager(
            ttl=caching_cfg.get('analyzer_cache_ttl', 3600),
            max_size=caching_cfg.get('analyzer_cache_size', 30),
            name="RAGPipelineCache",
        )
        gaia_logger.info("GeneralRAGPipeline initialized.")

    def analyze(self, query: str, force_refresh: bool = False) -> List[Dict]:
        """Return up to ``total_limit`` scored results for ``query``.

        A blank query yields ``[]``. Unless ``force_refresh`` is set, a cached
        answer for the same query and settings is returned directly. With
        ``force_refresh`` the search cache (and enrichment cache, if any) is
        cleared before searching.
        """
        q = query.strip()
        if not q:
            return []

        results_cfg = self.config.get('results', {})
        search_cfg = self.config.get('search', {})
        total_lim = results_cfg.get('total_limit', 3)
        enrich_cnt = results_cfg.get('enrich_count', 2)
        enrich_en = self.config.get('enrichment', {}).get('enabled', False) and bool(self.enricher)
        max_r_pq = search_cfg.get('default_max_results', 3)
        cache_key = (q, max_r_pq, total_lim, enrich_en, enrich_cnt)

        if force_refresh:
            self.search_client.cache.clear()
            if self.enricher:
                self.enricher.cache.clear()
        else:
            cached = self.pipeline_cache.get(cache_key)
            if cached is not None:
                gaia_logger.info(f"[RAG Analyze] Cache hit: '{q[:50]}'")
                return cached

        def by_score(item: Dict):
            return item.get('combined_score', 0)

        all_res: List[Dict] = []
        res_proc = ResultProcessor(self.config)
        staged_qs = GaiaQueryBuilder(q, self.config).get_queries()
        for stage, stage_queries in staged_qs.items():
            for query_s, cat in stage_queries:
                # Stop this stage once twice the final limit has been collected.
                if len(all_res) >= total_lim * 2:
                    break
                gaia_logger.info(f"[RAG Analyze] Stage '{stage}': Search '{query_s[:70]}'")
                s_res = self.search_client.search(query_s, max_results=max_r_pq, force_refresh=force_refresh)
                all_res.extend(res_proc.process_batch(s_res or [], query_s, initial_cat=cat))
        all_res = sorted(all_res, key=by_score, reverse=True)

        if enrich_en and self.enricher and all_res:
            to_enrich = [r for r in all_res[:enrich_cnt] if r.get('href')]
            gaia_logger.info(f"[RAG Analyze] Enriching {len(to_enrich)} items...")
            enriched_map = {}
            for item in self.enricher.enrich_batch(to_enrich, force_refresh=force_refresh):
                if item.get('href'):
                    enriched_map[item['href']] = item
            # Swap in the enriched version of each result where one exists.
            merged = [enriched_map.get(r['href'], r) if r.get('href') else r for r in all_res]
            all_res = sorted(merged, key=by_score, reverse=True)

        final_results = all_res[:total_lim]
        gaia_logger.info(f"[RAG Analyze] Done. {len(final_results)} results for '{q[:50]}'")
        self.pipeline_cache.set(cache_key, final_results)
        return final_results
 class GaiaLevel1Agent:
     def __init__(self, api_url: str = DEFAULT_API_URL):
         self.api_url = api_url
@@ -796,9 +693,7 @@ class GaiaLevel1Agent:
         if genai and GOOGLE_GEMINI_API_KEY:
             try:
                 genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
-                # Using gemini-1.5-flash-latest for better context window support
-                # and consistency with MAX_CONTEXT_LENGTH_LLM = 30000
-                model_name = 'gemini-2.0-flash'
                 self.llm_model = genai.GenerativeModel(model_name)
                 gaia_logger.info(f"Gemini LLM ('{model_name}') initialized.")
             except Exception as e:
@@ -813,7 +708,6 @@ class GaiaLevel1Agent:
     @lru_cache(maxsize=32)
     def _fetch_and_process_file_content(self, task_id: str) -> Optional[str]:
         file_url = f"{self.api_url}/files/{task_id}"
-        # gaia_logger.info(f"Agent fetching file from: {file_url}") # Reduced verbosity
         for attempt in range(2):
             try:
                 response = requests.get(file_url, timeout=AGENT_DEFAULT_TIMEOUT)
@@ -827,8 +721,6 @@ class GaiaLevel1Agent:
                         filename = header_filename
                 content_type = response.headers.get("Content-Type", "")
-                # gaia_logger.info(f"File downloaded: {filename}, type: {content_type}, size: {len(response.content)} bytes") # Reduced verbosity
                 processed_content = FileProcessor.process(response.content, filename, content_type)
                 return processed_content
@@ -845,19 +737,43 @@ class GaiaLevel1Agent:
                 if attempt < 1: time.sleep(1)
         return None
-    def _formulate_answer_with_llm(self, question: str, file_context: Optional[str], web_context: Optional[str]) -> str:
         if not self.llm_model:
             gaia_logger.warning("LLM model (Gemini) not available for answer formulation.")
-            # Fallback if LLM is entirely unavailable
             if web_context and file_context:
-                return "FINAL ANSWER: LLM unavailable; context from file and web was found but not processed by LLM."
             elif web_context:
-                return f"FINAL ANSWER: LLM unavailable; web context found: {web_context.splitlines()[0] if web_context.splitlines() else 'No specific snippet found.'}"
             elif file_context:
-                return f"FINAL ANSWER: LLM unavailable; file context found: {file_context[:100]}..."
-            return "FINAL ANSWER: LLM unavailable and no context found."
-        # --- NEW PROMPT STRUCTURE ---
         prompt_parts = [
             "You are a general AI assistant. Your primary goal is to answer the user's question accurately and concisely based *only* on the provided context (from a document and/or web search results).",
             "First, think step-by-step and briefly explain your reasoning based on the context. This part is for clarity and should come before your final answer.",
@@ -870,18 +786,16 @@ class GaiaLevel1Agent:
             "Prioritize information from 'Enriched Content' from web search results if available and relevant over shorter 'Snippets'.",
             "\nUser Question: ", question
         ]
-        # --- END OF NEW PROMPT STRUCTURE HEAD ---
         current_prompt_text_len = sum(len(p) for p in prompt_parts)
-        # Context preparation (similar to before, but ensure it fits with new prompt instructions)
         context_added = False
         if file_context:
             file_header = "\n\nContext from Provided Document:\n---"
             file_footer = "\n---"
-            max_len_for_file = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - (len(web_context) if web_context else 0) - len(file_header) - len(file_footer) - 500 # Buffer for web, answer instructions etc.
-            if max_len_for_file > 100 : # Only add if there's meaningful space
                 truncated_file_context = file_context[:max_len_for_file]
                 if len(file_context) > len(truncated_file_context):
                     truncated_file_context += " ... (file context truncated)"
@@ -895,10 +809,9 @@ class GaiaLevel1Agent:
         if web_context:
             web_header = "\n\nContext from Web Search Results:\n---"
             web_footer = "\n---"
-            # Recalculate available length for web specifically
-            available_len_for_web = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(web_header) - len(web_footer) - 300 # Buffer for answer instructions
-            if available_len_for_web > 100: # Only add if there's meaningful space
                 truncated_web_context = web_context
                 if len(web_context) > available_len_for_web:
                     truncated_web_context = web_context[:available_len_for_web] + "\n... (web context truncated)"
@@ -910,10 +823,10 @@ class GaiaLevel1Agent:
                 gaia_logger.warning("Not enough space for web context in LLM prompt, or web context itself is empty.")
-        if not context_added: # If neither file nor web context could be added (e.g., due to length)
             prompt_parts.append("\n\nNo document or web context could be provided due to length constraints or availability.")
-        prompt_parts.append("\n\nReasoning and Final Answer:") # LLM will put its thoughts here, then "FINAL ANSWER: ..."
         final_prompt = "\n".join(prompt_parts)
         gaia_logger.info(f"LLM Prompt (first 300): {final_prompt[:300]}...")
@@ -922,15 +835,14 @@ class GaiaLevel1Agent:
         if not GenerationConfig:
             gaia_logger.error("GenerationConfig not available. Cannot make LLM call.")
-            return "FINAL ANSWER: LLM configuration error."
         try:
             gen_config = GenerationConfig(
-                temperature=0.1, # Reduced temperature for more deterministic and rule-following answers
-                top_p=0.95,      # Kept top_p
-                max_output_tokens=2048 # Should be enough for thoughts + answer
             )
-            # Safety settings remain the same
             safety_set = [{"category": c, "threshold": "BLOCK_MEDIUM_AND_ABOVE"} for c in ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]]
             response = self.llm_model.generate_content(
@@ -940,59 +852,38 @@ class GaiaLevel1Agent:
             )
             if not response.candidates or (hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason):
-                reason = "Unknown"
                 if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason:
-                    reason = response.prompt_feedback.block_reason.name
-                gaia_logger.warning(f"Gemini response blocked. Reason: {reason}.")
-                # Return in the requested format even for errors
-                return f"My response was blocked (Reason: {reason}). FINAL ANSWER: Error processing request."
-            llm_answer = response.text
-            gaia_logger.info(f"LLM Full Answer (first 200): {llm_answer[:200]}...")
-            # Ensure the output contains "FINAL ANSWER:" as per new strong requirement.
-            # If not, we might need to append it or re-prompt, but for now, let's see how well the LLM adheres.
-            if "FINAL ANSWER:" not in llm_answer:
-                gaia_logger.warning("LLM did not produce 'FINAL ANSWER:' template. Appending based on full response.")
-                # This is a fallback, ideally the LLM follows the prompt.
-                # For a GAIA contest, just returning the raw text might be safer if it's mostly the answer.
-                # Or, if the answer is consistently the last part:
-                # lines = llm_answer.strip().split('\n')
-                # simple_final_answer = lines[-1] if lines else "Could not extract answer"
-                # return f"LLM output did not follow template. Attempted extraction: FINAL ANSWER: {simple_final_answer}"
-                # For now, let the raw output pass, as it might contain partial reasoning + answer.
-                # The strictness of GAIA might penalize this more than a missing template from the LLM.
-                # The prompt is very explicit, so the LLM *should* follow it.
-                pass # Let raw LLM output through if it misses the template for now.
-            return llm_answer
         except Exception as e:
             gaia_logger.error(f"Error calling Gemini API: {e}", exc_info=True)
             error_type_name = type(e).__name__
             if "429" in str(e) or "ResourceExhausted" in error_type_name:
-                return "Error: LLM temporarily unavailable (rate limit). FINAL ANSWER: LLM rate limit."
-            return f"Error generating LLM answer: {error_type_name}. FINAL ANSWER: LLM error."
-    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
-        # This part remains largely the same, as it's about gathering context
-        # The _formulate_answer_with_llm will now use the new prompt
         gaia_logger.info(f"Agent processing: '{question[:70]}...', TaskID: {task_id}")
         q_lower = question.lower().strip()
-        # Simple canned response - ensure it also follows the new format if strictly needed,
-        # but this is usually for agent identity, not a GAIA scored question.
-        # For GAIA, it might be better to let the LLM answer this with context if any.
-        # However, if this is a hardcoded check:
         if "what is your name" in q_lower or "who are you" in q_lower:
-            return "I am a general AI assistant. FINAL ANSWER: general AI assistant"
         file_ctx_str: Optional[str] = None
-        # Expanded keywords slightly for more robust file-related question detection
         file_kws = ["document", "file", "text", "provide", "attach", "read", "content", "table", "data", "excel", "pdf", "audio", "code", "script", "log"]
-        # Check if question *implies* a file is primary, not just mentions a type
-        if task_id and (any(kw in q_lower for kw in file_kws) or "this task involves a file" in q_lower): # Hypothetical trigger
             file_ctx_str = self._fetch_and_process_file_content(task_id)
             if file_ctx_str:
                 gaia_logger.info(f"Processed file context ({len(file_ctx_str)} chars) for task {task_id}")
@@ -1001,14 +892,11 @@ class GaiaLevel1Agent:
         web_ctx_str: Optional[str] = None
         needs_web = True
-        # Heuristic to skip web search if substantial file context exists and question isn't clearly web-focused
-        if file_ctx_str and len(file_ctx_str) > 300: # If file context is somewhat substantial
-            # Keywords that strongly suggest a web search is still needed
             web_still_needed_kws = [
                 "what is", "who is", "current", "latest", "news", "public opinion",
                 "recent events", "search for", "find information on", "browse", "look up"
             ]
-            # Keywords that might be answerable from a good document
             doc_can_answer_kws = ["summarize", "according to the document", "in the provided text"]
             if any(kw in q_lower for kw in doc_can_answer_kws) and not any(kw in q_lower for kw in web_still_needed_kws):
@@ -1024,10 +912,8 @@ class GaiaLevel1Agent:
         if needs_web:
             search_q = question.replace("?", "").strip()
-            # Tavily query length is handled within TavilyProvider now.
-            # No general truncation here unless other providers also show issues.
             gaia_logger.info(f"RAG Pipeline initiated for query: {search_q[:70]}")
-            rag_res = self.rag_pipeline.analyze(query=search_q, force_refresh=False) # Consider force_refresh for some GAIA levels if freshness is key
             if rag_res:
                 snippets = []
                 for i, res_item in enumerate(rag_res):
@@ -1036,93 +922,97 @@ class GaiaLevel1Agent:
                     href = res_item.get('href','#')
                     provider = res_item.get('query_tag','WebSearch')
                     prefix = "EnrichedContent" if res_item.get('enriched') else "Snippet"
-                    # Truncate individual snippets less aggressively here, final truncation happens in _formulate_answer_with_llm
                     body_preview = (body[:1500] + "...") if len(body) > 1500 else body
                     snippets.append(f"Source [{i+1} - {provider}]: {title}\nURL: {href}\n{prefix}: {body_preview}\n---")
                 web_ctx_str = "\n\n".join(snippets)
                 gaia_logger.info(f"RAG processed {len(rag_res)} sources, total web context length for LLM (pre-truncation): {len(web_ctx_str)} chars.")
             else:
                 gaia_logger.warning("RAG pipeline yielded no web results for the query.")
-        answer = self._formulate_answer_with_llm(question, file_ctx_str, web_ctx_str)
-        gaia_logger.info(f"LLM-based answer (first 70 after FINAL ANSWER: if present): {answer.split('FINAL ANSWER:')[-1].strip()[:70]}...")
-        return answer
-def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
-    if profile:
-        username = f"{profile.username}"
-        gaia_logger.info(f"User logged in: {username}")
-    else:
-        gaia_logger.warning("User not logged in.")
-        return "Please Login to Hugging Face.", None
     questions_url, submit_url = f"{DEFAULT_API_URL}/questions", f"{DEFAULT_API_URL}/submit"
-    try:
-        agent = GaiaLevel1Agent(api_url=DEFAULT_API_URL)
-        gaia_logger.info("GaiaLevel1Agent (RAG & FileProcessor) initialized for evaluation.")
-    except Exception as e:
-        gaia_logger.error(f"Error instantiating agent: {e}", exc_info=True)
-        return f"Error initializing agent: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code link unavailable"
-    gaia_logger.info(f"Agent code link: {agent_code}")
     try:
-        response = requests.get(questions_url, timeout=15)
-        response.raise_for_status()
         questions_data = response.json()
-        if not questions_data or not isinstance(questions_data, list):
-            gaia_logger.error(f"Fetched questions list empty/invalid: {questions_data}")
-            return "Questions list empty/invalid.", None
-        gaia_logger.info(f"Fetched {len(questions_data)} questions.")
-    except Exception as e:
-        gaia_logger.error(f"Error fetching questions: {e}", exc_info=True)
-        return f"Error fetching questions: {e}", None
-    results_log, answers_payload = [], []
-    GEMINI_RPM_LIMIT = 60
-    sleep_llm = (60.0 / GEMINI_RPM_LIMIT) + 0.8 if GEMINI_RPM_LIMIT > 0 else 0.5
-    gaia_logger.info(f"LLM Rate: {GEMINI_RPM_LIMIT} RPM. Sleep ~{sleep_llm:.2f}s between LLM calls.")
-    gaia_logger.info(f"Running agent on {len(questions_data)} questions...")
     for i, item in enumerate(questions_data):
         task_id, q_text = item.get("task_id"), item.get("question")
         if not task_id or q_text is None:
-            results_log.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": "SKIPPED"})
             continue
         gaia_logger.info(f"Q {i+1}/{len(questions_data)} - Task: {task_id}")
         try:
-            answer = agent(question=q_text, task_id=task_id)
-            answers_payload.append({"task_id": task_id, "submitted_answer": answer})
-            results_log.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": answer})
         except Exception as e:
-            gaia_logger.error(f"Error agent task {task_id}: {e}", exc_info=True)
-            results_log.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": f"AGENT ERROR: {e}"})
-        if i < len(questions_data) - 1:
-            gaia_logger.info(f"Sleep {sleep_llm:.2f}s for LLM rate limit.")
-            time.sleep(sleep_llm)
-    if not answers_payload:
-        return "Agent produced no answers.", pd.DataFrame(results_log or [{"Info": "No questions processed"}])
-    submission = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-    gaia_logger.info(f"Submitting {len(answers_payload)} answers for '{username}'...")
     try:
-        response = requests.post(submit_url, json=submission, timeout=60)
         response.raise_for_status()
         result_data = response.json()
         status = (f"Submission Successful!\nUser: {result_data.get('username')}\nScore: {result_data.get('score','N/A')}% "
                   f"({result_data.get('correct_count','?')}/{result_data.get('total_attempted','?')} correct)\n"
                   f"Msg: {result_data.get('message','No message.')}")
-        gaia_logger.info("Submission successful.")
         return status, pd.DataFrame(results_log)
     except requests.exceptions.HTTPError as e:
         err_detail = f"Server: {e.response.status_code}. Detail: {e.response.text[:200]}"
-        gaia_logger.error(f"Submission Fail HTTP: {err_detail}", exc_info=False)
         return f"Submission Failed: {err_detail}", pd.DataFrame(results_log)
-    except Exception as e:
-        gaia_logger.error(f"Submission Fail: {e}", exc_info=True)
-        return f"Submission Failed: {e}", pd.DataFrame(results_log)
-with gr.Blocks(title="GAIA RAG Agent - Advanced") as demo:
     gr.Markdown("# Gaia Level 1 Agent (RAG & FileProcessor) Evaluation Runner")
     gr.Markdown(
         """
@@ -1131,6 +1021,7 @@ with gr.Blocks(title="GAIA RAG Agent - Advanced") as demo:
         2.  Click 'Run Evaluation & Submit All Answers'.
         ---
         Agent uses RAG, advanced File Processing, and LLM.
         """
     )
     gr.LoginButton()
@@ -1139,37 +1030,14 @@ with gr.Blocks(title="GAIA RAG Agent - Advanced") as demo:
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
     run_button.click(fn=run_and_submit_all, inputs=[], outputs=[status_output, results_table])
-if __name__ == "__main__":
     print("\n" + "-"*30 + " RAG & FileProcessor Agent App Starting " + "-"*30)
-    required_env = {
-        "GOOGLE_GEMINI_API_KEY": GOOGLE_GEMINI_API_KEY,
-        "GOOGLE_API_KEY": GOOGLE_CUSTOM_SEARCH_API_KEY,
-        "GOOGLE_CSE_ID": GOOGLE_CUSTOM_SEARCH_CSE_ID,
-        "TAVILY_API_KEY": TAVILY_API_KEY,
-    }
-    missing_keys = [key_name for key_name, key_val in required_env.items() if not key_val]
-    for key_name in required_env:
-        if required_env[key_name]:
-            print(f"✅ {key_name} found.")
-        else:
-            print(f"⚠️ WARNING: {key_name} not set.")
-    if not DDGS:
-        print("⚠️ WARNING: duckduckgo_search lib missing (for RAG DDG).")
-    else:
-        print("✅ duckduckgo_search lib found (for RAG DDG).")
-    if not BeautifulSoup:
-        print("⚠️ WARNING: BeautifulSoup lib missing (for RAG Enricher).")
-    else:
-        print("✅ BeautifulSoup lib found (for RAG Enricher).")
-    if not genai:
-        print("⚠️ WARNING: google-generativeai lib missing (for LLM).")
-    else:
-        print("✅ google-generativeai lib found (for LLM).")
-    if missing_keys:
-        print(f"\n--- PLEASE SET THE FOLLOWING MISSING ENVIRONMENT VARIABLES FOR FULL FUNCTIONALITY: {', '.join(missing_keys)} ---\n")
     print("-"*(60 + len(" RAG & FileProcessor Agent App Starting ")) + "\n")
-    print("Launching Gradio Interface...")
-    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False)

 import re
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import TimeoutError as FuturesTimeoutError
 from collections import defaultdict
 try:
     print("WARNING: librosa library not found. Audio processing may be impaired. Install with: pip install librosa")
 try:
+    import openpyxl
 except ImportError:
     openpyxl = None
     print("WARNING: openpyxl library not found. .xlsx file processing might fail. Install with: pip install openpyxl")
 except ImportError:
     pdfplumber = None
     print("WARNING: pdfplumber library not found. PDF file processing will be unavailable. Install with: pip install pdfplumber")
 logging.basicConfig(
     level=logging.INFO,
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 AGENT_DEFAULT_TIMEOUT = 15
+MAX_CONTEXT_LENGTH_LLM = 30000
+MAX_FILE_SIZE = 5 * 1024 * 1024
 CSV_SAMPLE_ROWS = 3
+MAX_FILE_CONTEXT_LENGTH = 10000
 asr_pipeline_instance: Optional[Any] = None
+ASR_MODEL_NAME = "openai/whisper-tiny"
+ASR_PROCESSING_TIMEOUT_SECONDS = 240
 DEFAULT_RAG_CONFIG = {
     'search': {
         'google_cse_id': GOOGLE_CUSTOM_SEARCH_CSE_ID,
         'tavily_api_key': TAVILY_API_KEY,
         'default_max_results': 3, 'retry_attempts': 2, 'retry_delay': 2,
+        'google_timeout': 8, 'tavily_depth': "basic",
+        'max_query_length_tavily': 380
     },
     'processing': {
         'trusted_sources': {'wikipedia.org': 0.8, 'reuters.com': 0.75, 'apnews.com': 0.75},
         global asr_pipeline_instance
         if asr_pipeline_instance is None and hf_transformers_pipeline and torch:
             try:
+                device = -1
                 asr_pipeline_instance = hf_transformers_pipeline(
                     "automatic-speech-recognition",
                     model=ASR_MODEL_NAME,
         try:
             if len(content) > MAX_FILE_SIZE:
                 return f"Error: File '{filename_str}' exceeds maximum allowed size ({MAX_FILE_SIZE // (1024*1024)}MB)."
             if 'csv' in content_type_str or filename_str.endswith('.csv'):
                 return FileProcessor._process_csv(content, filename_str)
             elif 'json' in content_type_str or filename_str.endswith('.json'):
                 return FileProcessor._process_json(content, filename_str)
             elif ('excel' in content_type_str or 'spreadsheetml' in content_type_str or \
+                  filename_str.endswith(('.xlsx', '.xls'))) and openpyxl:
                 return FileProcessor._process_excel(content, filename_str)
+            elif ('pdf' in content_type_str or filename_str.endswith('.pdf')) and pdfplumber:
                 return FileProcessor._process_pdf(content, filename_str)
             elif ('audio' in content_type_str or \
                   filename_str.endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a'))) and \
+                  hf_transformers_pipeline and librosa:
                 return FileProcessor._process_audio(content, filename_str)
             elif 'text/plain' in content_type_str or \
                  ('text/' in content_type_str and not any(sub in content_type_str for sub in ['html', 'xml'])) or \
                  filename_str.endswith(('.txt', '.md', '.py', '.js', '.c', '.cpp', '.java', '.html', '.xml', '.log')):
                 return FileProcessor._process_text(content, filename_str)
             else:
                 return FileProcessor._handle_unknown_type(content, filename_str)
         except Exception as e:
             gaia_logger.error(f"File processing error for '{filename_str}': {str(e)}", exc_info=True)
     @staticmethod
     def _process_csv(content: bytes, filename: str) -> str:
+        gaia_logger.info(f"Processing CSV file: {filename}")
+        df = None
         try:
             encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
             for enc in encodings_to_try:
                 try:
                     df = pd.read_csv(io.BytesIO(content), encoding=enc)
                 except Exception: continue
             if df is None: return f"Error: Could not decode CSV '{filename}'."
             summary = (
+                f"CSV Document Summary: '{filename}' ({len(df)} rows, {len(df.columns)} columns):\n"
+                f"Columns: {', '.join(df.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df))} sample rows:\n{df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)}"
             )
             return FileProcessor._truncate_text(summary, filename, "CSV")
+        except Exception as e:
+            if "tabulate" in str(e).lower() and df is not None:
+                gaia_logger.error(f"CSV to_markdown error for '{filename}' (missing tabulate): {e}", exc_info=False)
+                try:
+                    summary = (
+                        f"CSV Document Summary: '{filename}' ({len(df)} rows, {len(df.columns)} columns):\n"
+                        f"Columns: {', '.join(df.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df))} sample rows (plain text):\n{df.head(CSV_SAMPLE_ROWS).to_string(index=False)}"
+                    )
+                    return FileProcessor._truncate_text(summary, filename, "CSV (plain)")
+                except Exception as e_fallback:
+                    gaia_logger.error(f"CSV fallback to_string error for '{filename}': {e_fallback}", exc_info=True)
+                    return f"Error processing CSV '{filename}' (formatting fallback failed): {str(e_fallback)}"
+            gaia_logger.error(f"CSV processing error for '{filename}': {e}", exc_info=True)
             return f"Error processing CSV '{filename}': {str(e)}"
     @staticmethod
     def _process_json(content: bytes, filename: str) -> str:
+        gaia_logger.info(f"Processing JSON file: {filename}")
         try:
             decoded_content = content.decode('utf-8', errors='replace')
             data = json.loads(decoded_content)
     @staticmethod
     def _process_text(content: bytes, filename: str) -> str:
+        gaia_logger.info(f"Processing Text-like file: {filename}")
         try:
             text = None
             encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
     @staticmethod
     def _process_excel(content: bytes, filename: str) -> str:
+        gaia_logger.info(f"Processing Excel file: {filename}")
         if not openpyxl: return f"Error: Excel processing skipped for '{filename}', openpyxl library not available."
+        xls = None
+        df_list_for_fallback = []
         try:
             xls = pd.ExcelFile(io.BytesIO(content), engine='openpyxl')
             summary_parts = [f"Excel Document Summary: '{filename}'"]
             for sheet_name in xls.sheet_names:
                 df = xls.parse(sheet_name)
+                df_list_for_fallback.append((sheet_name, df))
                 sheet_summary = (
+                    f"\n---\nSheet: '{sheet_name}' ({len(df)} rows, {len(df.columns)} columns):\n"
+                    f"Columns: {', '.join(df.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df))} sample rows:\n{df.head(CSV_SAMPLE_ROWS).to_markdown(index=False)}"
                 )
                 summary_parts.append(sheet_summary)
+                if sum(len(p) for p in summary_parts) > MAX_FILE_CONTEXT_LENGTH * 0.8:
                     summary_parts.append("\n... (further sheets omitted due to length)")
                     break
             full_summary = "".join(summary_parts)
             return FileProcessor._truncate_text(full_summary, filename, "Excel")
+        except Exception as e:
+            if "tabulate" in str(e).lower():
+                gaia_logger.error(f"Excel to_markdown error for '{filename}' (missing tabulate): {e}", exc_info=False)
+                try:
+                    summary_parts_fallback = [f"Excel Document Summary: '{filename}'"]
+                    if not df_list_for_fallback and xls:
+                         for sheet_name in xls.sheet_names:
+                            df_list_for_fallback.append((sheet_name, xls.parse(sheet_name)))
+                    elif not xls and not df_list_for_fallback:
+                        temp_xls = pd.ExcelFile(io.BytesIO(content), engine='openpyxl')
+                        for sheet_name in temp_xls.sheet_names:
+                            df_list_for_fallback.append((sheet_name, temp_xls.parse(sheet_name)))
+                    for sheet_name_fb, df_fb in df_list_for_fallback:
+                        sheet_summary_fallback = (
+                            f"\n---\nSheet: '{sheet_name_fb}' ({len(df_fb)} rows, {len(df_fb.columns)} columns):\n"
+                            f"Columns: {', '.join(df_fb.columns)}\nFirst {min(CSV_SAMPLE_ROWS, len(df_fb))} sample rows (plain text):\n{df_fb.head(CSV_SAMPLE_ROWS).to_string(index=False)}"
+                        )
+                        summary_parts_fallback.append(sheet_summary_fallback)
+                        if sum(len(p) for p in summary_parts_fallback) > MAX_FILE_CONTEXT_LENGTH * 0.8:
+                            summary_parts_fallback.append("\n... (further sheets omitted due to length)")
+                            break
+                    full_summary_fallback = "".join(summary_parts_fallback)
+                    return FileProcessor._truncate_text(full_summary_fallback, filename, "Excel (plain)")
+                except Exception as e_fallback:
+                    gaia_logger.error(f"Excel fallback to_string error for '{filename}': {e_fallback}", exc_info=True)
+                    return f"Error processing Excel '{filename}' (formatting fallback failed): {str(e_fallback)}"
+            gaia_logger.error(f"Excel processing error for '{filename}': {e}", exc_info=True)
             return f"Error processing Excel file '{filename}': {str(e)}"
     @staticmethod
     def _process_pdf(content: bytes, filename: str) -> str:
+        gaia_logger.info(f"Processing PDF file: {filename}")
         if not pdfplumber: return f"Error: PDF processing skipped for '{filename}', pdfplumber library not available."
         text_content = ""
         try:
                         page_text = page.extract_text()
                         if page_text:
                             text_content += page_text + "\n"
+                        if len(text_content) > MAX_FILE_CONTEXT_LENGTH * 1.2:
                             break
             if not text_content:
                 return f"PDF Document: '{filename}'. No text could be extracted or PDF is empty."
             summary = f"PDF Document: '{filename}':\n{text_content}"
             return FileProcessor._truncate_text(summary, filename, "PDF")
         except Exception as e:
             return f"Error processing PDF file '{filename}': {str(e)}"
+    @staticmethod
+    def _perform_asr_transcription(asr_pipeline_ref, audio_data_np, filename_for_log):
+        gaia_logger.info(f"ASR: Starting transcription for {filename_for_log} in thread.")
+        return asr_pipeline_ref(audio_data_np, chunk_length_s=30, return_timestamps=False, generate_kwargs={"task": "transcribe", "language": "en"})
     @staticmethod
     def _process_audio(content: bytes, filename: str) -> str:
+        gaia_logger.info(f"Processing Audio file: {filename}")
+        asr_pipeline_ref = FileProcessor._get_asr_pipeline()
+        if not asr_pipeline_ref:
             return f"Error: Audio processing skipped for '{filename}', ASR pipeline not available."
         if not librosa:
             return f"Error: Audio processing skipped for '{filename}', librosa library not available."
         try:
             with io.BytesIO(content) as audio_buffer:
                 y, sr = librosa.load(audio_buffer, sr=16000, mono=True)
+            duration_seconds = len(y) / sr
+            gaia_logger.info(f"Audio file: {filename}, Duration: {duration_seconds:.2f} seconds. Timeout set to: {ASR_PROCESSING_TIMEOUT_SECONDS}s")
             start_time = time.time()
+            transcribed_text = ""
+            with ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(FileProcessor._perform_asr_transcription, asr_pipeline_ref, y, filename)
+                try:
+                    transcription_result = future.result(timeout=ASR_PROCESSING_TIMEOUT_SECONDS)
+                    transcribed_text = transcription_result.get("text", "") if isinstance(transcription_result, dict) else str(transcription_result)
+                except FuturesTimeoutError:
+                    gaia_logger.warning(f"ASR transcription for '{filename}' timed out after {ASR_PROCESSING_TIMEOUT_SECONDS} seconds.")
+                    return f"Error: Audio transcription for '{filename}' timed out after {ASR_PROCESSING_TIMEOUT_SECONDS}s."
+                except Exception as e_thread:
+                    gaia_logger.error(f"ASR transcription thread for '{filename}' failed: {e_thread}", exc_info=True)
+                    if "3000 mel input features" in str(e_thread) or "return_timestamps" in str(e_thread):
+                         return f"Error processing Audio file '{filename}': Transcription failed due to long-form audio issue (mel features/timestamps). Original error: {str(e_thread)}"
+                    return f"Error during audio transcription for '{filename}': {str(e_thread)}"
+            end_time = time.time()
+            gaia_logger.info(f"Audio transcription for '{filename}' (or timeout) took {end_time - start_time:.2f} seconds.")
             if not transcribed_text.strip():
+                return f"Audio Document: '{filename}'. Transcription result was empty or ASR failed."
             summary = f"Audio Document (Transcription): '{filename}':\n{transcribed_text}"
             return FileProcessor._truncate_text(summary, filename, "Audio Transcription")
         except Exception as e:
             gaia_logger.error(f"Audio processing/transcription error for '{filename}': {str(e)}", exc_info=True)
             return f"Error processing Audio file '{filename}': {str(e)}"
     @staticmethod
     def _handle_unknown_type(content: bytes, filename: str) -> str:
         gaia_logger.warning(f"Attempting to handle unknown file type for '{filename}' as text snippet.")
         except Exception:
             return f"File with Unknown Content Type: '{filename}'. Content is likely binary and cannot be displayed as text."
+class CacheManager:
     def __init__(self, ttl: int = 300, max_size: int = 100, name: str = "Cache"):
         self.ttl = ttl; self.max_size = max_size
         self._cache: Dict[Any, Any] = {}; self._timestamps: Dict[Any, float] = {}
         if key in self._cache and (time.time() - self._timestamps.get(key, 0) < self.ttl):
             try:
                 self._access_order.remove(key); self._access_order.append(key)
                 return copy.deepcopy(self._cache[key])
             except (ValueError, TypeError) as e:
                 self.delete(key); return None
         elif key in self._cache:
             self.delete(key)
         return None
     def set(self, key: Any, value: Any):
         while len(self._cache) >= self.max_size and self._access_order:
             old_key = self._access_order.pop(0)
             if old_key in self._cache:
                 del self._cache[old_key]; del self._timestamps[old_key]
         try: self._cache[key] = copy.deepcopy(value)
         except TypeError: self._cache[key] = value
         self._timestamps[key] = time.time(); self._access_order.append(key)
     def delete(self, key: Any):
         if key in self._cache:
             try:
     def __len__(self): return len(self._cache)
     def __contains__(self, key): return key in self._cache and (time.time()-self._timestamps.get(key,0)<self.ttl)
+class SearchProvider(ABC):
     def __init__(self, config_dict: Dict):
         self.provider_config = config_dict.get('search', {})
+        self._enabled = False
         self._quota_used = 0
         raw_quota = self.provider_config.get(f'{self.provider_name.lower()}_quota', float('inf'))
         self._quota_limit = float(raw_quota) if raw_quota is not None else float('inf')
     @property
     @abstractmethod
+    def provider_name(self) -> str: pass
     @abstractmethod
+    def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]: pass
     def search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
+        if not self._enabled: return None
         if self._quota_limit != float('inf') and self._quota_used >= self._quota_limit:
             gaia_logger.warning(f"[{self.provider_name}] Skip: Quota ({self._quota_used}/{int(self._quota_limit)})")
             return None
             usage_str = f"({self._quota_used}/{int(self._quota_limit)}) "
         gaia_logger.info(f"[{self.provider_name}] {usage_str}Search: '{query[:70]}...'")
         return self._perform_search(query, max_results)
+    def available(self) -> bool: return self._enabled
+class GoogleProvider(SearchProvider):
     @property
+    def provider_name(self) -> str: return "Google"
     def __init__(self, config_dict: Dict):
+        super().__init__(config_dict)
         self._api_key = self.provider_config.get("google_api_key")
         self._cse_id = self.provider_config.get("google_cse_id")
         self._timeout = self.provider_config.get("google_timeout", 8)
+        if self._api_key and self._cse_id: self._enabled = True; gaia_logger.info(f"✓ {self.provider_name} API configured.")
+        else: self._enabled = False; gaia_logger.warning(f"✗ {self.provider_name} API key/CSE ID missing.")
     def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
         try:
+            params = {'key': self._api_key, 'cx': self._cse_id, 'q': query, 'num': max_results, 'safe': 'active'}
+            response = requests.get("https://www.googleapis.com/customsearch/v1", params=params, timeout=self._timeout)
             response.raise_for_status()
+            items = response.json().get('items', [])
+            if not items: return []
+            return [{'href': i.get('link'), 'title': i.get('title', ''), 'body': i.get('snippet', '')} for i in items]
+        except requests.exceptions.Timeout: gaia_logger.warning(f"[{self.provider_name}] Timeout: '{query[:70]}'"); return None
+        except requests.exceptions.RequestException as e: gaia_logger.warning(f"[{self.provider_name}] RequestEx: '{query[:70]}': {e}"); return None
+        except Exception as e: gaia_logger.error(f"[{self.provider_name}] Error: '{query[:70]}': {e}", exc_info=True); return None
+class TavilyProvider(SearchProvider):
     @property
+    def provider_name(self) -> str: return "Tavily"
     def __init__(self, config_dict: Dict):
+        super().__init__(config_dict)
         self._api_key = self.provider_config.get("tavily_api_key")
         self._search_depth = self.provider_config.get("tavily_depth", "basic")
         if self._api_key and TavilyClient:
+            try: self._client = TavilyClient(api_key=self._api_key); self._enabled = True; gaia_logger.info(f"✓ {self.provider_name} API initialized.")
+            except Exception as e: self._enabled = False; gaia_logger.warning(f"✗ {self.provider_name} init fail: {e}", exc_info=False)
+        elif not TavilyClient: self._enabled = False; gaia_logger.warning(f"✗ {self.provider_name}: TavilyClient lib missing.")
+        else: self._enabled = False; gaia_logger.warning(f"✗ {self.provider_name}: API key missing.")
     def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
+        if not self._enabled: return None
         try:
+            max_len = DEFAULT_RAG_CONFIG['search'].get('max_query_length_tavily', 380)
+            if len(query) > max_len:
+                gaia_logger.warning(f"[{self.provider_name}] Query truncated from {len(query)} to {max_len} chars for API limit.")
+                query = query[:max_len]
             response = self._client.search(query=query, max_results=max_results, search_depth=self._search_depth)
             hits = response.get('results', [])
+            if not hits: return []
             return [{'href': h.get('url'), 'title': h.get('title',''), 'body': h.get('content','')} for h in hits]
         except Exception as e: gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {e}"); return None
+class DuckDuckGoProvider(SearchProvider):
     @property
+    def provider_name(self) -> str: return "DuckDuckGo"
     def __init__(self, config_dict: Dict):
+        super().__init__(config_dict)
         if DDGS:
+            try: self._client = DDGS(timeout=10); self._enabled = True; gaia_logger.info(f"✓ {self.provider_name} Search initialized.")
+            except Exception as e: self._enabled = False; gaia_logger.warning(f"✗ {self.provider_name} init fail: {e}", exc_info=False)
+        else: self._enabled = False; gaia_logger.warning(f"✗ {self.provider_name}: DDGS lib missing.")
     def _perform_search(self, query: str, max_results: int) -> Optional[List[Dict[str, str]]]:
+        if not self._enabled: return None
         try:
             hits = list(self._client.text(query, region='wt-wt', max_results=max_results))[:max_results]
+            if not hits: return []
             return [{'href': r.get('href'), 'title': r.get('title',''), 'body': r.get('body','')} for r in hits]
         except Exception as e: gaia_logger.warning(f"[{self.provider_name}] Search fail: '{query[:70]}': {e}"); return None
+class CompositeSearchClient:
     def __init__(self, config_dict: Dict):
         self.config = config_dict
         self._search_config = config_dict.get('search', {})
         self.providers = self._init_providers(config_dict)
         self.cache = CacheManager(
             ttl=config_dict.get('caching', {}).get('search_cache_ttl', 300),
+            max_size=config_dict.get('caching', {}).get('search_cache_size', 50), name="SearchClientCache"
         )
         self._retry_att = self._search_config.get("retry_attempts", 2)
         self._retry_del = self._search_config.get("retry_delay", 2)
         self._def_max_r = self._search_config.get("default_max_results", 3)
     def _init_providers(self, config_dict: Dict) -> List[SearchProvider]:
         providers: List[SearchProvider] = []
         if TAVILY_API_KEY and TavilyClient:
             tavily_prov = TavilyProvider(config_dict)
+            if tavily_prov.available(): providers.append(tavily_prov)
         if GOOGLE_CUSTOM_SEARCH_API_KEY and GOOGLE_CUSTOM_SEARCH_CSE_ID:
             google_prov = GoogleProvider(config_dict)
+            if google_prov.available(): providers.append(google_prov)
         if DDGS:
             ddgs_prov = DuckDuckGoProvider(config_dict)
+            if ddgs_prov.available(): providers.append(ddgs_prov)
+        if not providers: gaia_logger.error("RAG: No search providers initialized!")
+        else: gaia_logger.info(f"RAG Providers: {[p.provider_name for p in providers]}")
         return providers
     def search(self, query: str, max_results: Optional[int] = None, force_refresh: bool = False) -> List[Dict]:
         q, actual_r = query.strip(), max_results if max_results is not None else self._def_max_r
+        if not q: return []
         cache_key = (q, actual_r)
+        if not force_refresh and (cached := self.cache.get(cache_key)) is not None: return cached
         for prov in self.providers:
             for attempt in range(self._retry_att + 1):
+                if not prov.available(): break
                 try:
                     results = prov.search(q, actual_r)
+                    if results is not None: self.cache.set(cache_key, results); return results
+                    if attempt < self._retry_att: time.sleep(self._retry_del)
                 except Exception as e:
+                    if attempt < self._retry_att: time.sleep(self._retry_del)
         self.cache.set(cache_key, [])
         return []
+class GaiaQueryBuilder:
     def __init__(self, base_query: str, config_dict: Dict):
         self.base_query = base_query.strip()
+        self.config = config_dict
     def get_queries(self) -> Dict[str, List[Tuple[str, str]]]:
+        return {'primary': [(self.base_query, 'GENERAL')]} if self.base_query else {'primary': []}
+class ResultProcessor:
     def __init__(self, config_dict: Dict):
         self.proc_config = config_dict.get('processing', {})
         self.trusted_sources = self.proc_config.get('trusted_sources', {})
         self.seen_urls: Set[str] = set()
         self.date_pattern = DEFAULT_RAG_CONFIG['processing'].get('date_pattern', r'\b\d{4}\b')
     def process_batch(self, results: List[Dict], query_tag: str, initial_cat: str='GENERAL') -> List[Dict]:
         processed: List[Dict] = []
+        if not results: return processed
         for r in results:
             url = r.get('href')
+            if not url or self._normalize_url(url) in self.seen_urls: continue
             self.seen_urls.add(self._normalize_url(url))
+            res_data = {'title': r.get('title',''), 'body': r.get('body',''), 'href': url, 'query_tag': query_tag, 'category': initial_cat, 'source_quality': 0.5, 'temporal_relevance': 0.1, 'combined_score': 0.0}
             self._score_result(res_data)
             processed.append(res_data)
         return processed
+    def _normalize_url(self, url: str) -> str: return re.sub(r'^https?://(?:www\.)?', '', str(url)).rstrip('/') if url else ""
     def _score_result(self, result: Dict):
         url, body, title = result.get('href', ''), result.get('body', ''), result.get('title', '')
         source_q = 0.5
+        if domain_match := re.search(r'https?://(?:www\.)?([^/]+)', url or ""): source_q = self.trusted_sources.get(domain_match.group(1), 0.5)
         result['source_quality'] = source_q
+        temporal_r = 0.1; text_combo = (str(title) + ' ' + str(body)).lower()
+        if any(k in text_combo for k in ['today', 'current', 'latest']) or re.search(r'\b\d+\s+hours?\s+ago', text_combo): temporal_r = 0.9
+        elif re.search(self.date_pattern, text_combo): temporal_r = 0.5
         result['temporal_relevance'] = temporal_r
         result['combined_score'] = (source_q * 0.6 + temporal_r * 0.4)
+class ContentEnricher:
+    # Replaces short search snippets with text extracted from the result's web page.
+    # Active only when the 'enrichment' config enables it and BeautifulSoup is available.
     def __init__(self, config_dict: Dict):
         self.enrich_config = config_dict.get('enrichment', {})
         self._enabled = self.enrich_config.get('enabled', False) and bool(BeautifulSoup)
+        # When disabled, __init__ stops here: the timeout, worker count, length
+        # limits, skip list and cache attributes below are never created.
+        if not self._enabled: return
         self._timeout = self.enrich_config.get('timeout', 10)
         self._max_w = self.enrich_config.get('workers', 3)
         self._min_l, self._max_l = self.enrich_config.get('min_text_length', 200), self.enrich_config.get('max_text_length', 8000)
         self._skip_ext = tuple(self.enrich_config.get('skip_extensions', []))
+        self.cache = CacheManager(ttl=config_dict.get('caching', {}).get('enrich_cache_ttl', 600), max_size=config_dict.get('caching', {}).get('enrich_cache_size', 25), name="EnrichCache")
         gaia_logger.info(f"RAG ContentEnricher Initialized. Enabled: {self._enabled}")
     def enrich_batch(self, results: List[Dict], force_refresh: bool = False) -> List[Dict]:
+        # Enrich every result concurrently. Returns the input unchanged when the
+        # enricher is disabled or the list is empty. Otherwise results are collected
+        # in completion order, which may differ from the input order.
+        if not self._enabled or not results: return results
         updated_res = []
         with ThreadPoolExecutor(max_workers=self._max_w) as executor:
             future_map = {executor.submit(self._fetch_single, r, force_refresh): r for r in results}
+            for future in as_completed(future_map): updated_res.append(future.result())
         return updated_res
     def _fetch_single(self, result: Dict, force_refresh: bool) -> Dict:
+        # Fetch one result's page and, if enough text is extracted, replace its 'body'.
+        # The result dict is mutated in place and returned; status is reported through
+        # the 'enriched', 'enrichment_failed' and 'enrichment_skipped_type' keys.
+        url = result.get('href'); result.setdefault('enriched', False); result.setdefault('enrichment_failed', None); result.setdefault('enrichment_skipped_type', None)
+        if not url: result['enrichment_skipped_type'] = 'no_url'; return result
+        # A truthy cache entry short-circuits the fetch; a falsy (but not None)
+        # entry falls through to a fresh fetch.
         if not force_refresh and (cached := self.cache.get(url)) is not None:
+            if cached: result.update(cached); return result
+        if url.lower().endswith(self._skip_ext): result['enrichment_skipped_type'] = 'extension'; return result
         try:
             headers = {'User-Agent': 'Mozilla/5.0 GaiaRAGAgent/1.0'}
             response = requests.get(url, headers=headers, timeout=self._timeout, allow_redirects=True)
             response.raise_for_status()
+            if 'text/html' not in response.headers.get('Content-Type', '').lower(): result['enrichment_skipped_type'] = 'non-html'; return result
             soup = BeautifulSoup(response.text, 'lxml')
+            # Remove elements matching these selectors before extracting text.
             for el_name in ["script", "style", "nav", "header", "footer", "aside", "form", "iframe", "img", "svg", ".ad", ".advertisement"]:
+                for el in soup.select(el_name): el.decompose()
+            # Use the first element matching the main-content selectors, else the whole <body>.
             main_el = soup.select_one('article, main, [role="main"], .entry-content, .post-content, #content, #main') or soup.body
             text = main_el.get_text(separator='\n', strip=True) if main_el else ""
+            # Collapse runs of blank lines into a single blank line.
             text = re.sub(r'(\s*\n\s*){2,}', '\n\n', text).strip()
             if len(text) >= self._min_l:
                 result['body'] = text[:self._max_l] + ("..." if len(text) > self._max_l else "")
+                result['enriched'] = True; self.cache.set(url, {'body': result['body'], 'enriched': True})
+            else: result['enrichment_failed'] = 'too_short'
+        # Any failure (network, HTTP status, parsing) is recorded by exception class name.
+        except Exception as e: result['enrichment_failed'] = type(e).__name__
         return result
+class GeneralRAGPipeline:
     def __init__(self, config_dict: Optional[Dict] = None):
         self.config = config_dict if config_dict is not None else DEFAULT_RAG_CONFIG
         self.search_client = CompositeSearchClient(self.config)
         enrich_cfg = self.config.get('enrichment', {})
         self.enricher = ContentEnricher(self.config) if enrich_cfg.get('enabled', False) and BeautifulSoup else None
+        if not self.enricher: gaia_logger.info("RAG Content Enrichment disabled.")
+        self.pipeline_cache = CacheManager(ttl=self.config.get('caching', {}).get('analyzer_cache_ttl', 3600), max_size=self.config.get('caching', {}).get('analyzer_cache_size', 30), name="RAGPipelineCache")
         gaia_logger.info("GeneralRAGPipeline initialized.")
     def analyze(self, query: str, force_refresh: bool = False) -> List[Dict]:
+        q = query.strip();
+        if not q: return []
         cfg_res, cfg_search = self.config.get('results', {}), self.config.get('search', {})
         total_lim, enrich_cnt = cfg_res.get('total_limit', 3), cfg_res.get('enrich_count', 2)
         enrich_en = self.config.get('enrichment', {}).get('enabled', False) and bool(self.enricher)
         max_r_pq = cfg_search.get('default_max_results', 3)
         cache_key = (q, max_r_pq, total_lim, enrich_en, enrich_cnt)
+        if not force_refresh and (cached := self.pipeline_cache.get(cache_key)) is not None: return cached
+        if force_refresh: self.search_client.cache.clear();
+        if self.enricher: self.enricher.cache.clear()
         all_res, res_proc = [], ResultProcessor(self.config)
         staged_qs = GaiaQueryBuilder(q, self.config).get_queries()
         for stage, qs_in_stage in staged_qs.items():
             for query_s, cat in qs_in_stage:
+                if len(all_res) >= total_lim * 2: break
                 s_res = self.search_client.search(query_s, max_results=max_r_pq, force_refresh=force_refresh)
                 all_res.extend(res_proc.process_batch(s_res or [], query_s, initial_cat=cat))
         all_res.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
         if enrich_en and self.enricher and all_res:
             to_enrich = [r for r in all_res[:enrich_cnt] if r.get('href')]
+            enriched_map = {item['href']: item for item in self.enricher.enrich_batch(to_enrich, force_refresh=force_refresh) if item.get('href')}
             temp_results = [enriched_map.get(r['href'], r) if r.get('href') else r for r in all_res]
+            all_res = temp_results; all_res.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
         final_results = all_res[:total_lim]
         self.pipeline_cache.set(cache_key, final_results)
         return final_results
 class GaiaLevel1Agent:
     def __init__(self, api_url: str = DEFAULT_API_URL):
         self.api_url = api_url
         if genai and GOOGLE_GEMINI_API_KEY:
             try:
                 genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
+                model_name = 'gemini-2.0-flash'
                 self.llm_model = genai.GenerativeModel(model_name)
                 gaia_logger.info(f"Gemini LLM ('{model_name}') initialized.")
             except Exception as e:
     @lru_cache(maxsize=32)
     def _fetch_and_process_file_content(self, task_id: str) -> Optional[str]:
         file_url = f"{self.api_url}/files/{task_id}"
         for attempt in range(2):
             try:
                 response = requests.get(file_url, timeout=AGENT_DEFAULT_TIMEOUT)
                         filename = header_filename
                 content_type = response.headers.get("Content-Type", "")
                 processed_content = FileProcessor.process(response.content, filename, content_type)
                 return processed_content
                 if attempt < 1: time.sleep(1)
         return None
+    def _parse_llm_output(self, llm_text: str) -> Dict[str, str]:
+        reasoning_trace = ""
+        model_answer = ""
+        final_answer_sentinel = "FINAL ANSWER:"
+        parts = llm_text.split(final_answer_sentinel, 1)
+        if len(parts) == 2:
+            reasoning_trace = parts[0].strip()
+            model_answer = parts[1].strip()
+        else:
+            reasoning_trace = llm_text # Fallback: all text is reasoning
+            lines = llm_text.strip().split('\n')
+            model_answer = lines[-1].strip() if lines else "Could not parse answer" # Fallback: last line is answer
+            gaia_logger.warning(f"LLM output did not contain '{final_answer_sentinel}'. Using fallback parsing.")
+        return {"model_answer": model_answer, "reasoning_trace": reasoning_trace}
+    def _formulate_answer_with_llm(self, question: str, file_context: Optional[str], web_context: Optional[str]) -> Dict[str, str]:
+        default_error_answer = "Information not available in provided context"
+        default_reasoning = "LLM processing failed or context insufficient."
         if not self.llm_model:
             gaia_logger.warning("LLM model (Gemini) not available for answer formulation.")
+            reasoning = "LLM model (Gemini) not available for answer formulation."
+            answer = default_error_answer
             if web_context and file_context:
+                reasoning += " Context from file and web was found but not processed by LLM."
             elif web_context:
+                reasoning += f" Web context found: {web_context.splitlines()[0] if web_context.splitlines() else 'No specific snippet found.'}"
             elif file_context:
+                reasoning += f" File context found: {file_context[:100]}..."
+            else:
+                 reasoning += " No context found."
+            return {"model_answer": answer, "reasoning_trace": reasoning}
         prompt_parts = [
             "You are a general AI assistant. Your primary goal is to answer the user's question accurately and concisely based *only* on the provided context (from a document and/or web search results).",
             "First, think step-by-step and briefly explain your reasoning based on the context. This part is for clarity and should come before your final answer.",
             "Prioritize information from 'Enriched Content' from web search results if available and relevant over shorter 'Snippets'.",
             "\nUser Question: ", question
         ]
         current_prompt_text_len = sum(len(p) for p in prompt_parts)
         context_added = False
         if file_context:
             file_header = "\n\nContext from Provided Document:\n---"
             file_footer = "\n---"
+            max_len_for_file = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - (len(web_context) if web_context else 0) - len(file_header) - len(file_footer) - 500
+            if max_len_for_file > 100 :
                 truncated_file_context = file_context[:max_len_for_file]
                 if len(file_context) > len(truncated_file_context):
                     truncated_file_context += " ... (file context truncated)"
         if web_context:
             web_header = "\n\nContext from Web Search Results:\n---"
             web_footer = "\n---"
+            available_len_for_web = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(web_header) - len(web_footer) - 300
+            if available_len_for_web > 100:
                 truncated_web_context = web_context
                 if len(web_context) > available_len_for_web:
                     truncated_web_context = web_context[:available_len_for_web] + "\n... (web context truncated)"
                 gaia_logger.warning("Not enough space for web context in LLM prompt, or web context itself is empty.")
+        if not context_added:
             prompt_parts.append("\n\nNo document or web context could be provided due to length constraints or availability.")
+        prompt_parts.append("\n\nReasoning and Final Answer:")
         final_prompt = "\n".join(prompt_parts)
         gaia_logger.info(f"LLM Prompt (first 300): {final_prompt[:300]}...")
         if not GenerationConfig:
             gaia_logger.error("GenerationConfig not available. Cannot make LLM call.")
+            return {"model_answer": "LLM configuration error", "reasoning_trace": "GenerationConfig not available."}
         try:
             gen_config = GenerationConfig(
+                temperature=0.1,
+                top_p=0.95,
+                max_output_tokens=2048
             )
             safety_set = [{"category": c, "threshold": "BLOCK_MEDIUM_AND_ABOVE"} for c in ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]]
             response = self.llm_model.generate_content(
             )
             if not response.candidates or (hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason):
+                reason_text = "Unknown"
                 if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason:
+                    reason_text = response.prompt_feedback.block_reason.name
+                gaia_logger.warning(f"Gemini response blocked. Reason: {reason_text}.")
+                return {"model_answer": "Error processing request", "reasoning_trace": f"My response was blocked (Reason: {reason_text})."}
+            llm_answer_text = response.text
+            gaia_logger.info(f"LLM Raw Full Answer (first 200): {llm_answer_text[:200]}...")
+            return self._parse_llm_output(llm_answer_text)
         except Exception as e:
             gaia_logger.error(f"Error calling Gemini API: {e}", exc_info=True)
             error_type_name = type(e).__name__
+            reasoning = f"Error calling Gemini API: {error_type_name} - {str(e)}"
+            answer = "LLM API error"
             if "429" in str(e) or "ResourceExhausted" in error_type_name:
+                answer = "LLM rate limit"
+                reasoning = "Error: LLM temporarily unavailable (rate limit)."
+            return {"model_answer": answer, "reasoning_trace": reasoning}
+    def __call__(self, question: str, task_id: Optional[str] = None) -> Dict[str, str]:
         gaia_logger.info(f"Agent processing: '{question[:70]}...', TaskID: {task_id}")
         q_lower = question.lower().strip()
         if "what is your name" in q_lower or "who are you" in q_lower:
+            return {"model_answer": "general AI assistant", "reasoning_trace": "User asked for my identity."}
         file_ctx_str: Optional[str] = None
         file_kws = ["document", "file", "text", "provide", "attach", "read", "content", "table", "data", "excel", "pdf", "audio", "code", "script", "log"]
+        if task_id and (any(kw in q_lower for kw in file_kws) or "this task involves a file" in q_lower):
             file_ctx_str = self._fetch_and_process_file_content(task_id)
             if file_ctx_str:
                 gaia_logger.info(f"Processed file context ({len(file_ctx_str)} chars) for task {task_id}")
         web_ctx_str: Optional[str] = None
         needs_web = True
+        if file_ctx_str and len(file_ctx_str) > 300:
             web_still_needed_kws = [
                 "what is", "who is", "current", "latest", "news", "public opinion",
                 "recent events", "search for", "find information on", "browse", "look up"
             ]
             doc_can_answer_kws = ["summarize", "according to the document", "in the provided text"]
             if any(kw in q_lower for kw in doc_can_answer_kws) and not any(kw in q_lower for kw in web_still_needed_kws):
         if needs_web:
             search_q = question.replace("?", "").strip()
             gaia_logger.info(f"RAG Pipeline initiated for query: {search_q[:70]}")
+            rag_res = self.rag_pipeline.analyze(query=search_q, force_refresh=False)
             if rag_res:
                 snippets = []
                 for i, res_item in enumerate(rag_res):
                     href = res_item.get('href','#')
                     provider = res_item.get('query_tag','WebSearch')
                     prefix = "EnrichedContent" if res_item.get('enriched') else "Snippet"
                     body_preview = (body[:1500] + "...") if len(body) > 1500 else body
                     snippets.append(f"Source [{i+1} - {provider}]: {title}\nURL: {href}\n{prefix}: {body_preview}\n---")
                 web_ctx_str = "\n\n".join(snippets)
                 gaia_logger.info(f"RAG processed {len(rag_res)} sources, total web context length for LLM (pre-truncation): {len(web_ctx_str)} chars.")
             else:
                 gaia_logger.warning("RAG pipeline yielded no web results for the query.")
+        agent_response_dict = self._formulate_answer_with_llm(question, file_ctx_str, web_ctx_str)
+        gaia_logger.info(f"LLM-based model_answer (first 70): {agent_response_dict.get('model_answer', '')[:70]}...")
+        return agent_response_dict
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
+    if profile: username = f"{profile.username}"
+    else: return "Please Login to Hugging Face.", None
     questions_url, submit_url = f"{DEFAULT_API_URL}/questions", f"{DEFAULT_API_URL}/submit"
+    try: agent = GaiaLevel1Agent(api_url=DEFAULT_API_URL)
+    except Exception as e: return f"Error initializing agent: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code link unavailable"
     try:
+        response = requests.get(questions_url, timeout=15); response.raise_for_status()
         questions_data = response.json()
+        if not questions_data or not isinstance(questions_data, list): return "Questions list empty/invalid.", None
+    except Exception as e: return f"Error fetching questions: {e}", None
+    results_log, answers_payload_for_submission = [], []
+    GEMINI_RPM_LIMIT = int(os.getenv("GEMINI_RPM_LIMIT", "60"))
+    sleep_llm = (60.0 / GEMINI_RPM_LIMIT) + 0.5 if GEMINI_RPM_LIMIT > 0 else 0.2
     for i, item in enumerate(questions_data):
         task_id, q_text = item.get("task_id"), item.get("question")
         if not task_id or q_text is None:
+            results_log.append({"Task ID": task_id, "Question": q_text, "Model Answer": "SKIPPED", "Reasoning Trace": ""})
             continue
         gaia_logger.info(f"Q {i+1}/{len(questions_data)} - Task: {task_id}")
+        model_answer_val = "AGENT ERROR"
+        reasoning_trace_val = "Agent error occurred."
         try:
+            agent_response_dict = agent(question=q_text, task_id=task_id)
+            model_answer_val = agent_response_dict.get("model_answer", "Error: No model_answer key")
+            reasoning_trace_val = agent_response_dict.get("reasoning_trace", "")
+            answers_payload_for_submission.append({
+                "task_id": task_id,
+                "model_answer": model_answer_val,
+                "reasoning_trace": reasoning_trace_val
+            })
+            results_log.append({"Task ID": task_id, "Question": q_text, "Model Answer": model_answer_val, "Reasoning Trace": reasoning_trace_val[:500] + "..." if len(reasoning_trace_val)>500 else reasoning_trace_val})
         except Exception as e:
+            reasoning_trace_val = f"AGENT ERROR: {type(e).__name__} - {e}"
+            answers_payload_for_submission.append({
+                "task_id": task_id,
+                "model_answer": model_answer_val, # "AGENT ERROR"
+                "reasoning_trace": reasoning_trace_val
+            })
+            results_log.append({"Task ID": task_id, "Question": q_text, "Model Answer": model_answer_val, "Reasoning Trace": reasoning_trace_val})
+        if i < len(questions_data) - 1: time.sleep(sleep_llm)
+    if not answers_payload_for_submission: return "Agent produced no answers.", pd.DataFrame(results_log or [{"Info": "No questions processed"}])
+    submission_content_lines = []
+    for ans_item in answers_payload_for_submission:
+        submission_entry = {"task_id": ans_item["task_id"], "model_answer": ans_item["model_answer"]}
+        if ans_item.get("reasoning_trace"): # Add reasoning_trace only if it exists and is not empty
+            submission_entry["reasoning_trace"] = ans_item["reasoning_trace"]
+        submission_content_lines.append(json.dumps(submission_entry))
+    submission_json_lines = "\n".join(submission_content_lines)
+    submission_payload_for_api = {
+        "username": username.strip(),
+        "agent_code": agent_code,
+        "answers_jsonl_string": submission_json_lines
+    }
+    gaia_logger.info(f"Submitting {len(answers_payload_for_submission)} answers for '{username}'...")
+    gaia_logger.debug(f"Submission payload sample for API: {json.dumps(submission_payload_for_api)[:500]}")
     try:
+        response = requests.post(submit_url, json=submission_payload_for_api, timeout=60);
         response.raise_for_status()
         result_data = response.json()
         status = (f"Submission Successful!\nUser: {result_data.get('username')}\nScore: {result_data.get('score','N/A')}% "
                   f"({result_data.get('correct_count','?')}/{result_data.get('total_attempted','?')} correct)\n"
                   f"Msg: {result_data.get('message','No message.')}")
         return status, pd.DataFrame(results_log)
     except requests.exceptions.HTTPError as e:
         err_detail = f"Server: {e.response.status_code}. Detail: {e.response.text[:200]}"
         return f"Submission Failed: {err_detail}", pd.DataFrame(results_log)
+    except Exception as e: return f"Submission Failed: {e}", pd.DataFrame(results_log)
+with gr.Blocks(title="GAIA RAG Agent - Advanced") as demo:
     # Builds the Gradio UI and binds it to the module-level name `demo`,
     # which the `__main__` section launches.
     # Page heading followed by the usage instructions shown to the user.
     gr.Markdown("# Gaia Level 1 Agent (RAG & FileProcessor) Evaluation Runner")
     gr.Markdown(
         """
         2.  Click 'Run Evaluation & Submit All Answers'.
         ---
         Agent uses RAG, advanced File Processing, and LLM.
+        **Remember to add `tabulate` to your requirements.txt!**
         """
     )
     gr.LoginButton()
     # Table component that receives the second value returned by the click handler.
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
     # NOTE(review): `run_button` and `status_output` are not created in the
     # lines visible here -- presumably defined in lines omitted from this
     # view; confirm they exist before this call.
     # The handler is registered with no input components; its outputs are
     # routed to `status_output` and `results_table`, in that order.
     run_button.click(fn=run_and_submit_all, inputs=[], outputs=[status_output, results_table])
if __name__ == "__main__":
    # Startup self-check: print a banner, report which API keys and optional
    # libraries are available, then start the Gradio server.
    print("\n" + "-" * 30 + " RAG & FileProcessor Agent App Starting " + "-" * 30)

    # Display label -> module-level configuration value to check for presence.
    required_env = {
        "GOOGLE_GEMINI_API_KEY": GOOGLE_GEMINI_API_KEY,
        "GOOGLE_API_KEY": GOOGLE_CUSTOM_SEARCH_API_KEY,
        "GOOGLE_CSE_ID": GOOGLE_CUSTOM_SEARCH_CSE_ID,
        "TAVILY_API_KEY": TAVILY_API_KEY,
    }

    # Report every key and remember the unset ones for the summary line below.
    missing_keys = []
    for k, v in required_env.items():
        if v:
            print(f"✅ {k} found.")
        else:
            missing_keys.append(k)
            print(f"⚠️ WARNING: {k} not set.")

    # Optional dependencies: a falsy handle means the import was unavailable.
    optional_libs = (
        ("transformers", hf_transformers_pipeline),
        ("torch", torch),
        ("librosa", librosa),
        ("openpyxl", openpyxl),
        ("pdfplumber", pdfplumber),
    )
    for lib_name, lib_var in optional_libs:
        if lib_var:
            print(f"✅ {lib_name} lib found.")
        else:
            print(f"⚠️ WARNING: {lib_name} lib missing (some file types may not be processed).")

    print("👉 REMEMBER TO INSTALL 'tabulate' if you haven't: pip install tabulate")
    if missing_keys:
        print(f"\n--- PLEASE SET MISSING ENV VARS: {', '.join(missing_keys)} ---\n")

    # Closing rule spans the same width as the opening banner (30 + title + 30).
    print("-" * (60 + len(" RAG & FileProcessor Agent App Starting ")) + "\n")

    # Listen on all interfaces on port 7860; no debug mode, no public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, debug=False, share=False)