# taijichat/tools/agent_tools.py
# (Hugging Face commit 8d66edb: "Add authentication, token quota tracking,
# and comprehensive usage logging")
import pandas as pd
import os
import json
import glob # For os.walk if needed, or can use glob directly
import mimetypes
from datetime import datetime
from functools import lru_cache
import time
from openai import OpenAI # Add OpenAI import for the describe_image function
# --- NEW IMPORTS FOR LITERATURE SEARCH ---
from semanticscholar import SemanticScholar # For Semantic Scholar API
from Bio import Entrez # For PubMed
import arxiv # For ArXiv API
# --- END NEW IMPORTS ---
# --- NEW IMPORTS FOR TEXT FETCHING FROM URLS ---
import requests
from bs4 import BeautifulSoup
import base64 # For encoding image data
import io
from PIL import Image # For image processing
# --- END NEW IMPORTS FOR TEXT FETCHING ---
# --- Define Project Root and WWW Path relative to this file ---
# This file is in taijichat/tools/
# Project root is one level up from 'tools' (i.e., 'taijichat/' directory)
_TOOLS_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.abspath(os.path.join(_TOOLS_DIR, ".."))
BASE_WWW_PATH = os.path.join(_PROJECT_ROOT, "www") # This will be an absolute path
UI_TEXTS_FILE = os.path.join(_TOOLS_DIR, "ui_texts.json") # ui_texts.json is in the same directory
TF_PAGERANK_PATH = os.path.join(BASE_WWW_PATH, "tablePagerank")
WAVE_ANALYSIS_PATH = os.path.join(BASE_WWW_PATH, "waveanalysis")
TF_CORR_PATH = os.path.join(BASE_WWW_PATH, "TFcorintextrm")
TF_COMMUNITIES_PATH = os.path.join(BASE_WWW_PATH, "tfcommunities")
# Cache timeout in seconds
CACHE_TIMEOUT = 300 # 5 minutes
# Cache for storing data with timestamps
_data_cache = {}
_cache_timestamps = {}
def _is_cache_valid(cache_key):
"""Check if cached data is still valid based on timeout."""
if cache_key not in _cache_timestamps:
return False
return (time.time() - _cache_timestamps[cache_key]) < CACHE_TIMEOUT
def _get_cached_data(cache_key):
"""Get data from cache if valid."""
if _is_cache_valid(cache_key):
return _data_cache.get(cache_key)
return None
def _set_cached_data(cache_key, data):
"""Store data in cache with current timestamp."""
_data_cache[cache_key] = data
_cache_timestamps[cache_key] = time.time()
# --- Tool Implementations ---
def get_raw_excel_data(file_path: str) -> list:
    """
    Read the Excel file at *file_path* and return its full contents
    (no header interpretation) as a list of row lists, with NaN cells
    replaced by empty strings.

    Results are stored in the module TTL cache so the file is re-read
    at most once per CACHE_TIMEOUT.  Returns [] on any error.
    """
    # FIX: the previous version was additionally wrapped in
    # functools.lru_cache, which never expires.  That made the TTL cache
    # below dead code and meant an edited workbook was never re-read for
    # the lifetime of the process.  The decorator has been removed so
    # CACHE_TIMEOUT is actually honored.
    try:
        cache_key = f"raw_excel_{file_path}"
        cached_data = _get_cached_data(cache_key)
        if cached_data is not None:
            return cached_data
        df = pd.read_excel(file_path, header=None)
        df = df.fillna('')
        result = df.values.tolist()
        _set_cached_data(cache_key, result)
        return result
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return []
    except Exception as e:
        print(f"Error reading Excel file {file_path}: {e}")
        return []
def get_processed_tf_data(dataset_identifier: str) -> list:
    """
    Load a TF-catalog Excel file by its dataset identifier and return it
    transposed, as [headers] + data rows, with NaN cells as ''.

    Results are served from the module TTL cache when fresh.  Returns []
    on any error (unresolvable identifier, missing file, read failure).
    """
    try:
        cache_key = f"processed_tf_{dataset_identifier}"
        hit = _get_cached_data(cache_key)
        if hit is not None:
            return hit
        file_path = get_tf_catalog_dataset_path(dataset_identifier)
        if not file_path:
            print(f"Error: Could not resolve dataset_identifier '{dataset_identifier}' to a file path.")
            return []
        if not os.path.exists(file_path):
            print(f"Error: File not found at resolved path: {file_path}")
            return []
        # The workbook is stored with TFs as columns; transpose so the
        # first transposed row becomes the header row.
        frame = pd.read_excel(file_path).transpose()
        header_row = frame.iloc[0].tolist()
        body = frame[1:]
        body.columns = header_row
        body = body.fillna('')
        table = [header_row] + body.values.tolist()
        _set_cached_data(cache_key, table)
        return table
    except Exception as e:
        print(f"Error processing TF data for {dataset_identifier}: {e}")
        return []
def filter_data_by_column_keywords(dataset: list, keywords: str) -> list:
    """
    Keep only the columns of *dataset* whose header contains any of the
    comma-separated *keywords* (case-insensitive substring match).

    *dataset* must be the actual data — a list of lists whose first row
    is the header row (e.g. the output of get_processed_tf_data) — never
    a dataset name or file path.  Returns the dataset unchanged for blank
    keywords, [headers] when nothing matches, and [] for malformed input.
    """
    valid = (
        isinstance(dataset, list)
        and len(dataset) >= 1
        and isinstance(dataset[0], list)
    )
    if not valid:
        print("Error: Invalid dataset format for filtering. Must be a list of lists with headers.")
        return []
    headers = dataset[0]
    data_rows = dataset[1:] if len(dataset) > 1 else []
    df = pd.DataFrame(data_rows, columns=headers)
    if not keywords or not keywords.strip():
        return dataset
    wanted = [kw.strip().lower() for kw in keywords.split(',')]
    # When only a header row was supplied the DataFrame is empty, so match
    # directly against the raw header list; otherwise match df.columns.
    candidates = headers if (df.empty and not data_rows) else list(df.columns)
    hits = []
    for col in candidates:
        col_lower = str(col).lower()
        if any(kw in col_lower for kw in wanted):
            hits.append(col)
    if not hits:
        return [headers]
    # De-duplicate while keeping the original header order.
    ordered = sorted(set(hits), key=headers.index)
    if df.empty:
        # Only headers matched; keep the (empty) data-row count intact.
        return [ordered] + [[] for _ in data_rows]
    return [ordered] + df[ordered].values.tolist()
def get_tf_wave_search_data(tf_search_term: str = None) -> dict:
    """
    Load searchtfwaves.xlsx and return {wave column -> gene list}.

    With a search term, only genes containing the term (case-insensitive)
    are returned and waves with no match are omitted.  Columns are
    normalized to Wave1..Wave7 when possible.  Returns {} on any error.
    """
    try:
        cache_key = f"wave_search_{tf_search_term}"
        hit = _get_cached_data(cache_key)
        if hit is not None:
            return hit
        file_path = os.path.join(WAVE_ANALYSIS_PATH, "searchtfwaves.xlsx")
        if not os.path.exists(file_path):
            print(f"Error: TF wave search file not found at {file_path}")
            return {}
        df = pd.read_excel(file_path)
        expected_cols = [f"Wave{i}" for i in range(1, 8)]
        if not all(col in df.columns for col in expected_cols):
            if len(df.columns) >= 7:
                # Assume the first seven columns are the waves; rename and
                # keep only those.
                df = df.rename(columns=dict(zip(df.columns[:7], expected_cols)))[expected_cols]
            else:
                print(f"Warning: {file_path} does not have at least 7 columns to map to Wave1-7.")
        df = df.fillna('')
        if tf_search_term and tf_search_term.strip():
            needle = tf_search_term.strip().lower()
            result = {}
            for col in df.columns:
                matches = [gene for gene in df[col] if needle in str(gene).lower()]
                if matches:
                    result[col] = matches
        else:
            result = {col: df[col].tolist() for col in df.columns}
        _set_cached_data(cache_key, result)
        return result
    except Exception as e:
        print(f"Error reading or processing wave search data: {e}")
        return {}
def get_tf_correlation_data(tf_name: str = None) -> list:
    """
    Load TF-TFcorTRMTEX.xlsx as [headers] + rows.  When *tf_name* is
    given, keep only rows whose "TF Name" column (or the first column,
    as a fallback) equals it case-insensitively.

    Cached via the module TTL cache.  Returns [] on any error.
    """
    try:
        cache_key = f"correlation_{tf_name}"
        hit = _get_cached_data(cache_key)
        if hit is not None:
            return hit
        file_path = os.path.join(TF_CORR_PATH, "TF-TFcorTRMTEX.xlsx")
        if not os.path.exists(file_path):
            print(f"Error: TF correlation data file not found at {file_path}")
            return []
        df = pd.read_excel(file_path).fillna('')
        headers = list(df.columns)
        if tf_name and tf_name.strip():
            name_col = "TF Name"
            if name_col not in df.columns:
                if len(df.columns) == 0:
                    return [headers]
                name_col = df.columns[0]  # fall back to the first column
            wanted = tf_name.strip().lower()
            subset = df[df[name_col].astype(str).str.lower() == wanted]
            result = [headers] + subset.values.tolist() if not subset.empty else [headers]
        else:
            result = [headers] + df.values.tolist()
        _set_cached_data(cache_key, result)
        return result
    except Exception as e:
        print(f"Error processing correlation data: {e}")
        return []
def get_tf_correlation_image_path(tf_name: str) -> str:
    """
    Look up the merged correlation-graph image for *tf_name* in
    TF-TFcorTRMTEX.xlsx and return it as a web path rooted at "www/".
    Returns "" when the file, the image column, or the TF is not found.
    """
    file_path = os.path.join(TF_CORR_PATH, "TF-TFcorTRMTEX.xlsx")
    if not os.path.exists(file_path):
        print(f"Error: TF correlation data file (for image path) not found at {file_path} (Resolved: {os.path.abspath(file_path)})")
        return ""
    image_column_name = "TF Merged Graph Path"
    tf_identifier_column = "TF Name"
    try:
        df = pd.read_excel(file_path)
        if tf_identifier_column not in df.columns:
            if len(df.columns) == 0:
                return ""
            tf_identifier_column = df.columns[0]  # fall back to first column
        if image_column_name not in df.columns:
            print(f"Error: Image path column '{image_column_name}' not found in {file_path} (Resolved: {os.path.abspath(file_path)})")
            return ""
        wanted = tf_name.strip().lower()
        matches = df[df[tf_identifier_column].astype(str).str.lower() == wanted]
        if matches.empty:
            return ""
        value = matches.iloc[0][image_column_name]
        if not (pd.notna(value) and isinstance(value, str)):
            return ""
        # Normalize to a web path under "www/".
        if value.startswith("www/"):
            return str(value)
        return "www/" + value.lstrip('/')
    except FileNotFoundError:
        print(f"Error: File not found at {file_path} (Resolved: {os.path.abspath(file_path)})")
        return ""
    except Exception as e:
        print(f"Error processing {file_path} for TF {tf_name}: {e} (Resolved: {os.path.abspath(file_path)})")
        return ""
def list_all_tfs_in_correlation_data() -> list:
    """
    Return the unique TF names found in the TF-TF correlation workbook,
    whitespace-stripped, with blank and 'nan' entries removed.
    Returns [] when the file is missing or unreadable.
    """
    file_path = os.path.join(TF_CORR_PATH, "TF-TFcorTRMTEX.xlsx")
    if not os.path.exists(file_path):
        print(f"Error: TF correlation data file (for listing TFs) not found at {file_path} (Resolved: {os.path.abspath(file_path)})")
        return []
    tf_identifier_column = "TF Name"
    try:
        df = pd.read_excel(file_path)
        if tf_identifier_column not in df.columns:
            if df.empty or len(df.columns) == 0:
                return []
            tf_identifier_column = df.columns[0]  # fall back to first column
        names = df[tf_identifier_column].astype(str).str.strip().unique().tolist()
        return [name for name in names if name and name.lower() != 'nan']
    except FileNotFoundError:
        print(f"Error: File not found at {file_path} (Resolved: {os.path.abspath(file_path)})")
        return []
    except Exception as e:
        print(f"Error processing {file_path}: {e} (Resolved: {os.path.abspath(file_path)})")
        return []
def get_tf_community_sheet_data(community_type: str) -> list:
    """
    Return the raw contents of the TRM or TEXterm community workbook.

    *community_type* must be 'trm' or 'texterm' (case-insensitive).
    Cached via the module TTL cache.  Returns [] on error or for an
    unknown community type.
    """
    try:
        cache_key = f"community_{community_type}"
        hit = _get_cached_data(cache_key)
        if hit is not None:
            return hit
        workbook_names = {"trm": "trmcommunities.xlsx", "texterm": "texcommunities.xlsx"}
        file_name = workbook_names.get(community_type.lower())
        if file_name is None:
            print(f"Error: Invalid community_type '{community_type}'. Must be 'trm' or 'texterm'.")
            return []
        file_path = os.path.join(TF_COMMUNITIES_PATH, file_name)
        if not os.path.exists(file_path):
            print(f"Error: TF community file not found at {file_path}")
            return []
        result = get_raw_excel_data(file_path)
        _set_cached_data(cache_key, result)
        return result
    except Exception as e:
        print(f"Error processing community data: {e}")
        return []
# --- Mappings for static info tools ---
# STATIC_IMAGE_PATHS stores paths relative to the project root, typically starting with "www/"
# These are for constructing URLs or for components that expect paths relative to the web server root ('www').
# When accessed from tools/agent_tools.py, os.path.join(BASE_WWW_PATH, ...) correctly points to the file system location.
# The get_static_image_path tool should return the "web path" e.g. "www/images/logo.png"
_STATIC_IMAGE_WEB_PATHS = {
    # Home-page diagram, institutional logos, and modal figures.
    "home_page_diagram": "www/homedesc.png",
    "ucsd_logo": "www/ucsdlogo.png",
    "salk_logo": "www/salklogo.png",
    "unc_logo": "www/unclogo.jpg",
    "modal_cs_description_img": "www/csdescrip.jpeg",
    # TF catalog overview figures.
    "tfcat_overview_img": "www/tfcat/onlycellstates.png",
    "tfcat_multistates_heatmap": "www/tfcat/multistatesheatmap.png",
    # Per-cell-state bubble plots.  Keys containing "bubble_plot" are
    # enumerated by list_available_cell_state_bubble_plots().
    "naive_bubble_plot": "www/bubbleplots/naivebubble.jpg",
    "te_bubble_plot": "www/bubbleplots/tebubble.jpg",
    "mp_bubble_plot": "www/bubbleplots/mpbubble.jpg",
    "tcm_bubble_plot": "www/bubbleplots/tcmbubble.jpg",
    "tem_bubble_plot": "www/bubbleplots/tembubble.jpg",
    "trm_bubble_plot": "www/bubbleplots/trmbubble.jpg",
    "texprog_bubble_plot": "www/bubbleplots/texprogbubble.jpg",
    "texefflike_bubble_plot": "www/bubbleplots/texintbubble.jpg",
    "texterm_bubble_plot": "www/bubbleplots/textermbubble.jpg",
    # Wave-analysis figures; consumed by list_available_wave_analysis_assets().
    "wave_analysis_overview_diagram": "www/waveanalysis/tfwaveanal.png",
    "wave1_main_img": "www/waveanalysis/c1.jpg",
    "wave1_gokegg_img": "www/waveanalysis/c1_selected_GO_KEGG.jpg",
    # Wave 1 is the only wave with two ranked-text images.
    "wave1_ranked_text1_img": "www/waveanalysis/txtJPG/c1_ranked_1.jpg",
    "wave1_ranked_text2_img": "www/waveanalysis/txtJPG/c1_ranked_2.jpg",
    "wave2_main_img": "www/waveanalysis/c2.jpg",
    # NOTE: the wave-2 GO/KEGG key points at the *_v2.jpg revision of the file.
    "wave2_gokegg_img": "www/waveanalysis/c2_selected_GO_KEGG_v2.jpg",
    "wave2_ranked_text_img": "www/waveanalysis/txtJPG/c2_ranked.jpg",
    # ... (add all other wave images similarly, ensuring paths start with "www/") ...
    "wave3_main_img": "www/waveanalysis/c3.jpg",
    "wave3_gokegg_img": "www/waveanalysis/c3_selected_GO_KEGG.jpg",
    "wave3_ranked_text_img": "www/waveanalysis/txtJPG/c3_ranked.jpg",
    "wave4_main_img": "www/waveanalysis/c4.jpg",
    "wave4_gokegg_img": "www/waveanalysis/c4_selected_GO_KEGG.jpg",
    "wave4_ranked_text_img": "www/waveanalysis/txtJPG/c4_ranked.jpg",
    "wave5_main_img": "www/waveanalysis/c5.jpg",
    "wave5_gokegg_img": "www/waveanalysis/c5_selected_GO_KEGG.jpg",
    "wave5_ranked_text_img": "www/waveanalysis/txtJPG/c5_ranked.jpg",
    "wave6_main_img": "www/waveanalysis/c6.jpg",
    "wave6_gokegg_img": "www/waveanalysis/c6_selected_GO_KEGG.jpg",
    "wave6_ranked_text_img": "www/waveanalysis/txtJPG/c6_ranked.jpg",
    "wave7_main_img": "www/waveanalysis/c7.jpg",
    "wave7_gokegg_img": "www/waveanalysis/c7_selected_GO_KEGG.jpg",
    "wave7_ranked_text_img": "www/waveanalysis/txtJPG/c7_ranked.jpg",
    # Network-analysis figures.
    "network_correlation_desc_img": "www/networkanalysis/tfcorrdesc.png",
    "network_community_overview_img": "www/networkanalysis/community.jpg",
    "network_trmtex_community_comparison_img": "www/networkanalysis/trmtexcom.png",
    "network_community_pathway_img": "www/networkanalysis/tfcompathway.png",
}
# Lazily-populated cache of the parsed ui_texts.json contents.
_loaded_ui_texts = None
def _load_ui_texts():
    """Load ui_texts.json once and return the parsed dict; {} on failure."""
    global _loaded_ui_texts
    if _loaded_ui_texts is not None:
        return _loaded_ui_texts
    try:
        # UI_TEXTS_FILE sits next to this module in the tools/ directory.
        with open(UI_TEXTS_FILE, 'r', encoding='utf-8') as fh:
            _loaded_ui_texts = json.load(fh)
    except FileNotFoundError:
        print(f"Error: UI texts file not found at {os.path.abspath(UI_TEXTS_FILE)}")
        _loaded_ui_texts = {}
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {UI_TEXTS_FILE}")
        _loaded_ui_texts = {}
    except Exception as e:
        print(f"An unexpected error occurred while loading {UI_TEXTS_FILE}: {e}")
        _loaded_ui_texts = {}
    return _loaded_ui_texts
# Maps internal UI link IDs (as emitted by front-end links/buttons) to a
# human-readable description of the tab each one navigates to.  Looked up
# by get_internal_navigation_info().
INTERNAL_NAVIGATION_TARGETS = {
    "to_tfcat": "Navigates to the 'TF Catalog > Search TF Scores' tab.",
    "to_tfwave": "Navigates to the 'TF Wave Analysis > Overview' tab.",
    "to_tfnet": "Navigates to the 'TF Network Analysis > Search TF-TF correlation in TRM/TEXterm' tab.",
    # One entry per wave tab, c1..c7.
    "c1_link": "Navigates to the 'TF Wave Analysis > Wave 1' tab.",
    "c2_link": "Navigates to the 'TF Wave Analysis > Wave 2' tab.",
    "c3_link": "Navigates to the 'TF Wave Analysis > Wave 3' tab.",
    "c4_link": "Navigates to the 'TF Wave Analysis > Wave 4' tab.",
    "c5_link": "Navigates to the 'TF Wave Analysis > Wave 5' tab.",
    "c6_link": "Navigates to the 'TF Wave Analysis > Wave 6' tab.",
    "c7_link": "Navigates to the 'TF Wave Analysis > Wave 7' tab.",
}
def get_static_image_path(image_identifier: str) -> str:
    """
    Map a known static-image identifier to its web path (e.g.
    "www/homedesc.png").  Lookup is case-insensitive; unknown
    identifiers yield "".
    """
    key = image_identifier.lower()
    return _STATIC_IMAGE_WEB_PATHS.get(key, "")
def get_ui_descriptive_text(text_identifier: str) -> str:
    """
    Return a predefined descriptive/methodology text from ui_texts.json.

    Identifiers shaped like "wave_<n>_analysis_placeholder_details" are
    served from the generic "wave_x_analysis_placeholder_details"
    template, with "{X}" substituted by the wave number.  Unknown
    identifiers yield "".
    """
    texts = _load_ui_texts()
    key = text_identifier.lower()
    is_wave_placeholder = key.startswith("wave_") and key.endswith("_analysis_placeholder_details")
    if is_wave_placeholder:
        try:
            template_key = "wave_x_analysis_placeholder_details"
            if template_key in texts:
                wave_number = key.split("_")[1]
                return texts[template_key].replace("{X}", wave_number)
            return texts.get(key, "")
        except Exception as e:
            print(f"Error processing placeholder for {text_identifier}: {e}")
            return texts.get(key, "")
    return texts.get(key, "")
def list_available_tf_catalog_datasets() -> list:
    """
    Identifiers of the TF catalog datasets understood by
    get_processed_tf_data (and get_tf_catalog_dataset_path).
    """
    cell_states = [
        "Naive", "TE", "MP", "TCM", "TEM", "TRM",
        "TEXprog", "TEXeff", "TEXterm",
    ]
    return ["Overall_TF_PageRank"] + cell_states
def get_tf_catalog_dataset_path(dataset_identifier: str) -> str:
    """
    Resolve a TF catalog dataset identifier (case-insensitive) to the
    path of its workbook under TF_PAGERANK_PATH; "" when unknown.
    Mostly an internal helper for get_processed_tf_data.
    """
    workbook_names = {
        "overall_tf_pagerank": "Table_TF PageRank Scores for Audrey.xlsx",
        "naive": "Naive.xlsx",
        "te": "TE.xlsx",
        "mp": "MP.xlsx",
        "tcm": "TCM.xlsx",
        "tem": "TEM.xlsx",
        "trm": "TRM.xlsx",
        "texprog": "TEXprog.xlsx",
        "texeff": "TEXeff.xlsx",
        "texterm": "TEXterm.xlsx",
    }
    file_name = workbook_names.get(dataset_identifier.lower())
    return os.path.join(TF_PAGERANK_PATH, file_name) if file_name else ""
def list_available_cell_state_bubble_plots() -> list:
    """
    Identifiers of the per-cell-state bubble-plot images; each can be
    resolved to a web path with get_static_image_path.
    """
    plots = []
    for identifier in _STATIC_IMAGE_WEB_PATHS:
        if "bubble_plot" in identifier:
            plots.append(identifier)
    return plots
def list_available_wave_analysis_assets(wave_number: int) -> dict:
    """
    Return the static-image identifiers available for a TF wave (1-7).

    Keys (each present only when the identifier actually exists in
    _STATIC_IMAGE_WEB_PATHS):
      - "main_image_id": the wave's main figure
      - "gokegg_image_id": the GO/KEGG enrichment figure
      - "ranked_text_image_ids": ranked-text figures (wave 1 has two,
        waves 2-7 have one)

    Returns {} for wave numbers outside 1-7.
    """
    if not 1 <= wave_number <= 7:
        return {}
    assets = {
        "main_image_id": f"wave{wave_number}_main_img",
        "gokegg_image_id": f"wave{wave_number}_gokegg_img",
    }
    # FIX: removed a no-op wave-2 special case — the "wave2_gokegg_img"
    # table entry already points at the *_v2.jpg file, so no key
    # adjustment is needed here.
    if wave_number == 1:
        assets["ranked_text_image_ids"] = [
            "wave1_ranked_text1_img",
            "wave1_ranked_text2_img",
        ]
    else:  # Waves 2-7 have a single ranked-text image.
        assets["ranked_text_image_ids"] = [f"wave{wave_number}_ranked_text_img"]
    # Drop any identifier that is not actually registered.
    if assets["main_image_id"] not in _STATIC_IMAGE_WEB_PATHS:
        assets["main_image_id"] = None
    if assets["gokegg_image_id"] not in _STATIC_IMAGE_WEB_PATHS:
        assets["gokegg_image_id"] = None
    ranked = [r for r in assets["ranked_text_image_ids"] if r in _STATIC_IMAGE_WEB_PATHS]
    if ranked:
        assets["ranked_text_image_ids"] = ranked
    else:
        del assets["ranked_text_image_ids"]
    return {key: value for key, value in assets.items() if value is not None}
def get_internal_navigation_info(link_id: str) -> str:
    """
    Describe where an internal UI link navigates.  Lookup is
    case-insensitive; unknown link IDs get a fixed fallback message.
    """
    target = INTERNAL_NAVIGATION_TARGETS.get(link_id.lower())
    if target is None:
        return "Navigation target not defined for this link ID."
    return target
def get_biorxiv_paper_url() -> str:
    """Return the DOI URL of the project's main bioRxiv paper."""
    paper_doi_url = "https://doi.org/10.1101/2023.01.03.522354"
    return paper_doi_url
# --- New Tool for Schema Discovery ---
def discover_excel_files_and_schemas(base_scan_directory_name: str = "www") -> dict:
    """
    Walk <project root>/<base_scan_directory_name> for .xlsx workbooks
    and return {relative file path: schema dict}.

    Each schema dict holds: "file_path" (relative to project root, forward
    slashes), "table_identifier" (filename sanitized for use as an
    identifier), "columns" (header strings of the first sheet), "sheets"
    (all sheet names), "last_modified" (file mtime, ISO format),
    "file_size_bytes", and "error" (None on success, message otherwise).
    Returns {} when the base directory does not exist.
    """
    discovered_schema = {}
    scan_root_abs = os.path.join(_PROJECT_ROOT, base_scan_directory_name)
    if not os.path.isdir(scan_root_abs):
        print(f"Error: Base directory for schema discovery not found: {scan_root_abs}")
        return {}
    for dirpath, _, filenames in os.walk(scan_root_abs):
        for filename in filenames:
            # Skip non-Excel files and Excel's "~$..." lock/temp files.
            if not filename.endswith(".xlsx") or filename.startswith("~"):
                continue
            file_abs_path = os.path.join(dirpath, filename)
            # FIX: normalize Windows separators with a single backslash —
            # the old replace("\\\\", "/") looked for a doubled backslash,
            # which os.path.relpath never produces, so paths stayed
            # un-normalized (compare list_all_files_in_www_directory).
            file_rel_path = os.path.relpath(file_abs_path, _PROJECT_ROOT).replace("\\", "/")
            table_identifier = os.path.splitext(filename)[0].replace("-", "_").replace(" ", "_")
            try:
                # Open the workbook once; read zero data rows to get headers
                # of the first sheet cheaply.
                xls = pd.ExcelFile(file_abs_path)
                if not xls.sheet_names:
                    print(f"[Schema Discovery] Warning: No sheets found in {file_abs_path}")
                    columns = []
                else:
                    df_header = pd.read_excel(xls, sheet_name=xls.sheet_names[0], nrows=0)
                    columns = [str(col) for col in df_header.columns.tolist()]
                discovered_schema[file_rel_path] = {
                    "file_path": file_rel_path,
                    "table_identifier": table_identifier,
                    "columns": columns,
                    "sheets": xls.sheet_names if xls.sheet_names else [],
                    # FIX: report the file's actual mtime; the old code
                    # stored the scan time under this key.
                    "last_modified": datetime.fromtimestamp(os.path.getmtime(file_abs_path)).isoformat(),
                    "file_size_bytes": os.path.getsize(file_abs_path),
                    "error": None,
                }
            except Exception as e:
                print(f"[Schema Discovery] Error reading or processing headers for {file_abs_path}: {e}")
                discovered_schema[file_rel_path] = {
                    "file_path": file_rel_path,
                    "table_identifier": table_identifier,
                    "columns": [],
                    "sheets": [],
                    "error": str(e),
                }
    if not discovered_schema:
        print(f"[Schema Discovery] No Excel files found in {scan_root_abs}")
    return discovered_schema
# --- New Tool for Listing All Files in WWW ---
def list_all_files_in_www_directory() -> list:
    """
    Walk BASE_WWW_PATH and return one manifest entry per file:
    {"path": path relative to the project root (forward slashes),
     "type": best-guess MIME type, "size": size in bytes,
     "last_modified": file mtime (ISO format), "error": None or message}.

    Common hidden/system files (.DS_Store, Thumbs.db, macOS "._*"
    resource forks) are skipped.  Returns [] when the www directory
    does not exist.
    """
    file_manifest = []
    if not os.path.isdir(BASE_WWW_PATH):
        print(f"Error: WWW directory for file listing not found: {BASE_WWW_PATH}")
        return []
    ignore_names = {".DS_Store", "Thumbs.db"}
    ignore_prefixes = ("._",)  # macOS resource-fork files
    # Fallback MIME types for extensions mimetypes may not resolve.
    fallback_types = {
        ".txt": "text/plain",
        ".md": "text/plain",
        ".csv": "text/csv",
        ".json": "application/json",
    }
    if not mimetypes.inited:
        mimetypes.init()  # one-time init, hoisted out of the walk loop
    for dirpath, dirnames, filenames in os.walk(BASE_WWW_PATH):
        for filename in filenames:
            if filename in ignore_names or filename.startswith(ignore_prefixes):
                continue
            file_abs_path = os.path.join(dirpath, filename)
            file_rel_path_from_project_root = os.path.relpath(file_abs_path, _PROJECT_ROOT).replace("\\", "/")
            try:
                file_size = os.path.getsize(file_abs_path)
                mime_type, _ = mimetypes.guess_type(file_abs_path)
                if mime_type is None:
                    ext = os.path.splitext(filename)[1].lower()
                    mime_type = fallback_types.get(ext, "application/octet-stream")
                file_manifest.append({
                    "path": file_rel_path_from_project_root,
                    "type": mime_type,
                    "size": file_size,
                    # FIX: report the file's actual mtime; the old code
                    # stored the time the scan ran under this key.
                    "last_modified": datetime.fromtimestamp(os.path.getmtime(file_abs_path)).isoformat(),
                    "error": None,
                })
            except FileNotFoundError:
                # Raced with deletion between os.walk and the stat calls.
                print(f"[File Manifest] Warning: File {file_abs_path} found by os.walk but then not accessible for size/type.")
                continue
            except Exception as e:
                print(f"[File Manifest] Error processing file {file_abs_path}: {e}")
                file_manifest.append({
                    "path": file_rel_path_from_project_root,
                    "type": "unknown/error",
                    "size": 0,
                    "last_modified": datetime.now().isoformat(),
                    "error": str(e),
                })
    return file_manifest
# --- START: Literature Search Tool Implementation ---
def _normalize_authors(authors_data, source="Unknown"):
"""Helper to normalize author lists from different APIs."""
if not authors_data:
return ["N/A"]
if source == "SemanticScholar": # List of dicts with 'name' key
return [author.get('name', "N/A") for author in authors_data]
if source == "PubMed": # List of strings
return authors_data
if source == "ArXiv": # List of arxiv.Result.Author objects
return [author.name for author in authors_data]
return [str(a) for a in authors_data] # Generic fallback
def _search_semanticscholar_internal(query: str, max_results: int = 2) -> list[dict]:
papers = []
# print(f"[Tool:_search_semanticscholar_internal] Querying Semantic Scholar for: '{query}' (max: {max_results})") # COMMENTED OUT
try:
s2 = SemanticScholar(timeout=15)
# Corrected: 'doi' is not a direct field for search_paper, 'externalIds' should be used.
results = s2.search_paper(query, limit=max_results, fields=['title', 'authors', 'year', 'abstract', 'url', 'venue', 'externalIds'])
if results and results.items:
for item in results.items:
doi_val = item.externalIds.get('DOI') if item.externalIds else None
papers.append({
"title": getattr(item, 'title', "N/A"),
"authors": _normalize_authors(getattr(item, 'authors', []), "SemanticScholar"),
"year": getattr(item, 'year', "N/A"),
"abstract": getattr(item, 'abstract', "N/A")[:500] + "..." if getattr(item, 'abstract', None) else "N/A",
"doi": doi_val, # Use the extracted DOI
"url": getattr(item, 'url', "N/A"),
"venue": getattr(item, 'venue', "N/A"),
"source_api": "Semantic Scholar"
})
except Exception as e:
# This print goes to stderr if run directly, but might still be captured by a simple exec context.
# For agent integration, actual errors should be raised or returned structured.
# For now, we'll assume ManagerAgent's error handling for the overall tool call is preferred.
# Let's comment this out for now to ensure no stdout interference.
# print(f"[Tool:_search_semanticscholar_internal] Error: {e}", file=sys.stderr)
pass # Allow the function to return an empty list on error.
return papers
def _search_pubmed_internal(query: str, max_results: int = 2) -> list[dict]:
    """
    Query PubMed (via Biopython's Entrez) for *query* and return up to
    *max_results* normalized paper dicts with keys title/authors/year/
    abstract/doi/url/venue/source_api.

    All errors are swallowed and yield [] so that one failing source does
    not abort a multi-source literature search.
    NOTE(review): Entrez.email is not set anywhere visible in this file —
    NCBI asks clients to identify themselves; confirm the caller
    configures it.
    """
    papers = []
    # print(f"[Tool:_search_pubmed_internal] Querying PubMed for: '{query}' (max: {max_results})") # COMMENTED OUT
    try:
        # Step 1: esearch returns the PMIDs matching the query, by relevance.
        handle = Entrez.esearch(db="pubmed", term=query, retmax=str(max_results), sort="relevance")
        record = Entrez.read(handle)
        handle.close()
        ids = record["IdList"]
        if not ids:
            return papers
        # Step 2: efetch retrieves the full MEDLINE records for those PMIDs.
        handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="xml")
        records = Entrez.read(handle) # This is MedlineParser.parse, returns a generator usually
        handle.close()
        for pubmed_article in records.get('PubmedArticle', []): # records is a list of dicts if multiple ids
            # Navigate the nested MEDLINE structure defensively with .get().
            article = pubmed_article.get('MedlineCitation', {}).get('Article', {})
            title = article.get('ArticleTitle', "N/A")
            abstract_text_list = article.get('Abstract', {}).get('AbstractText', [])
            # Abstracts arrive as a list of section strings; join and truncate.
            abstract = " ".join(abstract_text_list)[:500] + "..." if abstract_text_list else "N/A"
            year = article.get('Journal', {}).get('JournalIssue', {}).get('PubDate', {}).get('Year', "N/A")
            authors_list = []
            author_info_list = article.get('AuthorList', [])
            for auth in author_info_list:
                # Prefer "ForeName LastName"; fall back to group authorship.
                if auth.get('LastName') and auth.get('ForeName'):
                    authors_list.append(f"{auth.get('ForeName')} {auth.get('LastName')}")
                elif auth.get('CollectiveName'):
                    authors_list.append(auth.get('CollectiveName'))
            # The DOI lives in PubmedData's ArticleIdList, tagged IdType="doi".
            doi = None
            article_ids = pubmed_article.get('PubmedData', {}).get('ArticleIdList', [])
            for aid in article_ids:
                if aid.attributes.get('IdType') == 'doi':
                    doi = str(aid) # The content of the tag is the DOI
                    break
            pmid = pubmed_article.get('MedlineCitation', {}).get('PMID', None)
            url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else "N/A"
            venue = article.get('Journal', {}).get('Title', "N/A")
            papers.append({
                "title": title,
                "authors": _normalize_authors(authors_list, "PubMed"),
                "year": year,
                "abstract": abstract,
                "doi": doi,
                "url": url,
                "venue": venue,
                "source_api": "PubMed"
            })
            if len(papers) >= max_results: # Ensure we don't exceed due to structure of efetch
                break
    except Exception as e:
        # print(f"[Tool:_search_pubmed_internal] Error: {e}", file=sys.stderr) # COMMENTED OUT
        # Deliberately silent: an empty list means "no results from this source".
        pass
    return papers
def _search_arxiv_internal(query: str, max_results: int = 2) -> list[dict]:
papers = []
# print(f"[Tool:_search_arxiv_internal] Querying ArXiv for: '{query}' (max: {max_results})") # COMMENTED OUT
try:
search = arxiv.Search(
query = query,
max_results = max_results,
sort_by = arxiv.SortCriterion.Relevance
)
results = list(arxiv.Client().results(search)) # Convert generator to list
for result in results:
papers.append({
"title": getattr(result, 'title', "N/A"),
"authors": _normalize_authors(getattr(result, 'authors', []), "ArXiv"),
"year": getattr(result, 'published').year if getattr(result, 'published', None) else "N/A",
"abstract": getattr(result, 'summary', "N/A").replace('\n', ' ')[:500] + "...", # ArXiv abstracts can have newlines
"doi": getattr(result, 'doi', None),
"url": getattr(result, 'entry_id', "N/A"), # entry_id is the ArXiv URL like http://arxiv.org/abs/xxxx.xxxxx
"venue": "ArXiv", # ArXiv is the venue
"source_api": "ArXiv"
})
except Exception as e:
# print(f"[Tool:_search_arxiv_internal] Error: {e}", file=sys.stderr) # COMMENTED OUT
pass
return papers
def multi_source_literature_search(queries: list[str], max_results_per_query_per_source: int = 1, max_total_unique_papers: int = 10) -> list[dict]:
    """Run each query against Semantic Scholar, PubMed and ArXiv and merge results.

    Papers are de-duplicated primarily by DOI (case-insensitive) and, when no
    DOI is available, by a "title|first-author" key (or bare title when the
    author is unknown). Processing stops as soon as
    ``max_total_unique_papers`` unique papers have been collected.

    Args:
        queries: Search strings to issue, in order.
        max_results_per_query_per_source: Cap passed to each source per query.
        max_total_unique_papers: Hard cap on the merged, de-duplicated output.

    Returns:
        A list of at most ``max_total_unique_papers`` unique paper dicts.
    """
    collected: list[dict] = []
    seen_dois: set[str] = set()
    seen_title_keys: set[str] = set()

    def _is_unseen(paper: dict) -> bool:
        """Record the paper's dedup key; return True only on first sighting."""
        doi = paper.get("doi")
        if doi and doi != "N/A":
            key = doi.lower().strip()
            if key in seen_dois:
                return False
            seen_dois.add(key)
            return True
        # No usable DOI: fall back to title (+ first author when available).
        title = paper.get("title", "").lower().strip()
        authors = paper.get("authors", [])
        first_author = authors[0].lower().strip() if authors and authors[0] != "N/A" else ""
        if title and first_author:
            key = f"{title}|{first_author}"
        elif title:
            key = title
        else:
            # Neither DOI nor title: cannot identify the paper, drop it.
            return False
        if key in seen_title_keys:
            return False
        seen_title_keys.add(key)
        return True

    for query_str in queries:
        if len(collected) >= max_total_unique_papers:
            break
        batch: list[dict] = []
        batch.extend(_search_semanticscholar_internal(query_str, max_results_per_query_per_source))
        batch.extend(_search_pubmed_internal(query_str, max_results_per_query_per_source))
        batch.extend(_search_arxiv_internal(query_str, max_results_per_query_per_source))
        for paper in batch:
            if len(collected) >= max_total_unique_papers:
                break
            if _is_unseen(paper):
                collected.append(paper)

    return collected[:max_total_unique_papers]
# --- END: Literature Search Tool Implementation ---
# --- START: Text Fetching from URLs Tool Implementation ---
def fetch_text_from_urls(paper_info_list: list[dict], max_chars_per_paper: int = 15000) -> list[dict]:
    """Download each paper's URL and attach extracted page text to its dict.

    Each input dict is mutated in place: a 'retrieved_text_content' key is
    added holding either the extracted text (capped at ``max_chars_per_paper``
    characters) or a string starting with "Error:" describing the failure.

    Args:
        paper_info_list: Paper dicts as produced by the literature search;
            only the 'url' key is consulted here.
        max_chars_per_paper: Truncation limit for extracted text.

    Returns:
        The same dicts, in order, each with 'retrieved_text_content' set.
    """
    updated_paper_info_list = []
    # A browser-like User-Agent: many publisher sites reject default clients.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    for paper in paper_info_list:
        url = paper.get("url")
        # Guard clause: no request for missing/non-http URLs.
        if not url or not isinstance(url, str) or not url.startswith("http"):
            paper["retrieved_text_content"] = "Error: Invalid or missing URL."
            updated_paper_info_list.append(paper)
            continue
        retrieved_text = None
        try:
            response = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
            response.raise_for_status()  # Raise for HTTP error statuses
            soup = BeautifulSoup(response.content, 'html.parser')
            body_content = soup.find('body')
            if body_content:
                # Strip script/style noise before text extraction.
                for script_or_style in body_content(["script", "style"]):
                    script_or_style.decompose()
                # Prefer common article-body containers; fall back to whole body.
                # These selectors will need refinement for specific sites
                # (e.g. arXiv, PubMed Central).
                main_article_tags = ['article', 'main', '.main-content', '.article-body', '.abstract']
                extracted_elements = []
                for tag_selector in main_article_tags:
                    elements = body_content.select(tag_selector)
                    if elements:
                        extracted_elements = [el.get_text(separator=" ", strip=True) for el in elements]
                        break  # First matching selector is assumed good enough
                if extracted_elements:
                    retrieved_text = " ".join(extracted_elements)
                else:
                    retrieved_text = body_content.get_text(separator=" ", strip=True)
            else:
                retrieved_text = "Error: Could not find body content in HTML."
            if retrieved_text and not retrieved_text.startswith("Error:"):
                # Bug fix: check length BEFORE slicing so text that is exactly
                # max_chars_per_paper long is not falsely marked as truncated.
                if len(retrieved_text) > max_chars_per_paper:
                    retrieved_text = retrieved_text[:max_chars_per_paper] + "... (truncated)"
            elif not retrieved_text:
                retrieved_text = "Error: No text could be extracted."
        except requests.exceptions.RequestException as e:
            retrieved_text = f"Error fetching URL: {str(e)}"
        except Exception as e:
            retrieved_text = f"Error processing HTML: {str(e)}"
        paper["retrieved_text_content"] = retrieved_text
        updated_paper_info_list.append(paper)
        # Optional: add a small delay between requests if fetching from many URLs
        # time.sleep(0.25)
    return updated_paper_info_list
# --- END: Text Fetching from URLs Tool Implementation ---
# Example of how GenerationAgent would call this tool:
# Assume 'list_of_papers_from_search' is the output from multi_source_literature_search
# print(json.dumps({'intermediate_data_for_llm': fetch_text_from_urls(paper_info_list=list_of_papers_from_search, max_chars_per_paper=10000)}))
def describe_image(file_id: str, api_key: str = None) -> str:
    """Describe an uploaded image via OpenAI's gpt-4o vision model.

    Args:
        file_id: Either an OpenAI file ID ("file-..."), a data URL
            ("data:image/..."), or a raw ID that will be prefixed with
            "file-" as a best-effort recovery.
        api_key: OpenAI API key; falls back to the OPENAI_API_KEY
            environment variable when omitted.

    Returns:
        The model's textual description, or a string starting with "Error"
        describing what went wrong (this function never raises).
    """
    try:
        # Import locally so a missing package degrades to an error string
        # instead of breaking module import.
        try:
            from openai import OpenAI
        except ImportError:
            return "Error: The OpenAI module is not installed. Please install it with 'pip install openai'."
        if not file_id or not isinstance(file_id, str):
            return "Error: Invalid file ID format."
        # Determine the image_url payload based on the file_id format.
        if file_id.startswith("data:image/"):
            image_url_to_use = {"url": file_id}
        elif file_id.startswith("file-"):
            image_url_to_use = {"url": file_id}  # Might work for actual images
        elif len(file_id) > 10 and " " not in file_id:
            # Looks like a raw OpenAI file ID whose "file-" prefix was dropped
            # (a common case when the caller forwards raw user input).
            # NOTE: the redundant startswith("file-") re-check was removed here;
            # this branch is only reached when both prefix checks above failed.
            print(f"describe_image: received potentially raw file ID '{file_id}', prefixing with 'file-'.")
            image_url_to_use = {"url": f"file-{file_id}"}
        else:
            return f"Error: Invalid file_id format. Must be an OpenAI file ID (file-...) or a data URL (data:image/...). Received: {file_id}"
        # Create an OpenAI client with explicit API key if provided.
        if api_key:
            client = OpenAI(api_key=api_key)
        else:
            if not os.environ.get('OPENAI_API_KEY'):
                return "Error: OpenAI API key not found. Please provide an API key or set the OPENAI_API_KEY environment variable."
            client = OpenAI()
        # Prepare the vision prompt with the image attached.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please describe this image in detail. If it contains text, transcribe important parts. If it's a scientific figure, explain what it shows. If it's a chart or graph, describe the data visualization and key insights."},
                    {"type": "image_url", "image_url": image_url_to_use}
                ]
            }
        ]
        response = client.chat.completions.create(
            model="gpt-4o",  # Model with vision capabilities
            messages=messages,
            max_tokens=1000,
            temperature=0.2  # Lower temperature for more accurate descriptions
        )
        # Record token usage in the global collector (set by ExecutorAgent),
        # if one is installed on builtins.
        if hasattr(response, 'usage') and response.usage:
            usage_info = {
                'prompt_tokens': response.usage.prompt_tokens,
                'completion_tokens': response.usage.completion_tokens,
                'total_tokens': response.usage.total_tokens
            }
            import builtins
            if hasattr(builtins, '__agent_usage_collector__'):
                builtins.__agent_usage_collector__.append(usage_info)
        return response.choices[0].message.content
    except Exception as e:
        # Return (not raise) a detailed error message so callers always get a string.
        error_message = f"Error processing image: {str(e)}"
        print(error_message)  # Log for server-side debugging
        return error_message
if __name__ == '__main__':
    # Manual smoke test: run the multi-source literature search, then feed
    # its results through the URL text fetcher and print a short summary.
    print("Testing Multi-Source Literature Search Tool:")
    sample_queries = [
        "novel targets for CAR-T cell therapy in solid tumors",
        "role of microbiota in cancer immunotherapy response",
        "epigenetic regulation of T cell exhaustion",
    ]
    # Fetch only 2 papers so the text-fetch step stays quick.
    results = multi_source_literature_search(
        queries=sample_queries,
        max_results_per_query_per_source=1,
        max_total_unique_papers=2,
    )
    print(f"Found {len(results)} unique papers for text fetching test:")
    if results:
        print("\nTesting Text Fetching from URLs Tool:")
        results_with_text = fetch_text_from_urls(paper_info_list=results, max_chars_per_paper=5000)
        print(f"Processed {len(results_with_text)} papers for text content:")
        for i, paper in enumerate(results_with_text):
            print(f"--- Paper {i+1} ---")
            print(f" Title: {paper.get('title')}")
            print(f" URL: {paper.get('url')}")
            text_content = paper.get('retrieved_text_content', 'Not found')
            print(f" Retrieved Text (first 200 chars): {text_content[:200]}...")
            print("\n")
# End of module. The __main__ block above provides a quick manual smoke test for the tool functions.