# NOTE(review): removed non-Python web-viewer artifact lines (uploader name,
# commit message, hash) that preceded the module docstring and broke parsing.
"""
AWS Athena query interface for data lake access.
Provides methods to execute SQL queries against Athena and retrieve results
as pandas DataFrames.
"""
from typing import Optional, List, Dict, Any
import time
import pandas as pd
import boto3
from botocore.exceptions import ClientError
from urllib.parse import urlparse
import io
from .config import DataLakeConfig
from .logger import setup_logger
logger = setup_logger(__name__)
class AthenaQuery:
    """
    AWS Athena query interface.

    Executes SQL queries against Athena and retrieves results as pandas
    DataFrames. Handles query submission, completion polling, and result
    retrieval. Results are read directly from the query's CSV output in S3
    when possible (fast path), falling back to the paginated
    GetQueryResults API.
    """

    def __init__(self, config: DataLakeConfig):
        """
        Initialize Athena query interface.

        Args:
            config: DataLakeConfig instance with Athena configuration
                (database name, S3 output location, region, optional
                workgroup, and a boto3 session factory).
        """
        self.config = config
        session = config.get_boto3_session()
        self.athena_client = session.client('athena', region_name=config.region)
        self.s3_client = session.client('s3', region_name=config.region)
        # Lazy %-args: the message is only rendered if the level is enabled.
        logger.info("Initialized Athena client for database: %s", config.database_name)

    @staticmethod
    def _escape_sql_literal(value: str) -> str:
        """
        Escape single quotes for safe embedding inside a SQL string literal.

        Athena's StartQueryExecution API (as used here) has no bind
        parameters, so identifiers interpolated into information_schema
        queries must at least have quotes doubled to avoid breaking the
        statement or permitting injection.
        """
        return value.replace("'", "''")

    def execute_query(
        self,
        query: str,
        wait: bool = True,
        timeout: int = 300,
    ) -> Optional[str]:
        """
        Execute SQL query in Athena.

        Args:
            query: SQL query string
            wait: If True, block until the query completes
            timeout: Maximum time to wait for query completion (seconds);
                only meaningful when wait=True
        Returns:
            Query execution ID (immediately if wait=False, or after
            successful completion if wait=True)
        Raises:
            ClientError: If query submission fails
            TimeoutError: If the query exceeds the timeout (wait=True)
            RuntimeError: If the query fails or is cancelled (wait=True)
        """
        start_params: Dict[str, Any] = {
            'QueryString': query,
            'QueryExecutionContext': {'Database': self.config.database_name},
            # OutputLocation belongs in ResultConfiguration, not the context.
            'ResultConfiguration': {'OutputLocation': self.config.s3_output_location},
        }
        # WorkGroup is a top-level StartQueryExecution parameter.
        if self.config.workgroup:
            start_params['WorkGroup'] = self.config.workgroup

        logger.debug("Executing query: %s...", query[:100])
        # Keep the try body minimal: only the API call can raise ClientError.
        try:
            response = self.athena_client.start_query_execution(**start_params)
        except ClientError as e:
            logger.error("Query execution failed: %s", e)
            raise

        execution_id = response['QueryExecutionId']
        logger.info("Query started with execution ID: %s", execution_id)
        if not wait:
            return execution_id
        return self._wait_for_completion(execution_id, timeout)

    def _wait_for_completion(self, execution_id: str, timeout: int = 300) -> str:
        """
        Poll until the query execution reaches a terminal state.

        Args:
            execution_id: Query execution ID
            timeout: Maximum time to wait (seconds)
        Returns:
            Execution ID (on SUCCEEDED)
        Raises:
            TimeoutError: If the query exceeds the timeout
            RuntimeError: If the query fails or is cancelled
        """
        start_time = time.time()
        while True:
            response = self.athena_client.get_query_execution(QueryExecutionId=execution_id)
            state = response['QueryExecution']['Status']['State']
            if state == 'SUCCEEDED':
                logger.info("Query %s completed successfully", execution_id)
                return execution_id
            if state == 'FAILED':
                reason = response['QueryExecution']['Status'].get('StateChangeReason', 'Unknown error')
                logger.error("Query %s failed: %s", execution_id, reason)
                raise RuntimeError(f"Query failed: {reason}")
            if state == 'CANCELLED':
                logger.warning("Query %s was cancelled", execution_id)
                raise RuntimeError("Query was cancelled")
            # Still QUEUED/RUNNING: enforce the timeout, then poll again.
            if time.time() - start_time > timeout:
                raise TimeoutError(f"Query {execution_id} exceeded timeout of {timeout} seconds")
            time.sleep(1)  # Poll every second

    def get_query_results(self, execution_id: str) -> pd.DataFrame:
        """
        Get query results as pandas DataFrame.

        Reads the result CSV directly from S3 when possible, which is far
        faster than paginated API calls for large result sets.

        Args:
            execution_id: Query execution ID
        Returns:
            DataFrame with query results
        Raises:
            ClientError: If results cannot be retrieved by either path
        """
        logger.debug("Retrieving results for execution %s", execution_id)
        try:
            # Fast path: single CSV read from S3.
            return self._get_results_from_s3(execution_id)
        except Exception as e:
            # Deliberately broad: any S3/parse problem falls back to the
            # slower but reliable paginated API path.
            logger.debug("Failed to read from S3, falling back to API: %s", e)
        return self._get_results_from_api(execution_id)

    def _get_results_from_s3(self, execution_id: str) -> pd.DataFrame:
        """
        Get query results directly from the S3 CSV result file.

        Much faster than paginated API calls: one object read, with CSV
        parsing done in pandas' optimized C reader instead of row-by-row
        Python processing.

        Args:
            execution_id: Query execution ID
        Returns:
            DataFrame with query results
        Raises:
            Exception: If the S3 read or CSV parse fails
        """
        # Look up the query's S3 output location.
        response = self.athena_client.get_query_execution(QueryExecutionId=execution_id)
        result_location = response['QueryExecution']['ResultConfiguration']['OutputLocation']

        # Parse S3 URI: s3://bucket/path/to/file.csv
        parsed = urlparse(result_location)
        bucket = parsed.netloc
        key = parsed.path.lstrip('/')
        logger.debug("Reading results from s3://%s/%s", bucket, key)

        obj = self.s3_client.get_object(Bucket=bucket, Key=key)
        csv_content = obj['Body'].read()

        # dtype=str + keep_default_na=False keeps every cell as a plain
        # string (empty string for NULL), matching what the
        # GetQueryResults API returns as VarCharValue.
        df = pd.read_csv(io.BytesIO(csv_content), dtype=str, keep_default_na=False)

        # Cells are already str thanks to dtype=str, so parse types directly
        # (the previous .astype(str) round-trip was redundant).
        for col in df.columns:
            df[col] = df[col].apply(self._parse_value)

        logger.info("Retrieved %d rows from S3 for query %s", len(df), execution_id)
        return df

    def _get_results_from_api(self, execution_id: str) -> pd.DataFrame:
        """
        Get query results using paginated API calls (fallback method).

        Kept for backward compatibility when the direct S3 read fails.

        Args:
            execution_id: Query execution ID
        Returns:
            DataFrame with query results
        Raises:
            ClientError: If results cannot be retrieved
        """
        logger.debug("Using API method for execution %s", execution_id)
        paginator = self.athena_client.get_paginator('get_query_results')
        pages = paginator.paginate(QueryExecutionId=execution_id)

        rows: List[List[Any]] = []
        column_names: Optional[List[str]] = None
        for page in pages:
            result_set = page['ResultSet']
            # Column names come from the metadata of the first page.
            if column_names is None:
                column_names = [col['Name'] for col in result_set['ResultSetMetadata']['ColumnInfo']]
            # First row of the result set is the header; skip it.
            for row in result_set['Rows'][1:]:
                rows.append([self._parse_value(cell.get('VarCharValue', ''))
                             for cell in row['Data']])

        if not rows:
            logger.warning("No results returned for execution %s", execution_id)
            return pd.DataFrame(columns=column_names or [])

        df = pd.DataFrame(rows, columns=column_names)
        logger.info("Retrieved %d rows from query %s", len(df), execution_id)
        return df

    def _parse_value(self, value: str) -> Any:
        """
        Parse a string value from Athena into an appropriate Python type.

        Args:
            value: String value from an Athena result cell
        Returns:
            None for empty/missing values, else int, float, bool, or the
            original string. NOTE: only values containing '.' become
            floats, so scientific notation (e.g. '1e5') stays a string.
        """
        if value == '' or value is None:
            return None
        # Numeric first: '.' selects float, otherwise try int.
        try:
            if '.' in value:
                return float(value)
            return int(value)
        except ValueError:
            pass
        # Then boolean (case-insensitive 'true'/'false').
        if value.lower() in ('true', 'false'):
            return value.lower() == 'true'
        return value

    def query_to_dataframe(
        self,
        query: str,
        timeout: int = 300,
    ) -> pd.DataFrame:
        """
        Execute query and return results as DataFrame.

        Convenience method combining execute_query and get_query_results.

        Args:
            query: SQL query string
            timeout: Maximum time to wait for query completion (seconds)
        Returns:
            DataFrame with query results
        """
        execution_id = self.execute_query(query, wait=True, timeout=timeout)
        return self.get_query_results(execution_id)

    def list_tables(self, schema: Optional[str] = None) -> List[str]:
        """
        List tables in the database.

        Args:
            schema: Optional schema name (defaults to the configured database)
        Returns:
            List of table names (empty list on error)
        """
        if schema is None:
            schema = self.config.database_name
        # Escape the identifier before embedding it in the SQL literal:
        # a quote in the name would otherwise break the query / allow injection.
        query = f"""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = '{self._escape_sql_literal(schema)}'
        ORDER BY table_name
        """
        try:
            df = self.query_to_dataframe(query)
            return df['table_name'].tolist() if not df.empty else []
        except Exception as e:
            # Best-effort helper: log and return empty rather than raise.
            logger.error("Failed to list tables: %s", e)
            return []

    def describe_table(self, table_name: str, schema: Optional[str] = None) -> pd.DataFrame:
        """
        Get table schema/columns.

        Args:
            table_name: Table name
            schema: Optional schema name (defaults to the configured database)
        Returns:
            DataFrame with column information (column_name, data_type,
            is_nullable), or an empty DataFrame on error.
        """
        if schema is None:
            schema = self.config.database_name
        # Escape both identifiers before embedding them in SQL literals.
        query = f"""
        SELECT
            column_name,
            data_type,
            is_nullable
        FROM information_schema.columns
        WHERE table_schema = '{self._escape_sql_literal(schema)}'
          AND table_name = '{self._escape_sql_literal(table_name)}'
        ORDER BY ordinal_position
        """
        try:
            return self.query_to_dataframe(query)
        except Exception as e:
            # Best-effort helper: log and return empty rather than raise.
            logger.error("Failed to describe table %s: %s", table_name, e)
            return pd.DataFrame()