Spaces:
Sleeping
Sleeping
Upload 38 files
Browse files- src/.DS_Store +0 -0
- src/README.md +439 -0
- src/config.yaml +137 -0
- src/datalake/__init__.py +22 -0
- src/datalake/__pycache__/__init__.cpython-310.pyc +0 -0
- src/datalake/__pycache__/athena.cpython-310.pyc +0 -0
- src/datalake/__pycache__/batch.cpython-310.pyc +0 -0
- src/datalake/__pycache__/catalog.cpython-310.pyc +0 -0
- src/datalake/__pycache__/config.cpython-310.pyc +0 -0
- src/datalake/__pycache__/logger.cpython-310.pyc +0 -0
- src/datalake/__pycache__/query.cpython-310.pyc +0 -0
- src/datalake/athena.py +356 -0
- src/datalake/batch.py +231 -0
- src/datalake/catalog.py +269 -0
- src/datalake/config.py +192 -0
- src/datalake/logger.py +33 -0
- src/datalake/query.py +277 -0
- src/examples/__init__.py +1 -0
- src/examples/batch_example.py +169 -0
- src/examples/explore_example.py +96 -0
- src/examples/query_example.py +188 -0
- src/explore_datalake.ipynb +1165 -0
- src/images/analysis.png +0 -0
- src/images/logo.png +0 -0
- src/images/oxon.jpeg +0 -0
- src/requirements.txt +10 -0
- src/setup.py +43 -0
- src/streamlit_app.py +1110 -35
- src/test_connection.py +78 -0
- src/utils/__init__.py +9 -0
- src/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- src/utils/__pycache__/correlation.cpython-310.pyc +0 -0
- src/utils/__pycache__/dimension_reduction.cpython-310.pyc +0 -0
- src/utils/__pycache__/feature_class.cpython-310.pyc +0 -0
- src/utils/correlation.py +248 -0
- src/utils/dimension_reduction.py +222 -0
- src/utils/feature_class.py +119 -0
- src/workshop.ipynb +1448 -0
src/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
src/README.md
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CANedge Data Lake Python SDK
|
| 2 |
+
|
| 3 |
+
Production-ready Python package for querying and analyzing CAN/LIN data lakes created from CSS Electronics CANedge MDF4 logs using AWS Athena.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **AWS Athena Integration**: Query Parquet data using SQL via Athena
|
| 8 |
+
- **CloudFormation Configuration**: Automatic configuration from CloudFormation stack outputs
|
| 9 |
+
- **Scalable**: Leverage Athena's distributed query engine for large datasets
|
| 10 |
+
- **Type-safe**: Full type hints and docstrings
|
| 11 |
+
- **Well-architected**: Clean module design with logging and error handling
|
| 12 |
+
|
| 13 |
+
## Installation
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
# Clone or download project
|
| 17 |
+
cd CSS
|
| 18 |
+
|
| 19 |
+
# Install in development mode
|
| 20 |
+
pip install -e .
|
| 21 |
+
|
| 22 |
+
# Or install from requirements
|
| 23 |
+
pip install -r requirements.txt
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
## Prerequisites
|
| 27 |
+
|
| 28 |
+
1. **AWS Account** with:
|
| 29 |
+
- CloudFormation stack named `datalake-stack` (or specify custom name)
|
| 30 |
+
- Athena database configured
|
| 31 |
+
- S3 bucket with Parquet data
|
| 32 |
+
- AWS Glue catalog with table definitions
|
| 33 |
+
|
| 34 |
+
2. **CloudFormation Stack Outputs**:
|
| 35 |
+
Your `datalake-stack` must have the following outputs:
|
| 36 |
+
- `DatabaseName`: Athena database name
|
| 37 |
+
- `S3OutputLocation`: S3 location for Athena query results (e.g., `s3://bucket/athena-results/`)
|
| 38 |
+
- `WorkGroup`: (Optional) Athena workgroup name
|
| 39 |
+
- `Region`: (Optional) AWS region
|
| 40 |
+
|
| 41 |
+
3. **AWS Credentials**:
|
| 42 |
+
- AWS CLI configured: `aws configure`
|
| 43 |
+
- Or IAM role (for EC2/ECS/Lambda)
|
| 44 |
+
- Or environment variables
|
| 45 |
+
|
| 46 |
+
## Quick Start
|
| 47 |
+
|
| 48 |
+
### Option 1: Using Explicit Credentials (Recommended for Testing)
|
| 49 |
+
|
| 50 |
+
```python
|
| 51 |
+
from datalake.config import DataLakeConfig
|
| 52 |
+
from datalake.athena import AthenaQuery
|
| 53 |
+
from datalake.catalog import DataLakeCatalog
|
| 54 |
+
from datalake.query import DataLakeQuery
|
| 55 |
+
|
| 56 |
+
# Load config with explicit credentials
|
| 57 |
+
config = DataLakeConfig.from_credentials(
|
| 58 |
+
database_name="dbparquetdatalake05",
|
| 59 |
+
workgroup="athenaworkgroup-datalake05",
|
| 60 |
+
s3_output_location="s3://canedge-raw-data-parquet/athena-results/",
|
| 61 |
+
region="eu-north-1",
|
| 62 |
+
access_key_id="YOUR_ACCESS_KEY_ID",
|
| 63 |
+
secret_access_key="YOUR_SECRET_ACCESS_KEY",
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
# Initialize Athena and catalog
|
| 67 |
+
athena = AthenaQuery(config)
|
| 68 |
+
catalog = DataLakeCatalog(athena, config)
|
| 69 |
+
query = DataLakeQuery(athena, catalog)
|
| 70 |
+
|
| 71 |
+
# List devices
|
| 72 |
+
devices = catalog.list_devices()
|
| 73 |
+
print(f"Devices: {devices}")
|
| 74 |
+
|
| 75 |
+
# Query data
|
| 76 |
+
df = query.read_device_message(
|
| 77 |
+
device_id="device_001",
|
| 78 |
+
message="EngineData",
|
| 79 |
+
date_range=("2024-01-15", "2024-01-20"),
|
| 80 |
+
limit=1000
|
| 81 |
+
)
|
| 82 |
+
print(f"Loaded {len(df)} records")
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### Option 2: Using CloudFormation Stack
|
| 86 |
+
|
| 87 |
+
```python
|
| 88 |
+
from datalake.config import DataLakeConfig
|
| 89 |
+
from datalake.athena import AthenaQuery
|
| 90 |
+
from datalake.catalog import DataLakeCatalog
|
| 91 |
+
from datalake.query import DataLakeQuery
|
| 92 |
+
|
| 93 |
+
# Load config from CloudFormation stack
|
| 94 |
+
config = DataLakeConfig.from_cloudformation(
|
| 95 |
+
stack_name="datalake-stack",
|
| 96 |
+
region=None, # Auto-detect from stack or use default
|
| 97 |
+
profile=None, # Use default profile or IAM role
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
# Initialize Athena and catalog
|
| 101 |
+
athena = AthenaQuery(config)
|
| 102 |
+
catalog = DataLakeCatalog(athena, config)
|
| 103 |
+
query = DataLakeQuery(athena, catalog)
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
## Configuration
|
| 107 |
+
|
| 108 |
+
### Option 1: Using Explicit Credentials
|
| 109 |
+
|
| 110 |
+
For direct access with AWS credentials:
|
| 111 |
+
|
| 112 |
+
```python
|
| 113 |
+
config = DataLakeConfig.from_credentials(
|
| 114 |
+
database_name="dbparquetdatalake05",
|
| 115 |
+
workgroup="athenaworkgroup-datalake05",
|
| 116 |
+
s3_output_location="s3://canedge-raw-data-parquet/athena-results/",
|
| 117 |
+
region="eu-north-1",
|
| 118 |
+
access_key_id="YOUR_ACCESS_KEY_ID",
|
| 119 |
+
secret_access_key="YOUR_SECRET_KEY",
|
| 120 |
+
)
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
**Parameters:**
|
| 124 |
+
- `database_name`: Athena database name
|
| 125 |
+
- `workgroup`: Athena workgroup name
|
| 126 |
+
- `s3_output_location`: S3 path for query results (must end with `/`)
|
| 127 |
+
- `region`: AWS region
|
| 128 |
+
- `access_key_id`: AWS access key ID
|
| 129 |
+
- `secret_access_key`: AWS secret access key
|
| 130 |
+
|
| 131 |
+
### Option 2: Using CloudFormation Stack
|
| 132 |
+
|
| 133 |
+
### CloudFormation Stack Setup
|
| 134 |
+
|
| 135 |
+
Your CloudFormation stack (`datalake-stack`) should output:
|
| 136 |
+
|
| 137 |
+
```yaml
|
| 138 |
+
Outputs:
|
| 139 |
+
DatabaseName:
|
| 140 |
+
Description: Athena database name
|
| 141 |
+
Value: canedge_datalake
|
| 142 |
+
|
| 143 |
+
S3OutputLocation:
|
| 144 |
+
Description: S3 location for Athena query results
|
| 145 |
+
Value: s3://my-bucket/athena-results/
|
| 146 |
+
|
| 147 |
+
WorkGroup:
|
| 148 |
+
Description: Athena workgroup name (optional)
|
| 149 |
+
Value: primary
|
| 150 |
+
|
| 151 |
+
Region:
|
| 152 |
+
Description: AWS region
|
| 153 |
+
Value: us-east-1
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
### Loading Configuration
|
| 157 |
+
|
| 158 |
+
```python
|
| 159 |
+
from datalake.config import DataLakeConfig
|
| 160 |
+
|
| 161 |
+
# Load from CloudFormation stack (default: 'datalake-stack')
|
| 162 |
+
config = DataLakeConfig.from_cloudformation()
|
| 163 |
+
|
| 164 |
+
# Or specify custom stack name
|
| 165 |
+
config = DataLakeConfig.from_cloudformation(
|
| 166 |
+
stack_name="my-custom-stack",
|
| 167 |
+
region="us-east-1", # Optional: override region
|
| 168 |
+
profile="myprofile", # Optional: use named AWS profile
|
| 169 |
+
)
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
## Data Lake Structure
|
| 173 |
+
|
| 174 |
+
### Athena Database Organization
|
| 175 |
+
|
| 176 |
+
The data lake is organized in Athena with:
|
| 177 |
+
- **Database**: Contains all tables (from CloudFormation output `DatabaseName`)
|
| 178 |
+
- **Tables**: Named by device and message (e.g., `device_001_EngineData`)
|
| 179 |
+
- **Partitions**: Date-based partitioning for efficient queries
|
| 180 |
+
- **Schema**: Each table has columns: `t` (timestamp), signal columns from DBC files
|
| 181 |
+
|
| 182 |
+
### Table Naming Convention
|
| 183 |
+
|
| 184 |
+
Tables are typically named:
|
| 185 |
+
- `{device_id}_{message_rule}` (e.g., `device_001_EngineData`)
|
| 186 |
+
- Or `{device_id}__{message_rule}` (double underscore)
|
| 187 |
+
- The catalog automatically detects the pattern
|
| 188 |
+
|
| 189 |
+
## Usage Patterns
|
| 190 |
+
|
| 191 |
+
### 1. Explore Data Lake
|
| 192 |
+
|
| 193 |
+
```python
|
| 194 |
+
from datalake.config import DataLakeConfig
|
| 195 |
+
from datalake.athena import AthenaQuery
|
| 196 |
+
from datalake.catalog import DataLakeCatalog
|
| 197 |
+
|
| 198 |
+
config = DataLakeConfig.from_cloudformation()
|
| 199 |
+
athena = AthenaQuery(config)
|
| 200 |
+
catalog = DataLakeCatalog(athena, config)
|
| 201 |
+
|
| 202 |
+
# List all tables
|
| 203 |
+
tables = catalog.list_tables()
|
| 204 |
+
print(f"Tables: {tables}")
|
| 205 |
+
|
| 206 |
+
# List devices
|
| 207 |
+
devices = catalog.list_devices()
|
| 208 |
+
print(f"Devices: {devices}")
|
| 209 |
+
|
| 210 |
+
# List messages for device
|
| 211 |
+
messages = catalog.list_messages("device_001")
|
| 212 |
+
print(f"Messages: {messages}")
|
| 213 |
+
|
| 214 |
+
# Get schema
|
| 215 |
+
schema = catalog.get_schema("device_001", "EngineData")
|
| 216 |
+
print(f"Columns: {list(schema.keys())}")
|
| 217 |
+
|
| 218 |
+
# List partitions (dates)
|
| 219 |
+
partitions = catalog.list_partitions("device_001", "EngineData")
|
| 220 |
+
print(f"Dates: {partitions}")
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
### 2. Query Data
|
| 224 |
+
|
| 225 |
+
```python
|
| 226 |
+
from datalake.query import DataLakeQuery
|
| 227 |
+
|
| 228 |
+
query = DataLakeQuery(athena, catalog)
|
| 229 |
+
|
| 230 |
+
# Read all data for device/message
|
| 231 |
+
df = query.read_device_message(
|
| 232 |
+
device_id="device_001",
|
| 233 |
+
message="EngineData",
|
| 234 |
+
date_range=("2024-01-15", "2024-01-20"),
|
| 235 |
+
columns=["t", "RPM", "Temperature"],
|
| 236 |
+
limit=10000
|
| 237 |
+
)
|
| 238 |
+
print(f"Loaded {len(df)} records")
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
### 3. Time Series Query
|
| 242 |
+
|
| 243 |
+
```python
|
| 244 |
+
# Query single signal over time window
|
| 245 |
+
df_ts = query.time_series_query(
|
| 246 |
+
device_id="device_001",
|
| 247 |
+
message="EngineData",
|
| 248 |
+
signal_name="RPM",
|
| 249 |
+
start_time=1000000000000000, # microseconds
|
| 250 |
+
end_time=2000000000000000,
|
| 251 |
+
limit=10000
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
# Convert timestamp and plot
|
| 255 |
+
df_ts['timestamp'] = pd.to_datetime(df_ts['t'], unit='us')
|
| 256 |
+
print(df_ts[['timestamp', 'RPM']].head())
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
### 4. Custom SQL Queries
|
| 260 |
+
|
| 261 |
+
```python
|
| 262 |
+
# Execute custom SQL
|
| 263 |
+
# Note: Use path-based filtering for date ranges
|
| 264 |
+
# Data structure: {device_id}/{message}/{year}/{month}/{day}/file.parquet
|
| 265 |
+
sql = """
|
| 266 |
+
SELECT
|
| 267 |
+
COUNT(*) as record_count,
|
| 268 |
+
AVG(RPM) as avg_rpm,
|
| 269 |
+
MAX(Temperature) as max_temp
|
| 270 |
+
FROM canedge_datalake.device_001_EngineData
|
| 271 |
+
WHERE try_cast(element_at(split("$path", '/'), -4) AS INTEGER) = 2024
|
| 272 |
+
AND try_cast(element_at(split("$path", '/'), -3) AS INTEGER) >= 1
|
| 273 |
+
AND try_cast(element_at(split("$path", '/'), -2) AS INTEGER) >= 15
|
| 274 |
+
"""
|
| 275 |
+
|
| 276 |
+
df = query.execute_sql(sql)
|
| 277 |
+
print(df)
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
### 5. Aggregation Queries
|
| 281 |
+
|
| 282 |
+
```python
|
| 283 |
+
# Use built-in aggregation method
|
| 284 |
+
# For date filtering, use path-based extraction
|
| 285 |
+
path_year = "try_cast(element_at(split(\"$path\", '/'), -4) AS INTEGER)"
|
| 286 |
+
path_month = "try_cast(element_at(split(\"$path\", '/'), -3) AS INTEGER)"
|
| 287 |
+
path_day = "try_cast(element_at(split(\"$path\", '/'), -2) AS INTEGER)"
|
| 288 |
+
where_clause = f"{path_year} = 2024 AND {path_month} >= 1 AND {path_day} >= 15"
|
| 289 |
+
|
| 290 |
+
df_agg = query.aggregate(
|
| 291 |
+
device_id="device_001",
|
| 292 |
+
message="EngineData",
|
| 293 |
+
aggregation="COUNT(*) as count, AVG(RPM) as avg_rpm, MIN(RPM) as min_rpm",
|
| 294 |
+
where_clause=where_clause
|
| 295 |
+
)
|
| 296 |
+
print(df_agg)
|
| 297 |
+
```
|
| 298 |
+
|
| 299 |
+
### 6. Batch Processing
|
| 300 |
+
|
| 301 |
+
```python
|
| 302 |
+
from datalake.batch import BatchProcessor
|
| 303 |
+
|
| 304 |
+
processor = BatchProcessor(query)
|
| 305 |
+
|
| 306 |
+
# Compute statistics across all data
|
| 307 |
+
stats = processor.aggregate_by_device_message(
|
| 308 |
+
aggregation_func=processor.compute_statistics,
|
| 309 |
+
message_filter="Engine.*"
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
for device, messages in stats.items():
|
| 313 |
+
for message, metrics in messages.items():
|
| 314 |
+
print(f"{device}/{message}: {metrics['count']} records")
|
| 315 |
+
|
| 316 |
+
# Export to CSV
|
| 317 |
+
processor.export_to_csv(
|
| 318 |
+
device_id="device_001",
|
| 319 |
+
message="EngineData",
|
| 320 |
+
output_path="engine_export.csv",
|
| 321 |
+
limit=100000
|
| 322 |
+
)
|
| 323 |
+
```
|
| 324 |
+
|
| 325 |
+
## Running Examples
|
| 326 |
+
|
| 327 |
+
```bash
|
| 328 |
+
# Test connection first
|
| 329 |
+
python test_connection.py
|
| 330 |
+
|
| 331 |
+
# Explore data lake structure
|
| 332 |
+
python examples/explore_example.py
|
| 333 |
+
|
| 334 |
+
# Query and analyze data
|
| 335 |
+
python examples/query_example.py
|
| 336 |
+
|
| 337 |
+
# Batch processing
|
| 338 |
+
python examples/batch_example.py
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
**Note:** All examples use explicit credentials. Update them with your actual credentials or modify to use CloudFormation stack.
|
| 342 |
+
|
| 343 |
+
## CloudFormation Stack Requirements
|
| 344 |
+
|
| 345 |
+
### Required Stack Outputs
|
| 346 |
+
|
| 347 |
+
1. **DatabaseName** (required)
|
| 348 |
+
- Athena database name containing your tables
|
| 349 |
+
- Example: `canedge_datalake`
|
| 350 |
+
|
| 351 |
+
2. **S3OutputLocation** (required)
|
| 352 |
+
- S3 bucket/path for Athena query results
|
| 353 |
+
- Must end with `/`
|
| 354 |
+
- Example: `s3://my-bucket/athena-results/`
|
| 355 |
+
- Must have write permissions for Athena
|
| 356 |
+
|
| 357 |
+
3. **WorkGroup** (optional)
|
| 358 |
+
- Athena workgroup name
|
| 359 |
+
- If not provided, uses default workgroup
|
| 360 |
+
|
| 361 |
+
4. **Region** (optional)
|
| 362 |
+
- AWS region
|
| 363 |
+
- If not provided, uses default region or stack region
|
| 364 |
+
|
| 365 |
+
### Example CloudFormation Template
|
| 366 |
+
|
| 367 |
+
```yaml
|
| 368 |
+
Resources:
|
| 369 |
+
AthenaDatabase:
|
| 370 |
+
Type: AWS::Glue::Database
|
| 371 |
+
Properties:
|
| 372 |
+
CatalogId: !Ref AWS::AccountId
|
| 373 |
+
DatabaseInput:
|
| 374 |
+
Name: canedge_datalake
|
| 375 |
+
|
| 376 |
+
Outputs:
|
| 377 |
+
DatabaseName:
|
| 378 |
+
Description: Athena database name
|
| 379 |
+
Value: canedge_datalake
|
| 380 |
+
Export:
|
| 381 |
+
Name: !Sub "${AWS::StackName}-DatabaseName"
|
| 382 |
+
|
| 383 |
+
S3OutputLocation:
|
| 384 |
+
Description: S3 location for Athena query results
|
| 385 |
+
Value: !Sub "s3://${ResultsBucket}/athena-results/"
|
| 386 |
+
Export:
|
| 387 |
+
Name: !Sub "${AWS::StackName}-S3OutputLocation"
|
| 388 |
+
|
| 389 |
+
WorkGroup:
|
| 390 |
+
Description: Athena workgroup name
|
| 391 |
+
Value: primary
|
| 392 |
+
Export:
|
| 393 |
+
Name: !Sub "${AWS::StackName}-WorkGroup"
|
| 394 |
+
|
| 395 |
+
Region:
|
| 396 |
+
Description: AWS region
|
| 397 |
+
Value: !Ref AWS::Region
|
| 398 |
+
Export:
|
| 399 |
+
Name: !Sub "${AWS::StackName}-Region"
|
| 400 |
+
```
|
| 401 |
+
|
| 402 |
+
## Performance Notes
|
| 403 |
+
|
| 404 |
+
- **Athena Query Limits**: Use `limit` parameter to control result size
|
| 405 |
+
- **Partition Pruning**: Date-based queries automatically use partition pruning
|
| 406 |
+
- **Query Costs**: Athena charges per TB scanned - use column selection and filters
|
| 407 |
+
- **Result Caching**: Athena caches query results for 24 hours
|
| 408 |
+
- **Concurrent Queries**: Athena supports multiple concurrent queries
|
| 409 |
+
|
| 410 |
+
## Troubleshooting
|
| 411 |
+
|
| 412 |
+
**"Stack not found"**
|
| 413 |
+
- Verify stack name: `aws cloudformation describe-stacks --stack-name datalake-stack`
|
| 414 |
+
- Check AWS credentials and region
|
| 415 |
+
- Ensure you have CloudFormation read permissions
|
| 416 |
+
|
| 417 |
+
**"Required output not found"**
|
| 418 |
+
- Verify stack outputs: `aws cloudformation describe-stacks --stack-name datalake-stack --query 'Stacks[0].Outputs'`
|
| 419 |
+
- Ensure `DatabaseName` and `S3OutputLocation` outputs exist
|
| 420 |
+
|
| 421 |
+
**"Query execution failed"**
|
| 422 |
+
- Check Athena permissions (Glue catalog access, S3 read permissions)
|
| 423 |
+
- Verify table names exist in the database
|
| 424 |
+
- Check S3 output location has write permissions
|
| 425 |
+
|
| 426 |
+
**"Table not found"**
|
| 427 |
+
- List tables: `catalog.list_tables()` to see available tables
|
| 428 |
+
- Verify table naming convention matches expected pattern
|
| 429 |
+
- Check Glue catalog for table definitions
|
| 430 |
+
|
| 431 |
+
## License
|
| 432 |
+
|
| 433 |
+
MIT
|
| 434 |
+
|
| 435 |
+
## References
|
| 436 |
+
|
| 437 |
+
- [CSS Electronics CANedge Documentation](https://www.csselectronics.com/pages/can-bus-logger-canedge)
|
| 438 |
+
- [AWS Athena Documentation](https://docs.aws.amazon.com/athena/)
|
| 439 |
+
- [AWS Glue Catalog](https://docs.aws.amazon.com/glue/latest/dg/catalog-and-crawler.html)
|
src/config.yaml
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AWS Configuration
|
| 2 |
+
aws:
|
| 3 |
+
database_name: "dbparquetdatalake05"
|
| 4 |
+
workgroup: "athenaworkgroup-datalake05"
|
| 5 |
+
s3_output_location: "s3://canedge-raw-data-parquet/athena-results/"
|
| 6 |
+
region: "eu-north-1"
|
| 7 |
+
access_key_id: "YOUR_ACCESS_KEY_ID"    # SECURITY: never commit real credentials — the keys previously committed here are compromised and must be rotated/deactivated in IAM
|
| 8 |
+
secret_access_key: "YOUR_SECRET_ACCESS_KEY"    # load real values from environment variables or an AWS profile instead
|
| 9 |
+
|
| 10 |
+
# Message Name Mapping
|
| 11 |
+
message_mapping:
|
| 12 |
+
"010C":
|
| 13 |
+
name: "Engine RPM"
|
| 14 |
+
tx_id: "0x7DF"
|
| 15 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 16 |
+
"010D":
|
| 17 |
+
name: "Vehicle speed"
|
| 18 |
+
tx_id: "0x7DF"
|
| 19 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 20 |
+
"0105":
|
| 21 |
+
name: "Engine coolant temperature"
|
| 22 |
+
tx_id: "0x7DF"
|
| 23 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 24 |
+
"010F":
|
| 25 |
+
name: "Intake air temperature (IAT)"
|
| 26 |
+
tx_id: "0x7DF"
|
| 27 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 28 |
+
"012F":
|
| 29 |
+
name: "Fuel level input"
|
| 30 |
+
tx_id: "0x7DF"
|
| 31 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 32 |
+
"0106":
|
| 33 |
+
name: "Short-term fuel trim (Bank 1)"
|
| 34 |
+
tx_id: "0x7DF"
|
| 35 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 36 |
+
"0107":
|
| 37 |
+
name: "Long-term fuel trim (Bank 1)"
|
| 38 |
+
tx_id: "0x7DF"
|
| 39 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 40 |
+
"0144":
|
| 41 |
+
name: "Commanded equivalence ratio (λ/EQR)"
|
| 42 |
+
tx_id: "0x7DF"
|
| 43 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 44 |
+
"0134":
|
| 45 |
+
name: "O₂ wideband B1S1 (equivalence/voltage)"
|
| 46 |
+
tx_id: "0x7DF"
|
| 47 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 48 |
+
"0132":
|
| 49 |
+
name: "Evaporative system vapor pressure"
|
| 50 |
+
tx_id: "0x7DF"
|
| 51 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 52 |
+
"0103":
|
| 53 |
+
name: "Fuel system status (open/closed loop)"
|
| 54 |
+
tx_id: "0x7DF"
|
| 55 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 56 |
+
"0104":
|
| 57 |
+
name: "Calculated engine load"
|
| 58 |
+
tx_id: "0x7DF"
|
| 59 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 60 |
+
"0143":
|
| 61 |
+
name: "Absolute engine load"
|
| 62 |
+
tx_id: "0x7DF"
|
| 63 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 64 |
+
"0110":
|
| 65 |
+
name: "Mass air flow (MAF)"
|
| 66 |
+
tx_id: "0x7DF"
|
| 67 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 68 |
+
"012E":
|
| 69 |
+
name: "Commanded evap purge"
|
| 70 |
+
tx_id: "0x7DF"
|
| 71 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 72 |
+
"010E":
|
| 73 |
+
name: "Ignition timing advance"
|
| 74 |
+
tx_id: "0x7DF"
|
| 75 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 76 |
+
"011F":
|
| 77 |
+
name: "Engine runtime (since start)"
|
| 78 |
+
tx_id: "0x7DF"
|
| 79 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 80 |
+
"015C":
|
| 81 |
+
name: "Engine oil temperature"
|
| 82 |
+
tx_id: "0x7DF"
|
| 83 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 84 |
+
"0135":
|
| 85 |
+
name: "O₂ wideband B1S2 (equivalence/voltage)"
|
| 86 |
+
tx_id: "0x7DF"
|
| 87 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 88 |
+
"013C":
|
| 89 |
+
name: "Catalyst temperature Bank1-Sensor1"
|
| 90 |
+
tx_id: "0x7DF"
|
| 91 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 92 |
+
"013D":
|
| 93 |
+
name: "Catalyst temperature Bank1-Sensor2"
|
| 94 |
+
tx_id: "0x7DF"
|
| 95 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 96 |
+
"0162":
|
| 97 |
+
name: "Engine commanded torque"
|
| 98 |
+
tx_id: "0x7DF"
|
| 99 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 100 |
+
"0163":
|
| 101 |
+
name: "Engine actual torque (percent)"
|
| 102 |
+
tx_id: "0x7DF"
|
| 103 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 104 |
+
"0164":
|
| 105 |
+
name: "Engine reference torque (N·m)"
|
| 106 |
+
tx_id: "0x7DF"
|
| 107 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 108 |
+
"0149":
|
| 109 |
+
name: "Accelerator pedal position D"
|
| 110 |
+
tx_id: "0x7DF"
|
| 111 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 112 |
+
"010B":
|
| 113 |
+
name: "Manifold absolute pressure (MAP)"
|
| 114 |
+
tx_id: "0x7DF"
|
| 115 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 116 |
+
"0133":
|
| 117 |
+
name: "Barometric pressure"
|
| 118 |
+
tx_id: "0x7DF"
|
| 119 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 120 |
+
"012C":
|
| 121 |
+
name: "Commanded EGR"
|
| 122 |
+
tx_id: "0x7DF"
|
| 123 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 124 |
+
"012D":
|
| 125 |
+
name: "EGR error"
|
| 126 |
+
tx_id: "0x7DF"
|
| 127 |
+
expected_rx_ids: ["0x7E8", "0x7E9", "0x7EA", "0x7EB", "0x7EC", "0x7ED", "0x7EE", "0x7EF"]
|
| 128 |
+
|
| 129 |
+
# Dashboard Configuration
|
| 130 |
+
dashboard:
|
| 131 |
+
page_title: "OXON Technologies"
|
| 132 |
+
page_icon: ":mag:"
|
| 133 |
+
layout: "wide"
|
| 134 |
+
sidebar_background_color: "#74b9ff"
|
| 135 |
+
logo_path: "images/logo.png"
|
| 136 |
+
header_logo_path: "images/analysis.png"
|
| 137 |
+
dosing_stage_date: "2025-12-16"
|
src/datalake/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CANedge Data Lake Python SDK
|
| 3 |
+
|
| 4 |
+
Production-ready Python package for querying and analyzing CAN/LIN data lakes
|
| 5 |
+
created from CSS Electronics CANedge MDF4 logs using AWS Athena.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.1.0"
|
| 9 |
+
|
| 10 |
+
from .config import DataLakeConfig
|
| 11 |
+
from .athena import AthenaQuery
|
| 12 |
+
from .catalog import DataLakeCatalog
|
| 13 |
+
from .query import DataLakeQuery
|
| 14 |
+
from .batch import BatchProcessor
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"DataLakeConfig",
|
| 18 |
+
"AthenaQuery",
|
| 19 |
+
"DataLakeCatalog",
|
| 20 |
+
"DataLakeQuery",
|
| 21 |
+
"BatchProcessor",
|
| 22 |
+
]
|
src/datalake/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (624 Bytes). View file
|
|
|
src/datalake/__pycache__/athena.cpython-310.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
src/datalake/__pycache__/batch.cpython-310.pyc
ADDED
|
Binary file (6.97 kB). View file
|
|
|
src/datalake/__pycache__/catalog.cpython-310.pyc
ADDED
|
Binary file (8.08 kB). View file
|
|
|
src/datalake/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (5.68 kB). View file
|
|
|
src/datalake/__pycache__/logger.cpython-310.pyc
ADDED
|
Binary file (950 Bytes). View file
|
|
|
src/datalake/__pycache__/query.cpython-310.pyc
ADDED
|
Binary file (7.51 kB). View file
|
|
|
src/datalake/athena.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AWS Athena query interface for data lake access.
|
| 3 |
+
|
| 4 |
+
Provides methods to execute SQL queries against Athena and retrieve results
|
| 5 |
+
as pandas DataFrames.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Optional, List, Dict, Any
|
| 9 |
+
import time
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import boto3
|
| 12 |
+
from botocore.exceptions import ClientError
|
| 13 |
+
from urllib.parse import urlparse
|
| 14 |
+
import io
|
| 15 |
+
from .config import DataLakeConfig
|
| 16 |
+
from .logger import setup_logger
|
| 17 |
+
|
| 18 |
+
logger = setup_logger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class AthenaQuery:
|
| 22 |
+
"""
|
| 23 |
+
AWS Athena query interface.
|
| 24 |
+
|
| 25 |
+
Executes SQL queries against Athena and retrieves results as pandas DataFrames.
|
| 26 |
+
Handles query execution, polling, and result retrieval.
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
def __init__(self, config: DataLakeConfig):
    """
    Set up the Athena query interface.

    Creates boto3 Athena and S3 clients from the configured session; the
    S3 client is used to download result CSVs directly.

    Args:
        config: DataLakeConfig instance with Athena configuration
    """
    self.config = config
    aws_session = config.get_boto3_session()
    self.athena_client = aws_session.client('athena', region_name=config.region)
    self.s3_client = aws_session.client('s3', region_name=config.region)
    logger.info(f"Initialized Athena client for database: {config.database_name}")
+
def execute_query(
    self,
    query: str,
    wait: bool = True,
    timeout: int = 300,
) -> Optional[str]:
    """
    Execute a SQL query in Athena.

    Args:
        query: SQL query string
        wait: If True, block until the query completes before returning
        timeout: Maximum time to wait for query completion (seconds)

    Returns:
        Query execution ID (immediately when wait=False, otherwise after
        the query has completed successfully)

    Raises:
        ClientError: If query execution fails
        TimeoutError: If query exceeds timeout
    """
    logger.debug(f"Executing query: {query[:100]}...")

    # Database belongs in QueryExecutionContext; OutputLocation belongs in
    # ResultConfiguration; WorkGroup is a separate top-level parameter.
    start_params = {
        'QueryString': query,
        'QueryExecutionContext': {'Database': self.config.database_name},
        'ResultConfiguration': {'OutputLocation': self.config.s3_output_location},
    }
    if self.config.workgroup:
        start_params['WorkGroup'] = self.config.workgroup

    try:
        response = self.athena_client.start_query_execution(**start_params)
        execution_id = response['QueryExecutionId']
        logger.info(f"Query started with execution ID: {execution_id}")

        if wait:
            # Block until terminal state, then hand back the same ID.
            return self._wait_for_completion(execution_id, timeout)
        return execution_id
    except ClientError as e:
        logger.error(f"Query execution failed: {e}")
        raise
+
def _wait_for_completion(self, execution_id: str, timeout: int = 300) -> str:
    """
    Poll Athena until the query execution reaches a terminal state.

    Args:
        execution_id: Query execution ID
        timeout: Maximum time to wait (seconds)

    Returns:
        Execution ID once the query has SUCCEEDED

    Raises:
        TimeoutError: If query exceeds timeout
        RuntimeError: If query fails or is cancelled
    """
    # Use a monotonic clock for elapsed-time measurement: time.time() can
    # jump on wall-clock adjustments (NTP sync, DST), which would corrupt
    # the timeout accounting.
    start_time = time.monotonic()

    while True:
        response = self.athena_client.get_query_execution(QueryExecutionId=execution_id)
        status = response['QueryExecution']['Status']['State']

        if status == 'SUCCEEDED':
            logger.info(f"Query {execution_id} completed successfully")
            return execution_id
        elif status == 'FAILED':
            reason = response['QueryExecution']['Status'].get('StateChangeReason', 'Unknown error')
            logger.error(f"Query {execution_id} failed: {reason}")
            raise RuntimeError(f"Query failed: {reason}")
        elif status == 'CANCELLED':
            logger.warning(f"Query {execution_id} was cancelled")
            raise RuntimeError("Query was cancelled")

        if time.monotonic() - start_time > timeout:
            raise TimeoutError(f"Query {execution_id} exceeded timeout of {timeout} seconds")

        time.sleep(1)  # Poll every second
+
def get_query_results(self, execution_id: str) -> pd.DataFrame:
    """
    Retrieve query results as a pandas DataFrame.

    Prefers a direct S3 read of the result CSV (far faster for large
    result sets) and falls back to the paginated API on any failure.

    Args:
        execution_id: Query execution ID

    Returns:
        DataFrame with query results

    Raises:
        ClientError: If results cannot be retrieved via either path
    """
    logger.debug(f"Retrieving results for execution {execution_id}")

    try:
        return self._get_results_from_s3(execution_id)
    except Exception as err:
        # Any S3 problem (permissions, missing object, parse error)
        # degrades to the slower but dependable API path.
        logger.debug(f"Failed to read from S3, falling back to API: {err}")
        return self._get_results_from_api(execution_id)
+
def _get_results_from_s3(self, execution_id: str) -> pd.DataFrame:
    """
    Fetch query results by downloading the result CSV straight from S3.

    Much faster than the paginated GetQueryResults API for large result
    sets: one object download parsed by pandas' C CSV reader replaces many
    API round trips and per-row Python processing.

    Args:
        execution_id: Query execution ID

    Returns:
        DataFrame with query results

    Raises:
        Exception: If the S3 read fails (caller falls back to the API path)
    """
    # Locate the CSV that Athena wrote for this execution.
    execution = self.athena_client.get_query_execution(QueryExecutionId=execution_id)
    output_uri = execution['QueryExecution']['ResultConfiguration']['OutputLocation']

    # Split "s3://bucket/path/to/file.csv" into bucket and key.
    parsed_uri = urlparse(output_uri)
    bucket = parsed_uri.netloc
    key = parsed_uri.path.lstrip('/')
    logger.debug(f"Reading results from s3://{bucket}/{key}")

    raw_bytes = self.s3_client.get_object(Bucket=bucket, Key=key)['Body'].read()

    # Load every cell as a string first (mirrors the API code path), then
    # coerce each cell to int/float/bool/None via _parse_value.
    frame = pd.read_csv(io.BytesIO(raw_bytes), dtype=str, keep_default_na=False)
    for column in frame.columns:
        frame[column] = frame[column].astype(str).apply(self._parse_value)

    logger.info(f"Retrieved {len(frame)} rows from S3 for query {execution_id}")
    return frame
+
def _get_results_from_api(self, execution_id: str) -> pd.DataFrame:
    """
    Get query results using paginated API calls (fallback method).

    Kept for backward compatibility when the direct S3 read fails.

    Args:
        execution_id: Query execution ID

    Returns:
        DataFrame with query results

    Raises:
        ClientError: If results cannot be retrieved
    """
    logger.debug(f"Using API method for execution {execution_id}")

    paginator = self.athena_client.get_paginator('get_query_results')
    pages = paginator.paginate(QueryExecutionId=execution_id)

    rows = []
    column_names = None

    for page in pages:
        result_set = page['ResultSet']

        first_page = column_names is None
        if first_page:
            column_names = [col['Name'] for col in result_set['ResultSetMetadata']['ColumnInfo']]

        # BUGFIX: Athena includes the header row only in the FIRST page of
        # results. Skipping row 0 on every page (the previous behavior)
        # silently dropped one data row per subsequent page.
        data_rows = result_set['Rows'][1:] if first_page else result_set['Rows']
        for row in data_rows:
            values = [self._parse_value(cell.get('VarCharValue', ''))
                      for cell in row['Data']]
            rows.append(values)

    if not rows:
        logger.warning(f"No results returned for execution {execution_id}")
        return pd.DataFrame(columns=column_names or [])

    df = pd.DataFrame(rows, columns=column_names)
    logger.info(f"Retrieved {len(df)} rows from query {execution_id}")
    return df
+
def _parse_value(self, value: str) -> Any:
|
| 254 |
+
"""
|
| 255 |
+
Parse string value to appropriate Python type.
|
| 256 |
+
|
| 257 |
+
Args:
|
| 258 |
+
value: String value from Athena result
|
| 259 |
+
|
| 260 |
+
Returns:
|
| 261 |
+
Parsed value (int, float, bool, or str)
|
| 262 |
+
"""
|
| 263 |
+
if value == '' or value is None:
|
| 264 |
+
return None
|
| 265 |
+
|
| 266 |
+
# Try to parse as number
|
| 267 |
+
try:
|
| 268 |
+
if '.' in value:
|
| 269 |
+
return float(value)
|
| 270 |
+
return int(value)
|
| 271 |
+
except ValueError:
|
| 272 |
+
pass
|
| 273 |
+
|
| 274 |
+
# Try to parse as boolean
|
| 275 |
+
if value.lower() in ('true', 'false'):
|
| 276 |
+
return value.lower() == 'true'
|
| 277 |
+
|
| 278 |
+
return value
|
| 279 |
+
|
| 280 |
+
def query_to_dataframe(
    self,
    query: str,
    timeout: int = 300,
) -> pd.DataFrame:
    """
    Execute a query and return its results as a DataFrame.

    Convenience wrapper chaining execute_query and get_query_results.

    Args:
        query: SQL query string
        timeout: Maximum time to wait for query completion (seconds)

    Returns:
        DataFrame with query results
    """
    completed_id = self.execute_query(query, wait=True, timeout=timeout)
    return self.get_query_results(completed_id)
+
def list_tables(self, schema: Optional[str] = None) -> List[str]:
    """
    List tables in the database.

    Args:
        schema: Optional schema name (defaults to the configured database)

    Returns:
        List of table names, sorted by name (empty list on failure)
    """
    if schema is None:
        schema = self.config.database_name

    # NOTE: the schema name is interpolated into the SQL text; it is
    # expected to be a trusted, internally-configured identifier, not
    # untrusted user input.
    query = f"""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = '{schema}'
    ORDER BY table_name
    """

    try:
        result = self.query_to_dataframe(query)
        return result['table_name'].tolist() if not result.empty else []
    except Exception as e:
        logger.error(f"Failed to list tables: {e}")
        return []
+
def describe_table(self, table_name: str, schema: Optional[str] = None) -> pd.DataFrame:
    """
    Get column metadata for a table.

    Args:
        table_name: Table name
        schema: Optional schema name (defaults to the configured database)

    Returns:
        DataFrame with column_name, data_type, is_nullable per column,
        ordered by ordinal position (empty DataFrame on failure)
    """
    if schema is None:
        schema = self.config.database_name

    # NOTE: identifiers are interpolated into the SQL text; they are
    # expected to be trusted internal names, not untrusted user input.
    query = f"""
    SELECT
        column_name,
        data_type,
        is_nullable
    FROM information_schema.columns
    WHERE table_schema = '{schema}'
    AND table_name = '{table_name}'
    ORDER BY ordinal_position
    """

    try:
        return self.query_to_dataframe(query)
    except Exception as e:
        logger.error(f"Failed to describe table {table_name}: {e}")
        return pd.DataFrame()
|
src/datalake/batch.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Batch processing utilities for scalable data lake analysis.
|
| 3 |
+
|
| 4 |
+
Provides patterns for aggregating, analyzing, and exporting data across
|
| 5 |
+
the entire data lake or subsets thereof.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Callable, Dict, Any, Optional
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from .query import DataLakeQuery
|
| 11 |
+
from .logger import setup_logger
|
| 12 |
+
|
| 13 |
+
logger = setup_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class BatchProcessor:
|
| 17 |
+
"""
|
| 18 |
+
Batch processing utilities for scalable data lake analysis.
|
| 19 |
+
|
| 20 |
+
Provides high-level patterns for common analysis tasks across
|
| 21 |
+
multiple devices and messages.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, query: DataLakeQuery):
    """
    Initialize the batch processor.

    Args:
        query: DataLakeQuery instance used for all data reads
    """
    self.query = query
    logger.info("Initialized BatchProcessor")
| 34 |
+
def aggregate_by_device_message(
    self,
    aggregation_func: Callable[[pd.DataFrame], Dict[str, Any]],
    device_filter: Optional[str] = None,
    message_filter: Optional[str] = None,
) -> Dict[str, Dict[str, Any]]:
    """
    Apply an aggregation function to every device/message combination.

    Scalable analysis pattern: each device/message is loaded and reduced
    independently, bounding memory to one combination at a time.

    Args:
        aggregation_func: Function (df) -> dict of metrics/statistics
        device_filter: Device regex filter (applied via catalog)
        message_filter: Message regex filter

    Returns:
        Nested dict: {device_id: {message: aggregation_result}}

    Example:
        >>> def compute_stats(df):
        ...     return {
        ...         'count': len(df),
        ...         'rpm_mean': df['RPM'].mean() if 'RPM' in df else None
        ...     }
        >>> results = processor.aggregate_by_device_message(compute_stats)
    """
    results: Dict[str, Dict[str, Any]] = {}

    for device in self.query.catalog.list_devices(device_filter):
        for message in self.query.catalog.list_messages(device, message_filter):
            per_device = results.setdefault(device, {})
            try:
                frame = self.query.read_device_message(device, message)
                per_device[message] = aggregation_func(frame)
            except Exception as e:
                # Record the failure instead of aborting the whole sweep.
                logger.error(f"Aggregation failed for {device}/{message}: {e}")
                per_device[message] = {"error": str(e)}

    logger.info(f"Aggregation completed for {len(results)} devices")
    return results
| 85 |
+
def export_to_csv(
    self,
    device_id: str,
    message: str,
    output_path: str,
    date_range: Optional[tuple[str, str]] = None,
    limit: Optional[int] = None,
) -> None:
    """
    Export device/message data to a CSV file.

    Args:
        device_id: Device identifier
        message: Message name
        output_path: Output CSV file path
        date_range: Optional (start_date, end_date) tuple
        limit: Optional row limit

    Raises:
        Exception: If export fails
    """
    logger.info(f"Exporting {device_id}/{message} to {output_path}")

    frame = self.query.read_device_message(
        device_id=device_id,
        message=message,
        date_range=date_range,
        limit=limit,
    )
    if frame.empty:
        # Nothing to write; leave no file behind.
        logger.warning(f"No data to export for {device_id}/{message}")
        return

    frame.to_csv(output_path, index=False)
    logger.info(f"Exported {len(frame)} rows to {output_path}")
| 121 |
+
def export_to_parquet(
    self,
    device_id: str,
    message: str,
    output_path: str,
    date_range: Optional[tuple[str, str]] = None,
) -> None:
    """
    Export device/message data to a snappy-compressed Parquet file.

    Args:
        device_id: Device identifier
        message: Message name
        output_path: Output Parquet file path
        date_range: Optional (start_date, end_date) tuple

    Raises:
        Exception: If export fails
    """
    logger.info(f"Exporting {device_id}/{message} to {output_path}")

    frame = self.query.read_device_message(
        device_id=device_id,
        message=message,
        date_range=date_range,
    )
    if frame.empty:
        # Nothing to write; leave no file behind.
        logger.warning(f"No data to export for {device_id}/{message}")
        return

    frame.to_parquet(output_path, index=False, compression='snappy')
    logger.info(f"Exported {len(frame)} rows to {output_path}")
| 154 |
+
def compute_statistics(self, df: pd.DataFrame) -> Dict[str, Any]:
    """
    Compute basic statistics for aggregation.

    Args:
        df: Input DataFrame

    Returns:
        Dict with 'count' plus {col}_mean/_min/_max/_std/_null_count for
        each numeric column

    Note:
        Skips the timestamp column 't'. Uses the generic 'number' dtype
        selector so all numeric widths (int32, float32, ...) are covered,
        not only int64/float64 as before.
    """
    stats: Dict[str, Any] = {"count": len(df)}

    if df.empty:
        return stats

    # All numeric columns except the timestamp column.
    numeric_cols = [c for c in df.select_dtypes(include='number').columns if c != 't']

    for col in numeric_cols:
        try:
            stats[f"{col}_mean"] = float(df[col].mean())
            stats[f"{col}_min"] = float(df[col].min())
            stats[f"{col}_max"] = float(df[col].max())
            stats[f"{col}_std"] = float(df[col].std())
            stats[f"{col}_null_count"] = int(df[col].isna().sum())
        except Exception as e:
            # Best-effort: a bad column must not sink the whole summary.
            logger.warning(f"Failed to compute stats for {col}: {e}")

    return stats
| 188 |
+
def find_anomalies(
    self,
    device_id: str,
    message: str,
    signal_name: str,
    threshold_std: float = 3.0,
) -> pd.DataFrame:
    """
    Find anomalous values in a signal using the z-score method.

    Args:
        device_id: Device identifier
        message: Message name
        signal_name: Signal column name
        threshold_std: Number of standard deviations for anomaly threshold

    Returns:
        DataFrame with anomalous records (empty when there is no data,
        the signal is missing, or the signal is constant)
    """
    frame = self.query.read_device_message(
        device_id=device_id,
        message=message,
        columns=['t', signal_name],
    )

    if frame.empty or signal_name not in frame.columns:
        logger.warning(f"No data or signal not found: {signal_name}")
        return pd.DataFrame()

    signal = frame[signal_name]
    std = signal.std()
    if std == 0:
        # A constant signal has no meaningful z-scores.
        logger.warning(f"Zero standard deviation for {signal_name}")
        return pd.DataFrame()

    # z-score: signed distance from the mean in units of std deviations.
    z_scores = (signal - signal.mean()) / std
    anomalies = frame[z_scores.abs() > threshold_std].copy()

    logger.info(f"Found {len(anomalies)} anomalies in {signal_name} "
                f"(threshold: {threshold_std} std)")

    return anomalies
|
src/datalake/catalog.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data lake catalog for discovering structure and metadata using AWS Athena/Glue.
|
| 3 |
+
|
| 4 |
+
Provides methods to explore the data lake organization using Athena metadata:
|
| 5 |
+
- List devices, messages, and dates from table structure
|
| 6 |
+
- Get schemas for message/rule tables
|
| 7 |
+
- Understand data availability
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from typing import List, Dict, Optional
|
| 11 |
+
import re
|
| 12 |
+
from .athena import AthenaQuery
|
| 13 |
+
from .config import DataLakeConfig
|
| 14 |
+
from .logger import setup_logger
|
| 15 |
+
|
| 16 |
+
logger = setup_logger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class DataLakeCatalog:
|
| 20 |
+
"""
|
| 21 |
+
Catalog for exploring data lake structure using Athena/Glue.
|
| 22 |
+
|
| 23 |
+
Assumes Athena database contains tables organized by device and message.
|
| 24 |
+
Table naming convention: {device_id}_{message_rule} or similar
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def __init__(self, athena_query: AthenaQuery, config: DataLakeConfig):
    """
    Initialize the catalog.

    Args:
        athena_query: AthenaQuery instance used for metadata queries
        config: DataLakeConfig instance
    """
    self.athena = athena_query
    self.config = config
    # Per-instance schema cache keyed by "{device_id}/{message}".
    self._cache: Dict[str, Dict] = {}
    logger.info(f"Initialized catalog for database: {config.database_name}")
| 40 |
+
def list_tables(self) -> List[str]:
    """
    List all tables in the database, sorted alphabetically.

    Returns:
        Sorted list of table names
    """
    table_names = self.athena.list_tables()
    logger.info(f"Found {len(table_names)} tables in database")
    return sorted(table_names)
| 51 |
+
def list_devices(self, device_filter: Optional[str] = None) -> List[str]:
    """
    List all device IDs by extracting from table names.

    Args:
        device_filter: Optional regex pattern to filter devices

    Returns:
        Sorted list of device IDs

    Note:
        Assumes table naming like {prefix}_{device_id}_{rest}: the second
        underscore-separated field is treated as the device ID. Tables
        without at least one underscore are skipped — the previous guard
        (`len(parts) >= 1`) indexed parts[1] unconditionally and raised
        IndexError for such names.
    """
    devices = set()

    for table in self.list_tables():
        # Equivalent to re.split(r'_', table, maxsplit=2), without regex.
        parts = table.split('_', 2)
        if len(parts) < 2:
            # No underscore => no device field to extract; skip safely.
            continue
        device = parts[1]
        if device == 'aggregations':  # skip aggregations table
            continue
        if device_filter is None or re.search(device_filter, device):
            devices.add(device)

    result = sorted(devices)
    logger.info(f"Found {len(result)} device(s)")
    return result
| 84 |
+
def list_messages(self, device_id: str, message_filter: Optional[str] = None) -> List[str]:
    """
    List all message/rule names for a device.

    Args:
        device_id: Device identifier
        message_filter: Optional regex pattern to filter messages

    Returns:
        Sorted list of message/rule names

    Note:
        Extracts message names from table names, trying the
        prefix_{device}_{message} layout first and falling back to the
        bare {device}_{message} layout (consistent with list_devices).
    """
    messages = set()

    for table in self.list_tables():
        parts = table.split('_', 2)

        if len(parts) >= 3 and parts[1] == device_id:
            # Layout: prefix_device_message
            candidate = parts[2]
        elif len(parts) == 2 and parts[0] == device_id:
            # Layout: device_message (no prefix)
            candidate = parts[1]
        else:
            continue

        if message_filter is None or re.search(message_filter, candidate):
            messages.add(candidate)

    result = sorted(messages)
    logger.info(f"Found {len(result)} messages for device {device_id}")
    return result
|
| 126 |
+
def get_table_name(self, device_id: str, message: str) -> str:
    """
    Resolve the Athena table name for a device/message combination.

    Args:
        device_id: Device identifier
        message: Message/rule name

    Returns:
        Matching table name (several naming patterns are tried in order)

    Raises:
        ValueError: If no table matches
    """
    tables = self.list_tables()

    # Pattern 1: prefix_device_message (device is the second field).
    for table in tables:
        parts = table.split('_', 2)
        if len(parts) >= 3 and parts[1] == device_id and parts[2] == message:
            return table

    # Pattern 2: device_message with no prefix.
    for table in tables:
        parts = table.split('_', 1)
        if len(parts) >= 2 and parts[0] == device_id and parts[1] == message:
            return table

    # Fallback: literal name candidates, including lowercase variants.
    candidates = [
        f"{device_id}_{message}",
        f"{device_id}__{message}",
        f"{device_id}_{message}".lower(),
        f"{device_id}__{message}".lower(),
    ]
    for candidate in candidates:
        if candidate in tables:
            return candidate

    raise ValueError(
        f"Table not found for {device_id}/{message}. "
        f"Available tables: {tables[:10]}..."
    )
| 174 |
+
def get_schema(self, device_id: str, message: str) -> Optional[Dict[str, str]]:
    """
    Get the column schema for a message table, with per-instance caching.

    Args:
        device_id: Device identifier
        message: Message/rule name

    Returns:
        Dict mapping column names to data types, or None if not found
    """
    cache_key = f"{device_id}/{message}"
    cached = self._cache.get(cache_key)
    if cached is not None:
        # Only successful lookups are cached, so a hit is always a dict.
        logger.debug(f"Using cached schema for {cache_key}")
        return cached

    try:
        table_name = self.get_table_name(device_id, message)
        schema_df = self.athena.describe_table(table_name)

        if schema_df.empty:
            logger.warning(f"No schema found for {device_id}/{message}")
            return None

        schema_dict = dict(zip(schema_df['column_name'], schema_df['data_type']))

        self._cache[cache_key] = schema_dict
        logger.info(f"Schema for {cache_key}: {len(schema_dict)} columns")
        return schema_dict
    except Exception as e:
        logger.error(f"Failed to get schema for {device_id}/{message}: {e}")
        return None
|
| 210 |
+
def list_partitions(self, device_id: str, message: str) -> List[str]:
|
| 211 |
+
"""
|
| 212 |
+
List partition values (dates) for a table.
|
| 213 |
+
|
| 214 |
+
Args:
|
| 215 |
+
device_id: Device identifier
|
| 216 |
+
message: Message/rule name
|
| 217 |
+
|
| 218 |
+
Returns:
|
| 219 |
+
List of partition values (dates) in YYYY-MM-DD format
|
| 220 |
+
|
| 221 |
+
Note:
|
| 222 |
+
Handles hierarchical partitioning format: year=YYYY/month=MM/day=DD
|
| 223 |
+
Data structure: {device_id}/{message}/{year}/{month}/{day}/file.parquet
|
| 224 |
+
"""
|
| 225 |
+
try:
|
| 226 |
+
table_name = self.get_table_name(device_id, message)
|
| 227 |
+
|
| 228 |
+
# Query partition information
|
| 229 |
+
# query = f"SHOW PARTITIONS {self.config.database_name}.{table_name}"
|
| 230 |
+
query = f"""
|
| 231 |
+
WITH files AS (
|
| 232 |
+
SELECT DISTINCT "$path" AS p
|
| 233 |
+
FROM {self.config.database_name}.{table_name}
|
| 234 |
+
WHERE "$path" LIKE '%.parquet'
|
| 235 |
+
),
|
| 236 |
+
parts AS (
|
| 237 |
+
SELECT
|
| 238 |
+
try_cast(element_at(split(p, '/'), -4) AS INTEGER) AS year,
|
| 239 |
+
try_cast(element_at(split(p, '/'), -3) AS INTEGER) AS month,
|
| 240 |
+
try_cast(element_at(split(p, '/'), -2) AS INTEGER) AS day
|
| 241 |
+
FROM files
|
| 242 |
+
)
|
| 243 |
+
SELECT DISTINCT year, month, day
|
| 244 |
+
FROM parts
|
| 245 |
+
WHERE year IS NOT NULL AND month IS NOT NULL AND day IS NOT NULL
|
| 246 |
+
ORDER BY year DESC, month DESC, day DESC
|
| 247 |
+
"""
|
| 248 |
+
df = self.athena.query_to_dataframe(query)
|
| 249 |
+
|
| 250 |
+
if df.empty:
|
| 251 |
+
logger.warning(f"No partitions found for {table_name}")
|
| 252 |
+
return []
|
| 253 |
+
|
| 254 |
+
# Extract date from partition string
|
| 255 |
+
# Format: YYYY-MM-DD
|
| 256 |
+
dates = []
|
| 257 |
+
for _, row in df.iterrows():
|
| 258 |
+
dates.append(f'{row.iloc[0]}-{row.iloc[1]}-{row.iloc[2]:02d}')
|
| 259 |
+
logger.info(f"Found {len(dates)} partitions for {table_name}")
|
| 260 |
+
return sorted(set(dates))
|
| 261 |
+
except Exception as e:
|
| 262 |
+
logger.warning(f"Could not list partitions for {device_id}/{message}: {e}")
|
| 263 |
+
# Table might not be partitioned or query might have failed
|
| 264 |
+
return []
|
| 265 |
+
|
| 266 |
+
    def clear_cache(self) -> None:
        """Clear schema cache."""
        # Empties the memoized device/message -> schema dict in place so the
        # next get_schema() call re-queries Athena.
        self._cache.clear()
        logger.debug("Schema cache cleared")
|
src/datalake/config.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration management for data lake access.
|
| 3 |
+
|
| 4 |
+
Supports AWS Athena-based data lakes with configuration from
|
| 5 |
+
CloudFormation stack outputs.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional
|
| 11 |
+
import boto3
|
| 12 |
+
from botocore.exceptions import ClientError
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class DataLakeConfig:
    """
    Data lake configuration for AWS Athena-based data lakes.

    Configuration can be loaded from CloudFormation stack outputs
    (``from_cloudformation``) or created directly with credentials
    (``from_credentials``).

    Attributes:
        stack_name: CloudFormation stack name (default: 'datalake-stack')
        database_name: Athena database name
        workgroup: Athena workgroup name (optional)
        s3_output_location: S3 location for query results (must end with /)
        region: AWS region
        profile: AWS profile name for credentials (optional)
        access_key_id: AWS access key ID (optional, for explicit credentials)
        secret_access_key: AWS secret access key (optional, for explicit credentials)
        device_filter: Optional device ID filter (e.g., 'device_001')
        message_filter: Optional message/rule filter (e.g., 'CAN_Message_001')
        cache_enabled: Enable schema caching
    """
    stack_name: str = "datalake-stack"
    database_name: Optional[str] = None
    workgroup: Optional[str] = None
    s3_output_location: Optional[str] = None
    region: str = "us-east-1"
    profile: Optional[str] = None
    access_key_id: Optional[str] = None
    secret_access_key: Optional[str] = None
    device_filter: Optional[str] = None
    message_filter: Optional[str] = None
    cache_enabled: bool = True

    @classmethod
    def from_cloudformation(
        cls,
        stack_name: str = "datalake-stack",
        region: Optional[str] = None,
        profile: Optional[str] = None,
    ) -> "DataLakeConfig":
        """
        Load config from CloudFormation stack outputs.

        Args:
            stack_name: CloudFormation stack name (default: 'datalake-stack')
            region: AWS region (if None, will try to get from stack or use default)
            profile: AWS profile name for credentials (optional)

        Returns:
            DataLakeConfig instance with values from stack outputs

        Raises:
            ClientError: If stack doesn't exist or can't be accessed
            KeyError: If required stack outputs are missing

        Expected CloudFormation stack outputs:
            - DatabaseName: Athena database name (required)
            - WorkGroup: Athena workgroup name (optional)
            - S3OutputLocation: S3 location for Athena query results (required)
            - Region: AWS region (optional, will use provided region or default)
        """
        session = boto3.Session(profile_name=profile)
        if region:
            cf_client = session.client('cloudformation', region_name=region)
        else:
            # Fall back to the session's default region.
            # BUG FIX: this was a bare `except:`, which also swallows
            # KeyboardInterrupt/SystemExit; narrowed to `except Exception:`.
            try:
                region = session.region_name or "us-east-1"
            except Exception:
                region = "us-east-1"
            cf_client = session.client('cloudformation', region_name=region)

        try:
            response = cf_client.describe_stacks(StackName=stack_name)
        except ClientError as e:
            # Re-raise with a friendlier message, preserving the cause chain.
            raise ClientError(
                {
                    'Error': {
                        'Code': 'StackNotFound',
                        'Message': f"CloudFormation stack '{stack_name}' not found. "
                                   f"Make sure the stack exists and you have permissions to access it."
                    }
                },
                'DescribeStacks'
            ) from e

        if not response['Stacks']:
            raise ValueError(f"Stack '{stack_name}' not found")

        stack = response['Stacks'][0]
        outputs = {output['OutputKey']: output['OutputValue']
                   for output in stack.get('Outputs', [])}

        # Get region from stack or use provided/default.
        # (Defensive: region is normally already resolved above.)
        if not region:
            region = outputs.get('Region', session.region_name or "us-east-1")

        # Required outputs
        database_name = outputs.get('DatabaseName')
        if not database_name:
            raise KeyError(
                f"Required output 'DatabaseName' not found in stack '{stack_name}'. "
                f"Available outputs: {list(outputs.keys())}"
            )

        s3_output_location = outputs.get('S3OutputLocation')
        if not s3_output_location:
            raise KeyError(
                f"Required output 'S3OutputLocation' not found in stack '{stack_name}'. "
                f"Available outputs: {list(outputs.keys())}"
            )

        # Optional outputs
        workgroup = outputs.get('WorkGroup')

        return cls(
            stack_name=stack_name,
            database_name=database_name,
            workgroup=workgroup,
            s3_output_location=s3_output_location,
            region=region,
            profile=profile,
        )

    @classmethod
    def from_credentials(
        cls,
        database_name: str,
        workgroup: str,
        s3_output_location: str,
        region: str,
        access_key_id: str,
        secret_access_key: str,
    ) -> "DataLakeConfig":
        """
        Create config directly with AWS credentials.

        Args:
            database_name: Athena database name
            workgroup: Athena workgroup name
            s3_output_location: S3 location for query results (a trailing /
                is appended automatically when missing)
            region: AWS region
            access_key_id: AWS access key ID
            secret_access_key: AWS secret access key

        Returns:
            DataLakeConfig instance
        """
        # Athena requires the output location to be a "directory" URI.
        if s3_output_location and not s3_output_location.endswith('/'):
            s3_output_location = s3_output_location + '/'

        return cls(
            database_name=database_name,
            workgroup=workgroup,
            s3_output_location=s3_output_location,
            region=region,
            access_key_id=access_key_id,
            secret_access_key=secret_access_key,
        )

    # Return annotation is a string so the class can be defined without
    # resolving boto3 symbols at class-creation time (lazy annotation).
    def get_boto3_session(self) -> "boto3.Session":
        """
        Get boto3 session with configured credentials, profile, and region.

        Returns:
            boto3.Session instance
        """
        if self.access_key_id and self.secret_access_key:
            # Use explicit credentials
            return boto3.Session(
                aws_access_key_id=self.access_key_id,
                aws_secret_access_key=self.secret_access_key,
                region_name=self.region,
            )
        else:
            # Use profile or IAM role (standard credential chain)
            return boto3.Session(profile_name=self.profile, region_name=self.region)
|
src/datalake/logger.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Structured logging utilities for the datalake package.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def setup_logger(name: str, level: str = "INFO") -> logging.Logger:
    """
    Initialize logger with structured output.

    Args:
        name: Logger module name
        level: Logging level (INFO, DEBUG, WARNING, ERROR); unrecognized
            values fall back to INFO

    Returns:
        Configured logger instance
    """
    log = logging.getLogger(name)

    # Already configured: return as-is so repeated calls don't stack
    # duplicate handlers (and don't touch the existing level).
    if log.handlers:
        return log

    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    log.addHandler(stream)
    log.setLevel(getattr(logging, level.upper(), logging.INFO))
    return log
|
src/datalake/query.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Query interface for data lake using AWS Athena SQL queries.
|
| 3 |
+
|
| 4 |
+
Provides methods to read and filter data from the Athena data lake
|
| 5 |
+
using SQL queries with support for device/message filtering and date ranges.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import List, Optional, Tuple
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from .athena import AthenaQuery
|
| 11 |
+
from .catalog import DataLakeCatalog
|
| 12 |
+
from .config import DataLakeConfig
|
| 13 |
+
from .logger import setup_logger
|
| 14 |
+
|
| 15 |
+
logger = setup_logger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DataLakeQuery:
    """
    Query interface for Athena-based data lake.

    Provides efficient methods to read data using SQL queries,
    with support for filtering by device, message, date range, and time windows.

    NOTE(review): SQL text is assembled with f-strings. Table names come from
    the catalog, but signal names and raw WHERE clauses are caller-supplied
    and interpolated verbatim — callers must pass trusted input.
    """

    def __init__(self, athena_query: AthenaQuery, catalog: DataLakeCatalog):
        """
        Initialize query engine.

        Args:
            athena_query: AthenaQuery instance used to execute SQL
            catalog: Data lake catalog used to resolve table names and schemas
        """
        self.athena = athena_query
        self.catalog = catalog
        logger.info("Initialized DataLakeQuery")

    def read_device_message(
        self,
        device_id: str,
        message: str,
        date_range: Optional[Tuple[str, str]] = None,
        columns: Optional[List[str]] = None,
        limit: Optional[int] = None,
    ) -> pd.DataFrame:
        """
        Read all data for a device/message combination using SQL.

        Args:
            device_id: Device identifier
            message: Message/rule name
            date_range: Optional (start_date, end_date) tuple (YYYY-MM-DD
                format, inclusive on both ends)
            columns: Optional column subset to read (improves performance);
                columns not present in the schema are dropped, and if none
                remain all columns are selected
            limit: Optional row limit

        Returns:
            DataFrame with query results
        """
        table_name = self.catalog.get_table_name(device_id, message)

        # Build SELECT clause
        if columns:
            # Validate columns exist; fall back to SELECT * when the schema
            # is unavailable or no requested column matches.
            schema = self.catalog.get_schema(device_id, message)
            if schema:
                valid_columns = [c for c in columns if c in schema]
                if not valid_columns:
                    logger.warning(f"None of requested columns found, using all columns")
                    select_clause = "*"
                else:
                    select_clause = ", ".join(valid_columns)
            else:
                select_clause = "*"
        else:
            select_clause = "*"

        # Build WHERE clause
        where_conditions = []

        if date_range:
            start_date, end_date = date_range
            # Parse dates and filter using $path column
            # Format: YYYY-MM-DD
            # Data structure: {device_id}/{message}/{year}/{month}/{day}/file.parquet
            start_parts = start_date.split('-')
            end_parts = end_date.split('-')

            if len(start_parts) == 3 and len(end_parts) == 3:
                start_year, start_month, start_day = start_parts
                end_year, end_month, end_day = end_parts

                # Extract year, month, day from path and filter
                # Path structure: .../year/month/day/file.parquet
                # Use element_at(split($path, '/'), -4) for year, -3 for month, -2 for day
                path_year = "try_cast(element_at(split(\"$path\", '/'), -4) AS INTEGER)"
                path_month = "try_cast(element_at(split(\"$path\", '/'), -3) AS INTEGER)"
                path_day = "try_cast(element_at(split(\"$path\", '/'), -2) AS INTEGER)"

                # Build partition filter using path-based extraction
                # This handles hierarchical partitioning: {device_id}/{message}/{year}/{month}/{day}/file.parquet
                # NOTE(review): the date components are interpolated verbatim, so a
                # zero-padded month like "03" becomes the SQL literal 03 — confirm
                # Athena/Trino accepts leading zeros in integer literals.
                # Lower bound: (year, month, day) >= (start_year, start_month, start_day)
                where_conditions.append(
                    f"({path_year} > {start_year} OR "
                    f"({path_year} = {start_year} AND "
                    f"({path_month} > {start_month} OR "
                    f"({path_month} = {start_month} AND {path_day} >= {start_day}))))"
                )
                # Upper bound: (year, month, day) <= (end_year, end_month, end_day)
                where_conditions.append(
                    f"({path_year} < {end_year} OR "
                    f"({path_year} = {end_year} AND "
                    f"({path_month} < {end_month} OR "
                    f"({path_month} = {end_month} AND {path_day} <= {end_day}))))"
                )
            else:
                # Fallback: try date column if it exists
                # NOTE(review): assumes the table actually has a `date` column —
                # the query fails otherwise; verify against the schema.
                where_conditions.append(f"date >= '{start_date}' AND date <= '{end_date}'")

        where_clause = ""
        if where_conditions:
            where_clause = "WHERE " + " AND ".join(where_conditions)

        # Build LIMIT clause
        limit_clause = f"LIMIT {limit}" if limit else ""

        query = f"""
        SELECT {select_clause}
        FROM {self.catalog.config.database_name}.{table_name}
        {where_clause}
        {limit_clause}
        """

        logger.info(f"Executing query for {device_id}/{message}")
        return self.athena.query_to_dataframe(query)

    def read_date_range(
        self,
        device_id: str,
        message: str,
        start_date: str,
        end_date: str,
        columns: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """
        Read data for a specific date range.

        Convenience method wrapping read_device_message with date range.

        Args:
            device_id: Device identifier
            message: Message name
            start_date: Start date (YYYY-MM-DD format)
            end_date: End date (YYYY-MM-DD format)
            columns: Optional column subset

        Returns:
            DataFrame with data for the date range
        """
        return self.read_device_message(
            device_id=device_id,
            message=message,
            date_range=(start_date, end_date),
            columns=columns,
        )

    def time_series_query(
        self,
        device_id: str,
        message: str,
        signal_name: str,
        start_time: Optional[int] = None,
        end_time: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> pd.DataFrame:
        """
        Query single signal as time series.

        Args:
            device_id: Device identifier
            message: Message name
            signal_name: Signal column name (interpolated verbatim into SQL —
                must be a trusted, valid column identifier)
            start_time: Min timestamp (microseconds since epoch, inclusive)
            end_time: Max timestamp (microseconds since epoch, inclusive)
            limit: Optional row limit

        Returns:
            DataFrame with 't' (timestamp) and signal columns, sorted by time
        """
        table_name = self.catalog.get_table_name(device_id, message)

        # Build WHERE clause (both bounds optional and inclusive)
        where_conditions = []

        if start_time is not None:
            where_conditions.append(f"t >= {start_time}")
        if end_time is not None:
            where_conditions.append(f"t <= {end_time}")

        where_clause = ""
        if where_conditions:
            where_clause = "WHERE " + " AND ".join(where_conditions)

        limit_clause = f"LIMIT {limit}" if limit else ""

        query = f"""
        SELECT t, {signal_name}
        FROM {self.catalog.config.database_name}.{table_name}
        {where_clause}
        ORDER BY t
        {limit_clause}
        """

        logger.info(f"Time series query for {device_id}/{message}/{signal_name}")
        return self.athena.query_to_dataframe(query)

    def execute_sql(self, sql: str) -> pd.DataFrame:
        """
        Execute custom SQL query.

        Args:
            sql: SQL query string (passed through to Athena unmodified)

        Returns:
            DataFrame with query results

        Note:
            Query should reference tables in the format:
            {database_name}.{table_name}
        """
        logger.info("Executing custom SQL query")
        return self.athena.query_to_dataframe(sql)

    def aggregate(
        self,
        device_id: str,
        message: str,
        aggregation: str,
        group_by: Optional[List[str]] = None,
        where_clause: Optional[str] = None,
    ) -> pd.DataFrame:
        """
        Execute aggregation query.

        Args:
            device_id: Device identifier
            message: Message name
            aggregation: Aggregation expression (e.g., "COUNT(*), AVG(RPM)");
                interpolated verbatim into the SELECT list
            group_by: Optional list of columns to group by
            where_clause: Optional WHERE clause (without WHERE keyword)

        Returns:
            DataFrame with aggregation results

        Example:
            df = query.aggregate(
                "device_001", "EngineData",
                "COUNT(*) as count, AVG(RPM) as avg_rpm, MIN(RPM) as min_rpm",
                group_by=["date"]
            )
        """
        table_name = self.catalog.get_table_name(device_id, message)

        group_by_clause = ""
        if group_by:
            group_by_clause = f"GROUP BY {', '.join(group_by)}"

        where_clause_sql = ""
        if where_clause:
            where_clause_sql = f"WHERE {where_clause}"

        query = f"""
        SELECT {aggregation}
        FROM {self.catalog.config.database_name}.{table_name}
        {where_clause_sql}
        {group_by_clause}
        """

        logger.info(f"Aggregation query for {device_id}/{message}")
        return self.athena.query_to_dataframe(query)
|
src/examples/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Examples package
|
src/examples/batch_example.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example: Batch processing patterns for large-scale analysis using Athena.
|
| 3 |
+
|
| 4 |
+
This script demonstrates memory-efficient batch processing across
|
| 5 |
+
the entire data lake using SQL queries.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from datalake.config import DataLakeConfig
|
| 9 |
+
from datalake.athena import AthenaQuery
|
| 10 |
+
from datalake.catalog import DataLakeCatalog
|
| 11 |
+
from datalake.query import DataLakeQuery
|
| 12 |
+
from datalake.batch import BatchProcessor
|
| 13 |
+
import pandas as pd
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def main():
    """Run batch-processing examples against the Athena data lake.

    SECURITY FIX: this example previously embedded a long-lived AWS access
    key id and secret directly in source. Credentials now come from the
    environment:

        AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY   (required)
        DATALAKE_DATABASE, DATALAKE_WORKGROUP,
        DATALAKE_S3_OUTPUT, AWS_REGION              (optional overrides)

    Raises:
        KeyError: if the required credential variables are not set.
    """
    import os

    # Load config with credentials from the environment (never hardcode keys).
    config = DataLakeConfig.from_credentials(
        database_name=os.environ.get("DATALAKE_DATABASE", "dbparquetdatalake05"),
        workgroup=os.environ.get("DATALAKE_WORKGROUP", "athenaworkgroup-datalake05"),
        s3_output_location=os.environ.get(
            "DATALAKE_S3_OUTPUT", "s3://canedge-raw-data-parquet/athena-results/"
        ),
        region=os.environ.get("AWS_REGION", "eu-north-1"),
        access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    )

    athena = AthenaQuery(config)
    catalog = DataLakeCatalog(athena, config)
    query = DataLakeQuery(athena, catalog)
    processor = BatchProcessor(query)

    print("=" * 60)
    print("Batch Processing Examples (Athena)")
    print("=" * 60)
    print()

    # Example 1: Compute statistics across all data
    print("Example 1: Compute statistics across all devices/messages")
    print("-" * 60)

    try:
        stats = processor.aggregate_by_device_message(
            aggregation_func=processor.compute_statistics,
            message_filter=config.message_filter,  # Optional filter
        )

        print(f"Processed {len(stats)} device(s):")
        for device, messages in stats.items():
            print(f"\n  Device: {device}")
            for message, metrics in messages.items():
                print(f"    Message: {message}")
                print(f"    Record count: {metrics.get('count', 0):,}")

                # Show statistics for first numeric column found
                for key, value in metrics.items():
                    if key != 'count' and '_mean' in key:
                        signal = key.replace('_mean', '')
                        print(f"    {signal}:")
                        print(f"      Mean: {value:.2f}")
                        print(f"      Min: {metrics.get(f'{signal}_min', 'N/A')}")
                        print(f"      Max: {metrics.get(f'{signal}_max', 'N/A')}")
                        break
    except Exception as e:
        print(f"Error in batch aggregation: {e}")
    print()

    # Example 2: Custom aggregation using SQL
    print("Example 2: Custom SQL aggregation")
    print("-" * 60)

    try:
        devices = catalog.list_devices()
        if devices:
            device_id = devices[0]
            messages = catalog.list_messages(device_id)
            if messages:
                message = messages[0]
                table_name = catalog.get_table_name(device_id, message)

                # Use SQL for aggregation
                sql = f"""
                SELECT
                    COUNT(*) as record_count,
                    MIN(t) as min_timestamp,
                    MAX(t) as max_timestamp
                FROM {config.database_name}.{table_name}
                """

                df_agg = query.execute_sql(sql)
                print(f"Aggregation for {device_id}/{message}:")
                print(df_agg)
    except Exception as e:
        print(f"Error in SQL aggregation: {e}")
    print()

    # Example 3: Export specific data
    print("Example 3: Export data to CSV")
    print("-" * 60)

    try:
        devices = catalog.list_devices()
        if devices:
            device_id = devices[0]
            messages = catalog.list_messages(device_id)
            if messages:
                message = messages[0]
                output_path = f"{device_id}_{message}_export.csv"

                processor.export_to_csv(
                    device_id=device_id,
                    message=message,
                    output_path=output_path,
                    limit=10000,  # Limit for example
                )
                print(f"Exported to: {output_path}")
    except Exception as e:
        print(f"Error exporting data: {e}")
    print()

    # Example 4: Find anomalies using SQL
    print("Example 4: Find anomalies using SQL")
    print("-" * 60)

    try:
        devices = catalog.list_devices()
        if devices:
            device_id = devices[0]
            messages = catalog.list_messages(device_id)
            if messages:
                message = messages[0]
                schema = catalog.get_schema(device_id, message)

                if schema:
                    signal_cols = [c for c in schema.keys() if c != 't' and c.lower() != 'date']
                    if signal_cols:
                        signal_name = signal_cols[0]
                        table_name = catalog.get_table_name(device_id, message)

                        # Use SQL to find outliers (3 standard deviations)
                        sql = f"""
                        WITH stats AS (
                            SELECT
                                AVG({signal_name}) as mean_val,
                                STDDEV({signal_name}) as std_val
                            FROM {config.database_name}.{table_name}
                            WHERE {signal_name} IS NOT NULL
                        )
                        SELECT t, {signal_name}
                        FROM {config.database_name}.{table_name}, stats
                        WHERE {signal_name} IS NOT NULL
                        AND ABS({signal_name} - mean_val) > 3 * std_val
                        ORDER BY ABS({signal_name} - mean_val) DESC
                        LIMIT 10
                        """

                        anomalies = query.execute_sql(sql)
                        if not anomalies.empty:
                            print(f"Found {len(anomalies)} anomalies in {signal_name}")
                            print(anomalies.head())
                        else:
                            print("No anomalies found")
    except Exception as e:
        print(f"Error finding anomalies: {e}")


if __name__ == "__main__":
    main()
|
src/examples/explore_example.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example: Explore data lake structure using Athena.
|
| 3 |
+
|
| 4 |
+
This script demonstrates how to discover devices, messages, dates,
|
| 5 |
+
and schemas in the CANedge Athena data lake.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from datalake.config import DataLakeConfig
|
| 9 |
+
from datalake.athena import AthenaQuery
|
| 10 |
+
from datalake.catalog import DataLakeCatalog
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main():
|
| 14 |
+
"""Explore data lake structure."""
|
| 15 |
+
# Load config with explicit credentials
|
| 16 |
+
config = DataLakeConfig.from_credentials(
|
| 17 |
+
database_name="dbparquetdatalake05",
|
| 18 |
+
workgroup="athenaworkgroup-datalake05",
|
| 19 |
+
s3_output_location="s3://canedge-raw-data-parquet/athena-results/",
|
| 20 |
+
region="eu-north-1",
|
| 21 |
+
access_key_id="AKIARJQJFFVASPMSGNNY",
|
| 22 |
+
secret_access_key="Z6ISPZJvvcv13JZKYyuUxiMRZvDrvfoWs4YTUBnh",
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# Initialize Athena and catalog
|
| 26 |
+
athena = AthenaQuery(config)
|
| 27 |
+
catalog = DataLakeCatalog(athena, config)
|
| 28 |
+
|
| 29 |
+
# List available devices
|
| 30 |
+
print("=" * 60)
|
| 31 |
+
print("Exploring Data Lake (Athena)")
|
| 32 |
+
print("=" * 60)
|
| 33 |
+
print(f"Database: {config.database_name}")
|
| 34 |
+
print(f"Region: {config.region}")
|
| 35 |
+
print(f"Workgroup: {config.workgroup}")
|
| 36 |
+
print()
|
| 37 |
+
|
| 38 |
+
# List all tables
|
| 39 |
+
try:
|
| 40 |
+
tables = catalog.list_tables()
|
| 41 |
+
print(f"Found {len(tables)} table(s) in database")
|
| 42 |
+
if tables:
|
| 43 |
+
print(f"Sample tables: {tables[:10]}")
|
| 44 |
+
print()
|
| 45 |
+
except Exception as e:
|
| 46 |
+
print(f"Error listing tables: {e}")
|
| 47 |
+
return
|
| 48 |
+
|
| 49 |
+
# List devices
|
| 50 |
+
try:
|
| 51 |
+
devices = catalog.list_devices(device_filter=config.device_filter)
|
| 52 |
+
print(f"Found {len(devices)} device(s):")
|
| 53 |
+
for device in devices:
|
| 54 |
+
print(f" - {device}")
|
| 55 |
+
except Exception as e:
|
| 56 |
+
print(f"Error listing devices: {e}")
|
| 57 |
+
return
|
| 58 |
+
|
| 59 |
+
# List messages for first device
|
| 60 |
+
if devices:
|
| 61 |
+
device_id = devices[0]
|
| 62 |
+
print(f"\nMessages for device '{device_id}':")
|
| 63 |
+
try:
|
| 64 |
+
messages = catalog.list_messages(device_id, message_filter=config.message_filter)
|
| 65 |
+
|
| 66 |
+
for message in messages:
|
| 67 |
+
print(f" - {message}")
|
| 68 |
+
|
| 69 |
+
# Get schema
|
| 70 |
+
try:
|
| 71 |
+
schema = catalog.get_schema(device_id, message)
|
| 72 |
+
|
| 73 |
+
if schema:
|
| 74 |
+
print(f" Schema: {len(schema)} column(s)")
|
| 75 |
+
print(f" Columns: {', '.join(list(schema.keys())[:5])}")
|
| 76 |
+
if len(schema) > 5:
|
| 77 |
+
print(f" ... and {len(schema) - 5} more")
|
| 78 |
+
except Exception as e:
|
| 79 |
+
print(f" Error getting schema: {e}")
|
| 80 |
+
|
| 81 |
+
# Try to list partitions (dates)
|
| 82 |
+
try:
|
| 83 |
+
partitions = catalog.list_partitions(device_id, message)
|
| 84 |
+
if partitions:
|
| 85 |
+
print(f" Partitions: {len(partitions)} date(s)")
|
| 86 |
+
if partitions:
|
| 87 |
+
print(f" Date range: {partitions[0]} to {partitions[-1]}")
|
| 88 |
+
except Exception as e:
|
| 89 |
+
print(f" Could not list partitions: {e}")
|
| 90 |
+
print()
|
| 91 |
+
except Exception as e:
|
| 92 |
+
print(f"Error listing messages: {e}")
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
if __name__ == "__main__":
|
| 96 |
+
main()
|
src/examples/query_example.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example: Query and analyze data from the Athena data lake.
|
| 3 |
+
|
| 4 |
+
This script demonstrates how to read data for specific devices/messages,
|
| 5 |
+
perform time series queries, and filter by date ranges using SQL.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from datalake.config import DataLakeConfig
|
| 9 |
+
from datalake.athena import AthenaQuery
|
| 10 |
+
from datalake.catalog import DataLakeCatalog
|
| 11 |
+
from datalake.query import DataLakeQuery
|
| 12 |
+
import pandas as pd
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def main():
|
| 16 |
+
"""Query and analyze data."""
|
| 17 |
+
# Setup
|
| 18 |
+
# Load config with explicit credentials
|
| 19 |
+
config = DataLakeConfig.from_credentials(
|
| 20 |
+
database_name="dbparquetdatalake05",
|
| 21 |
+
workgroup="athenaworkgroup-datalake05",
|
| 22 |
+
s3_output_location="s3://canedge-raw-data-parquet/athena-results/",
|
| 23 |
+
region="eu-north-1",
|
| 24 |
+
access_key_id="AKIARJQJFFVASPMSGNNY",
|
| 25 |
+
secret_access_key="Z6ISPZJvvcv13JZKYyuUxiMRZvDrvfoWs4YTUBnh",
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
athena = AthenaQuery(config)
|
| 29 |
+
catalog = DataLakeCatalog(athena, config)
|
| 30 |
+
query = DataLakeQuery(athena, catalog)
|
| 31 |
+
|
| 32 |
+
# Get first available device and message
|
| 33 |
+
try:
|
| 34 |
+
devices = catalog.list_devices()
|
| 35 |
+
if not devices:
|
| 36 |
+
print("No devices found in data lake")
|
| 37 |
+
return
|
| 38 |
+
|
| 39 |
+
device_id = devices[0]
|
| 40 |
+
messages = catalog.list_messages(device_id)
|
| 41 |
+
if not messages:
|
| 42 |
+
print(f"No messages found for device {device_id}")
|
| 43 |
+
return
|
| 44 |
+
|
| 45 |
+
message = messages[0]
|
| 46 |
+
except Exception as e:
|
| 47 |
+
print(f"Error discovering devices/messages: {e}")
|
| 48 |
+
return
|
| 49 |
+
|
| 50 |
+
print("=" * 60)
|
| 51 |
+
print("Querying Data Lake (Athena)")
|
| 52 |
+
print("=" * 60)
|
| 53 |
+
print(f"Device: {device_id}")
|
| 54 |
+
print(f"Message: {message}")
|
| 55 |
+
print()
|
| 56 |
+
|
| 57 |
+
# Example 1: Read all data for device/message
|
| 58 |
+
print("Example 1: Read all data")
|
| 59 |
+
print("-" * 60)
|
| 60 |
+
try:
|
| 61 |
+
df = query.read_device_message(
|
| 62 |
+
device_id=device_id,
|
| 63 |
+
message=message,
|
| 64 |
+
columns=["t"], # Only read timestamp initially to check structure
|
| 65 |
+
limit=100, # Limit for example
|
| 66 |
+
)
|
| 67 |
+
print(f"Loaded {len(df)} records")
|
| 68 |
+
if not df.empty:
|
| 69 |
+
print(f"Columns: {list(df.columns)}")
|
| 70 |
+
if 't' in df.columns:
|
| 71 |
+
print(f"Time range: {df['t'].min()} to {df['t'].max()} microseconds")
|
| 72 |
+
print(f"Sample data:")
|
| 73 |
+
print(df.head())
|
| 74 |
+
except Exception as e:
|
| 75 |
+
print(f"Error reading data: {e}")
|
| 76 |
+
print()
|
| 77 |
+
|
| 78 |
+
# Example 2: Read with date range
|
| 79 |
+
print("Example 2: Read with date range")
|
| 80 |
+
print("-" * 60)
|
| 81 |
+
try:
|
| 82 |
+
partitions = catalog.list_partitions(device_id, message)
|
| 83 |
+
if partitions:
|
| 84 |
+
start_date = partitions[0]
|
| 85 |
+
end_date = partitions[-1] if len(partitions) > 1 else partitions[0]
|
| 86 |
+
print(f"Date range: {start_date} to {end_date}")
|
| 87 |
+
|
| 88 |
+
df_date = query.read_date_range(
|
| 89 |
+
device_id=device_id,
|
| 90 |
+
message=message,
|
| 91 |
+
start_date=start_date,
|
| 92 |
+
end_date=end_date,
|
| 93 |
+
limit=100,
|
| 94 |
+
)
|
| 95 |
+
print(f"Loaded {len(df_date)} records for date range")
|
| 96 |
+
except Exception as e:
|
| 97 |
+
print(f"Error reading date range: {e}")
|
| 98 |
+
print()
|
| 99 |
+
|
| 100 |
+
# Example 3: Time series query (if signal columns exist)
|
| 101 |
+
print("Example 3: Time series query")
|
| 102 |
+
print("-" * 60)
|
| 103 |
+
try:
|
| 104 |
+
schema = catalog.get_schema(device_id, message)
|
| 105 |
+
if schema:
|
| 106 |
+
# Find first signal column (not 't')
|
| 107 |
+
signal_cols = [c for c in schema.keys() if c != 't' and c.lower() != 'date']
|
| 108 |
+
if signal_cols:
|
| 109 |
+
signal_name = signal_cols[0]
|
| 110 |
+
print(f"Querying signal: {signal_name}")
|
| 111 |
+
|
| 112 |
+
df_ts = query.time_series_query(
|
| 113 |
+
device_id=device_id,
|
| 114 |
+
message=message,
|
| 115 |
+
signal_name=signal_name,
|
| 116 |
+
limit=100,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
if not df_ts.empty:
|
| 120 |
+
print(f"Time series: {len(df_ts)} records")
|
| 121 |
+
# Convert timestamp to datetime for display
|
| 122 |
+
if 't' in df_ts.columns:
|
| 123 |
+
df_ts['timestamp'] = pd.to_datetime(df_ts['t'], unit='us')
|
| 124 |
+
print(df_ts[['timestamp', signal_name]].head())
|
| 125 |
+
|
| 126 |
+
# Basic statistics
|
| 127 |
+
print(f"\nStatistics for {signal_name}:")
|
| 128 |
+
print(f" Mean: {df_ts[signal_name].mean():.2f}")
|
| 129 |
+
print(f" Min: {df_ts[signal_name].min():.2f}")
|
| 130 |
+
print(f" Max: {df_ts[signal_name].max():.2f}")
|
| 131 |
+
except Exception as e:
|
| 132 |
+
print(f"Error in time series query: {e}")
|
| 133 |
+
print()
|
| 134 |
+
|
| 135 |
+
# Example 4: Custom SQL query
|
| 136 |
+
print("Example 4: Custom SQL query")
|
| 137 |
+
print("-" * 60)
|
| 138 |
+
try:
|
| 139 |
+
table_name = catalog.get_table_name(device_id, message)
|
| 140 |
+
custom_sql = f"""
|
| 141 |
+
SELECT COUNT(*) as record_count,
|
| 142 |
+
MIN(t) as min_time,
|
| 143 |
+
MAX(t) as max_time
|
| 144 |
+
FROM {config.database_name}.{table_name}
|
| 145 |
+
LIMIT 1
|
| 146 |
+
"""
|
| 147 |
+
|
| 148 |
+
df_custom = query.execute_sql(custom_sql)
|
| 149 |
+
print("Custom query results:")
|
| 150 |
+
print(df_custom)
|
| 151 |
+
except Exception as e:
|
| 152 |
+
print(f"Error in custom SQL query: {e}")
|
| 153 |
+
print()
|
| 154 |
+
|
| 155 |
+
# Example 5: Aggregation query
|
| 156 |
+
print("Example 5: Aggregation query")
|
| 157 |
+
print("-" * 60)
|
| 158 |
+
try:
|
| 159 |
+
partitions = catalog.list_partitions(device_id, message)
|
| 160 |
+
if partitions:
|
| 161 |
+
# Filter by date using path-based extraction
|
| 162 |
+
# Data structure: {device_id}/{message}/{year}/{month}/{day}/file.parquet
|
| 163 |
+
target_date = partitions[0]
|
| 164 |
+
date_parts = target_date.split('-')
|
| 165 |
+
if len(date_parts) == 3:
|
| 166 |
+
year, month, day = date_parts
|
| 167 |
+
# Use path-based filtering consistent with data architecture
|
| 168 |
+
path_year = "try_cast(element_at(split(\"$path\", '/'), -4) AS INTEGER)"
|
| 169 |
+
path_month = "try_cast(element_at(split(\"$path\", '/'), -3) AS INTEGER)"
|
| 170 |
+
path_day = "try_cast(element_at(split(\"$path\", '/'), -2) AS INTEGER)"
|
| 171 |
+
where_clause = f"{path_year} = {year} AND {path_month} = {month} AND {path_day} = {day}"
|
| 172 |
+
else:
|
| 173 |
+
where_clause = None
|
| 174 |
+
|
| 175 |
+
df_agg = query.aggregate(
|
| 176 |
+
device_id=device_id,
|
| 177 |
+
message=message,
|
| 178 |
+
aggregation="COUNT(*) as count, AVG(t) as avg_time",
|
| 179 |
+
where_clause=where_clause,
|
| 180 |
+
)
|
| 181 |
+
print("Aggregation results:")
|
| 182 |
+
print(df_agg)
|
| 183 |
+
except Exception as e:
|
| 184 |
+
print(f"Error in aggregation query: {e}")
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
if __name__ == "__main__":
|
| 188 |
+
main()
|
src/explore_datalake.ipynb
ADDED
|
@@ -0,0 +1,1165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# CANedge Data Lake Explorer\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"This notebook helps you explore and analyze your CANedge data lake using AWS Athena.\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"## Setup\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"First, let's configure the connection and test it."
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "code",
|
| 18 |
+
"execution_count": 1,
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"outputs": [
|
| 21 |
+
{
|
| 22 |
+
"name": "stdout",
|
| 23 |
+
"output_type": "stream",
|
| 24 |
+
"text": [
|
| 25 |
+
"✓ Libraries imported successfully\n"
|
| 26 |
+
]
|
| 27 |
+
}
|
| 28 |
+
],
|
| 29 |
+
"source": [
|
| 30 |
+
"# Import required libraries\n",
|
| 31 |
+
"import pandas as pd\n",
|
| 32 |
+
"import matplotlib.pyplot as plt\n",
|
| 33 |
+
"import seaborn as sns\n",
|
| 34 |
+
"from datalake.config import DataLakeConfig\n",
|
| 35 |
+
"from datalake.athena import AthenaQuery\n",
|
| 36 |
+
"from datalake.catalog import DataLakeCatalog\n",
|
| 37 |
+
"from datalake.query import DataLakeQuery\n",
|
| 38 |
+
"from datalake.batch import BatchProcessor\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"# Set up plotting\n",
|
| 41 |
+
"%matplotlib inline\n",
|
| 42 |
+
"plt.style.use('seaborn-v0_8')\n",
|
| 43 |
+
"sns.set_palette(\"husl\")\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"print(\"✓ Libraries imported successfully\")"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"cell_type": "code",
|
| 50 |
+
"execution_count": 2,
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [
|
| 53 |
+
{
|
| 54 |
+
"name": "stdout",
|
| 55 |
+
"output_type": "stream",
|
| 56 |
+
"text": [
|
| 57 |
+
"✓ Configuration loaded\n",
|
| 58 |
+
" Database: dbparquetdatalake05\n",
|
| 59 |
+
" Workgroup: athenaworkgroup-datalake05\n",
|
| 60 |
+
" Region: eu-north-1\n"
|
| 61 |
+
]
|
| 62 |
+
}
|
| 63 |
+
],
|
| 64 |
+
"source": [
|
| 65 |
+
"# Configure connection with your credentials\n",
|
| 66 |
+
"config = DataLakeConfig.from_credentials(\n",
|
| 67 |
+
" database_name=\"dbparquetdatalake05\",\n",
|
| 68 |
+
" workgroup=\"athenaworkgroup-datalake05\",\n",
|
| 69 |
+
" s3_output_location=\"s3://canedge-raw-data-parquet/athena-results/\",\n",
|
| 70 |
+
" region=\"eu-north-1\",\n",
|
| 71 |
+
" access_key_id=\"AKIARJQJFFVASPMSGNNY\",\n",
|
| 72 |
+
" secret_access_key=\"Z6ISPZJvvcv13JZKYyuUxiMRZvDrvfoWs4YTUBnh\",\n",
|
| 73 |
+
")\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"print(f\"✓ Configuration loaded\")\n",
|
| 76 |
+
"print(f\" Database: {config.database_name}\")\n",
|
| 77 |
+
"print(f\" Workgroup: {config.workgroup}\")\n",
|
| 78 |
+
"print(f\" Region: {config.region}\")"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"execution_count": 3,
|
| 84 |
+
"metadata": {},
|
| 85 |
+
"outputs": [
|
| 86 |
+
{
|
| 87 |
+
"name": "stderr",
|
| 88 |
+
"output_type": "stream",
|
| 89 |
+
"text": [
|
| 90 |
+
"2026-01-25 16:42:53,113 - datalake.athena - INFO - Initialized Athena client for database: dbparquetdatalake05\n",
|
| 91 |
+
"2026-01-25 16:42:53,113 - datalake.catalog - INFO - Initialized catalog for database: dbparquetdatalake05\n",
|
| 92 |
+
"2026-01-25 16:42:53,114 - datalake.query - INFO - Initialized DataLakeQuery\n",
|
| 93 |
+
"2026-01-25 16:42:53,114 - datalake.batch - INFO - Initialized BatchProcessor\n"
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"name": "stdout",
|
| 98 |
+
"output_type": "stream",
|
| 99 |
+
"text": [
|
| 100 |
+
"✓ Athena client and catalog initialized\n"
|
| 101 |
+
]
|
| 102 |
+
}
|
| 103 |
+
],
|
| 104 |
+
"source": [
|
| 105 |
+
"# Initialize Athena and catalog\n",
|
| 106 |
+
"athena = AthenaQuery(config)\n",
|
| 107 |
+
"catalog = DataLakeCatalog(athena, config)\n",
|
| 108 |
+
"query = DataLakeQuery(athena, catalog)\n",
|
| 109 |
+
"processor = BatchProcessor(query)\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"print(\"✓ Athena client and catalog initialized\")"
|
| 112 |
+
]
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"cell_type": "markdown",
|
| 116 |
+
"metadata": {},
|
| 117 |
+
"source": [
|
| 118 |
+
"## Test Connection\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"Let's verify the connection works by listing tables."
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": 4,
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [
|
| 128 |
+
{
|
| 129 |
+
"name": "stderr",
|
| 130 |
+
"output_type": "stream",
|
| 131 |
+
"text": [
|
| 132 |
+
"2026-01-25 16:43:00,494 - datalake.athena - INFO - Query started with execution ID: fb177297-ccc0-4c3d-b0ee-44078f0d3fa8\n",
|
| 133 |
+
"2026-01-25 16:43:01,953 - datalake.athena - INFO - Query fb177297-ccc0-4c3d-b0ee-44078f0d3fa8 completed successfully\n",
|
| 134 |
+
"2026-01-25 16:43:02,379 - datalake.athena - INFO - Retrieved 77 rows from query fb177297-ccc0-4c3d-b0ee-44078f0d3fa8\n"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"name": "stdout",
|
| 139 |
+
"output_type": "stream",
|
| 140 |
+
"text": [
|
| 141 |
+
"✓ Connection successful!\n",
|
| 142 |
+
" Found 77 tables in database\n",
|
| 143 |
+
"\n",
|
| 144 |
+
" First 10 tables:\n",
|
| 145 |
+
" tab_name\n",
|
| 146 |
+
"0 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m03\n",
|
| 147 |
+
"1 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m04\n",
|
| 148 |
+
"2 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m05\n",
|
| 149 |
+
"3 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m06\n",
|
| 150 |
+
"4 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m07\n",
|
| 151 |
+
"5 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0c\n",
|
| 152 |
+
"6 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0d\n",
|
| 153 |
+
"7 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0e\n",
|
| 154 |
+
"8 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0f\n",
|
| 155 |
+
"9 tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m10\n"
|
| 156 |
+
]
|
| 157 |
+
}
|
| 158 |
+
],
|
| 159 |
+
"source": [
|
| 160 |
+
"# Test connection with a simple query\n",
|
| 161 |
+
"try:\n",
|
| 162 |
+
" test_query = f\"SHOW TABLES IN {config.database_name}\"\n",
|
| 163 |
+
" df_tables = athena.query_to_dataframe(test_query, timeout=60)\n",
|
| 164 |
+
" print(f\"✓ Connection successful!\")\n",
|
| 165 |
+
" print(f\" Found {len(df_tables)} tables in database\")\n",
|
| 166 |
+
" if not df_tables.empty:\n",
|
| 167 |
+
" print(f\"\\n First 10 tables:\")\n",
|
| 168 |
+
" print(df_tables.head(10))\n",
|
| 169 |
+
"except Exception as e:\n",
|
| 170 |
+
" print(f\"✗ Connection failed: {e}\")\n",
|
| 171 |
+
" import traceback\n",
|
| 172 |
+
" traceback.print_exc()"
|
| 173 |
+
]
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"cell_type": "markdown",
|
| 177 |
+
"metadata": {},
|
| 178 |
+
"source": [
|
| 179 |
+
"## Explore Data Lake Structure\n",
|
| 180 |
+
"\n",
|
| 181 |
+
"Discover devices, messages, and available data."
|
| 182 |
+
]
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"cell_type": "code",
|
| 186 |
+
"execution_count": null,
|
| 187 |
+
"metadata": {},
|
| 188 |
+
"outputs": [
|
| 189 |
+
{
|
| 190 |
+
"name": "stderr",
|
| 191 |
+
"output_type": "stream",
|
| 192 |
+
"text": [
|
| 193 |
+
"2026-01-25 16:45:30,372 - datalake.athena - INFO - Query started with execution ID: f341e52d-c3ea-4baf-b805-eb1f327b1d1c\n",
|
| 194 |
+
"2026-01-25 16:45:31,482 - datalake.athena - INFO - Query f341e52d-c3ea-4baf-b805-eb1f327b1d1c completed successfully\n",
|
| 195 |
+
"2026-01-25 16:45:31,613 - datalake.athena - INFO - Retrieved 78 rows from query f341e52d-c3ea-4baf-b805-eb1f327b1d1c\n",
|
| 196 |
+
"2026-01-25 16:45:31,614 - datalake.catalog - INFO - Found 78 tables in database\n"
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"name": "stdout",
|
| 201 |
+
"output_type": "stream",
|
| 202 |
+
"text": [
|
| 203 |
+
"Total tables: 78\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"Sample tables:\n",
|
| 206 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m00\n",
|
| 207 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m03\n",
|
| 208 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m04\n",
|
| 209 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m05\n",
|
| 210 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m06\n",
|
| 211 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m07\n",
|
| 212 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0c\n",
|
| 213 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0d\n",
|
| 214 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0e\n",
|
| 215 |
+
" - tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m0f\n"
|
| 216 |
+
]
|
| 217 |
+
}
|
| 218 |
+
],
|
| 219 |
+
"source": [
|
| 220 |
+
"# List all tables\n",
|
| 221 |
+
"tables = catalog.list_tables()\n",
|
| 222 |
+
"print(f\"Total tables: {len(tables)}\")\n",
|
| 223 |
+
"print(f\"\\nSample tables:\")\n",
|
| 224 |
+
"for table in tables[:10]:\n",
|
| 225 |
+
" print(f\" - {table}\")"
|
| 226 |
+
]
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"cell_type": "code",
|
| 230 |
+
"execution_count": 14,
|
| 231 |
+
"metadata": {},
|
| 232 |
+
"outputs": [
|
| 233 |
+
{
|
| 234 |
+
"name": "stderr",
|
| 235 |
+
"output_type": "stream",
|
| 236 |
+
"text": [
|
| 237 |
+
"2026-01-25 21:21:04,434 - datalake.athena - INFO - Query started with execution ID: 4f1cfb71-2b52-4226-bd01-412f44cf23e3\n",
|
| 238 |
+
"2026-01-25 21:21:05,589 - datalake.athena - INFO - Query 4f1cfb71-2b52-4226-bd01-412f44cf23e3 completed successfully\n",
|
| 239 |
+
"2026-01-25 21:21:05,720 - datalake.athena - INFO - Retrieved 78 rows from query 4f1cfb71-2b52-4226-bd01-412f44cf23e3\n",
|
| 240 |
+
"2026-01-25 21:21:05,721 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 241 |
+
"2026-01-25 21:21:05,721 - datalake.catalog - INFO - Found 1 device(s)\n"
|
| 242 |
+
]
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"name": "stdout",
|
| 246 |
+
"output_type": "stream",
|
| 247 |
+
"text": [
|
| 248 |
+
"Found 1 device(s):\n",
|
| 249 |
+
" - tbl\n"
|
| 250 |
+
]
|
| 251 |
+
}
|
| 252 |
+
],
|
| 253 |
+
"source": [
|
| 254 |
+
"# Discover devices\n",
|
| 255 |
+
"devices = catalog.list_devices()\n",
|
| 256 |
+
"print(f\"Found {len(devices)} device(s):\")\n",
|
| 257 |
+
"for device in devices:\n",
|
| 258 |
+
" print(f\" - {device}\")"
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"cell_type": "code",
|
| 263 |
+
"execution_count": 15,
|
| 264 |
+
"metadata": {},
|
| 265 |
+
"outputs": [
|
| 266 |
+
{
|
| 267 |
+
"name": "stderr",
|
| 268 |
+
"output_type": "stream",
|
| 269 |
+
"text": [
|
| 270 |
+
"2026-01-25 21:21:12,744 - datalake.athena - INFO - Query started with execution ID: 3e5558cf-432c-4beb-8217-97bcbbf71694\n"
|
| 271 |
+
]
|
| 272 |
+
},
|
| 273 |
+
{
|
| 274 |
+
"name": "stdout",
|
| 275 |
+
"output_type": "stream",
|
| 276 |
+
"text": [
|
| 277 |
+
"\n",
|
| 278 |
+
"Exploring device: tbl\n",
|
| 279 |
+
"============================================================\n"
|
| 280 |
+
]
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"name": "stderr",
|
| 284 |
+
"output_type": "stream",
|
| 285 |
+
"text": [
|
| 286 |
+
"2026-01-25 21:21:13,885 - datalake.athena - INFO - Query 3e5558cf-432c-4beb-8217-97bcbbf71694 completed successfully\n",
|
| 287 |
+
"2026-01-25 21:21:14,016 - datalake.athena - INFO - Retrieved 78 rows from query 3e5558cf-432c-4beb-8217-97bcbbf71694\n",
|
| 288 |
+
"2026-01-25 21:21:14,017 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 289 |
+
"2026-01-25 21:21:14,017 - datalake.catalog - INFO - Found 78 messages for device tbl\n"
|
| 290 |
+
]
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"name": "stdout",
|
| 294 |
+
"output_type": "stream",
|
| 295 |
+
"text": [
|
| 296 |
+
"Found 78 message(s):\n",
|
| 297 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m00\n",
|
| 298 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m03\n",
|
| 299 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m04\n",
|
| 300 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m05\n",
|
| 301 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m06\n",
|
| 302 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m07\n",
|
| 303 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m0c\n",
|
| 304 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m0d\n",
|
| 305 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m0e\n",
|
| 306 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m0f\n",
|
| 307 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m10\n",
|
| 308 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m11\n",
|
| 309 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m1f\n",
|
| 310 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m2e\n",
|
| 311 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m2f\n",
|
| 312 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m33\n",
|
| 313 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m34\n",
|
| 314 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m35\n",
|
| 315 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m43\n",
|
| 316 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m44\n",
|
| 317 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m49\n",
|
| 318 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m55\n",
|
| 319 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m56\n",
|
| 320 |
+
" - 97a4aaf4_can1_obd2_s_m41_s01pid_m5c\n",
|
| 321 |
+
" - 97a4aaf4_can9_gnssaltitude\n",
|
| 322 |
+
" - 97a4aaf4_can9_gnssdistance\n",
|
| 323 |
+
" - 97a4aaf4_can9_gnsspos\n",
|
| 324 |
+
" - 97a4aaf4_can9_gnssspeed\n",
|
| 325 |
+
" - 97a4aaf4_can9_gnssstatus\n",
|
| 326 |
+
" - 97a4aaf4_can9_gnsstime\n",
|
| 327 |
+
" - 97a4aaf4_can9_heartbeat\n",
|
| 328 |
+
" - 97a4aaf4_can9_imudata\n",
|
| 329 |
+
" - 97a4aaf4_can9_timecalendar\n",
|
| 330 |
+
" - 97a4aaf4_can9_timeexternal\n",
|
| 331 |
+
" - 97a4aaf4_messages\n",
|
| 332 |
+
" - aggregations_devicemeta\n",
|
| 333 |
+
" - aggregations_tripsummary\n",
|
| 334 |
+
" - b8280fd1_can9_gnssaltitude\n",
|
| 335 |
+
" - b8280fd1_can9_gnssdistance\n",
|
| 336 |
+
" - b8280fd1_can9_gnsspos\n",
|
| 337 |
+
" - b8280fd1_can9_gnssspeed\n",
|
| 338 |
+
" - b8280fd1_can9_gnssstatus\n",
|
| 339 |
+
" - b8280fd1_can9_gnsstime\n",
|
| 340 |
+
" - b8280fd1_can9_heartbeat\n",
|
| 341 |
+
" - b8280fd1_can9_imudata\n",
|
| 342 |
+
" - b8280fd1_can9_timecalendar\n",
|
| 343 |
+
" - b8280fd1_can9_timeexternal\n",
|
| 344 |
+
" - b8280fd1_messages\n",
|
| 345 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m03\n",
|
| 346 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m04\n",
|
| 347 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m05\n",
|
| 348 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m06\n",
|
| 349 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m07\n",
|
| 350 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m0c\n",
|
| 351 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m0d\n",
|
| 352 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m0e\n",
|
| 353 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m0f\n",
|
| 354 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m10\n",
|
| 355 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m1f\n",
|
| 356 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m2e\n",
|
| 357 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m33\n",
|
| 358 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m34\n",
|
| 359 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m35\n",
|
| 360 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m43\n",
|
| 361 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m44\n",
|
| 362 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m49\n",
|
| 363 |
+
" - f1da612a_can1_obd2_s_m41_s01pid_m5c\n",
|
| 364 |
+
" - f1da612a_can9_gnssaltitude\n",
|
| 365 |
+
" - f1da612a_can9_gnssdistance\n",
|
| 366 |
+
" - f1da612a_can9_gnsspos\n",
|
| 367 |
+
" - f1da612a_can9_gnssspeed\n",
|
| 368 |
+
" - f1da612a_can9_gnssstatus\n",
|
| 369 |
+
" - f1da612a_can9_gnsstime\n",
|
| 370 |
+
" - f1da612a_can9_heartbeat\n",
|
| 371 |
+
" - f1da612a_can9_imudata\n",
|
| 372 |
+
" - f1da612a_can9_timecalendar\n",
|
| 373 |
+
" - f1da612a_can9_timeexternal\n",
|
| 374 |
+
" - f1da612a_messages\n"
|
| 375 |
+
]
|
| 376 |
+
}
|
| 377 |
+
],
|
| 378 |
+
"source": [
|
| 379 |
+
"# Explore messages for the first device\n",
|
| 380 |
+
"if devices:\n",
|
| 381 |
+
" device_id = devices[0]\n",
|
| 382 |
+
" print(f\"\\nExploring device: {device_id}\")\n",
|
| 383 |
+
" print(\"=\" * 60)\n",
|
| 384 |
+
" \n",
|
| 385 |
+
" messages = catalog.list_messages(device_id)\n",
|
| 386 |
+
" print(f\"Found {len(messages)} message(s):\")\n",
|
| 387 |
+
" for message in messages:\n",
|
| 388 |
+
" print(f\" - {message}\")"
|
| 389 |
+
]
|
| 390 |
+
},
|
| 391 |
+
{
|
| 392 |
+
"cell_type": "code",
|
| 393 |
+
"execution_count": null,
|
| 394 |
+
"metadata": {},
|
| 395 |
+
"outputs": [
|
| 396 |
+
{
|
| 397 |
+
"name": "stdout",
|
| 398 |
+
"output_type": "stream",
|
| 399 |
+
"text": [
|
| 400 |
+
"\n",
|
| 401 |
+
"Schema for tbl/97a4aaf4_can1_obd2_s_m41_s01pid_m00:\n",
|
| 402 |
+
"============================================================\n"
|
| 403 |
+
]
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"name": "stderr",
|
| 407 |
+
"output_type": "stream",
|
| 408 |
+
"text": [
|
| 409 |
+
"2026-01-25 17:31:48,556 - datalake.athena - INFO - Query started with execution ID: 6916c876-c526-4474-bbe0-ad626b4786e7\n",
|
| 410 |
+
"2026-01-25 17:31:49,686 - datalake.athena - INFO - Query 6916c876-c526-4474-bbe0-ad626b4786e7 completed successfully\n",
|
| 411 |
+
"2026-01-25 17:31:49,816 - datalake.athena - INFO - Retrieved 78 rows from query 6916c876-c526-4474-bbe0-ad626b4786e7\n",
|
| 412 |
+
"2026-01-25 17:31:49,817 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 413 |
+
"2026-01-25 17:31:49,993 - datalake.athena - INFO - Query started with execution ID: b71d0e14-e0e8-4e3f-8c8c-ceffcd9984f2\n",
|
| 414 |
+
"2026-01-25 17:31:51,132 - datalake.athena - INFO - Query b71d0e14-e0e8-4e3f-8c8c-ceffcd9984f2 completed successfully\n",
|
| 415 |
+
"2026-01-25 17:31:51,371 - datalake.athena - INFO - Retrieved 3 rows from query b71d0e14-e0e8-4e3f-8c8c-ceffcd9984f2\n",
|
| 416 |
+
"2026-01-25 17:31:51,372 - datalake.catalog - INFO - Schema for tbl/97a4aaf4_can1_obd2_s_m41_s01pid_m00: 3 columns\n"
|
| 417 |
+
]
|
| 418 |
+
},
|
| 419 |
+
{
|
| 420 |
+
"name": "stdout",
|
| 421 |
+
"output_type": "stream",
|
| 422 |
+
"text": [
|
| 423 |
+
" Column Type\n",
|
| 424 |
+
" t timestamp(3)\n",
|
| 425 |
+
"s01pid00_pidssupported_01_20 double\n",
|
| 426 |
+
" date_created varchar\n",
|
| 427 |
+
"\n",
|
| 428 |
+
"Total columns: 3\n"
|
| 429 |
+
]
|
| 430 |
+
}
|
| 431 |
+
],
|
| 432 |
+
"source": [
|
| 433 |
+
"# Get schema for first device/message combination\n",
|
| 434 |
+
"if devices and messages:\n",
|
| 435 |
+
" device_id = devices[0]\n",
|
| 436 |
+
" message = messages[0]\n",
|
| 437 |
+
" \n",
|
| 438 |
+
" print(f\"\\nSchema for {device_id}/{message}:\")\n",
|
| 439 |
+
" print(\"=\" * 60)\n",
|
| 440 |
+
" \n",
|
| 441 |
+
" schema = catalog.get_schema(device_id, message)\n",
|
| 442 |
+
" if schema:\n",
|
| 443 |
+
" schema_df = pd.DataFrame([\n",
|
| 444 |
+
" {\"Column\": col, \"Type\": dtype}\n",
|
| 445 |
+
" for col, dtype in schema.items()\n",
|
| 446 |
+
" ])\n",
|
| 447 |
+
" print(schema_df.to_string(index=False))\n",
|
| 448 |
+
" print(f\"\\nTotal columns: {len(schema)}\")"
|
| 449 |
+
]
|
| 450 |
+
},
|
| 451 |
+
{
|
| 452 |
+
"cell_type": "code",
|
| 453 |
+
"execution_count": 9,
|
| 454 |
+
"metadata": {},
|
| 455 |
+
"outputs": [
|
| 456 |
+
{
|
| 457 |
+
"name": "stderr",
|
| 458 |
+
"output_type": "stream",
|
| 459 |
+
"text": [
|
| 460 |
+
"2026-01-25 17:31:58,489 - datalake.athena - INFO - Query started with execution ID: 844cf5ba-7756-46cf-a0e4-d6bfe8c98f74\n"
|
| 461 |
+
]
|
| 462 |
+
},
|
| 463 |
+
{
|
| 464 |
+
"name": "stdout",
|
| 465 |
+
"output_type": "stream",
|
| 466 |
+
"text": [
|
| 467 |
+
"\n",
|
| 468 |
+
"Partitions (dates) for tbl/97a4aaf4_can1_obd2_s_m41_s01pid_m00:\n",
|
| 469 |
+
"============================================================\n"
|
| 470 |
+
]
|
| 471 |
+
},
|
| 472 |
+
{
|
| 473 |
+
"name": "stderr",
|
| 474 |
+
"output_type": "stream",
|
| 475 |
+
"text": [
|
| 476 |
+
"2026-01-25 17:31:59,938 - datalake.athena - INFO - Query 844cf5ba-7756-46cf-a0e4-d6bfe8c98f74 completed successfully\n",
|
| 477 |
+
"2026-01-25 17:32:00,137 - datalake.athena - INFO - Retrieved 78 rows from query 844cf5ba-7756-46cf-a0e4-d6bfe8c98f74\n",
|
| 478 |
+
"2026-01-25 17:32:00,137 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 479 |
+
"2026-01-25 17:32:00,265 - datalake.athena - INFO - Query started with execution ID: c4a13ea7-d58e-4658-90aa-9413d04b9417\n",
|
| 480 |
+
"2026-01-25 17:32:02,108 - datalake.athena - INFO - Query c4a13ea7-d58e-4658-90aa-9413d04b9417 completed successfully\n",
|
| 481 |
+
"2026-01-25 17:32:02,219 - datalake.athena - WARNING - No results returned for execution c4a13ea7-d58e-4658-90aa-9413d04b9417\n",
|
| 482 |
+
"2026-01-25 17:32:02,222 - datalake.catalog - WARNING - No partitions found for tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m00\n"
|
| 483 |
+
]
|
| 484 |
+
},
|
| 485 |
+
{
|
| 486 |
+
"name": "stdout",
|
| 487 |
+
"output_type": "stream",
|
| 488 |
+
"text": [
|
| 489 |
+
"No partitions found (table may not be partitioned)\n"
|
| 490 |
+
]
|
| 491 |
+
}
|
| 492 |
+
],
|
| 493 |
+
"source": [
|
| 494 |
+
"# Check available partitions (dates)\n",
|
| 495 |
+
"if devices and messages:\n",
|
| 496 |
+
" device_id = devices[0]\n",
|
| 497 |
+
" message = messages[0]\n",
|
| 498 |
+
" \n",
|
| 499 |
+
" print(f\"\\nPartitions (dates) for {device_id}/{message}:\")\n",
|
| 500 |
+
" print(\"=\" * 60)\n",
|
| 501 |
+
" \n",
|
| 502 |
+
" try:\n",
|
| 503 |
+
" partitions = catalog.list_partitions(device_id, message)\n",
|
| 504 |
+
" if partitions:\n",
|
| 505 |
+
" print(f\"Found {len(partitions)} partition(s):\")\n",
|
| 506 |
+
" print(f\" Date range: {partitions[0]} to {partitions[-1]}\")\n",
|
| 507 |
+
" print(f\"\\n All dates:\")\n",
|
| 508 |
+
" for date in partitions[:20]: # Show first 20\n",
|
| 509 |
+
" print(f\" - {date}\")\n",
|
| 510 |
+
" if len(partitions) > 20:\n",
|
| 511 |
+
" print(f\" ... and {len(partitions) - 20} more\")\n",
|
| 512 |
+
" else:\n",
|
| 513 |
+
" print(\"No partitions found (table may not be partitioned)\")\n",
|
| 514 |
+
" except Exception as e:\n",
|
| 515 |
+
" print(f\"Could not list partitions: {e}\")"
|
| 516 |
+
]
|
| 517 |
+
},
|
| 518 |
+
{
|
| 519 |
+
"cell_type": "markdown",
|
| 520 |
+
"metadata": {},
|
| 521 |
+
"source": [
|
| 522 |
+
"## Query Data\n",
|
| 523 |
+
"\n",
|
| 524 |
+
"Now let's query some actual data."
|
| 525 |
+
]
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"cell_type": "code",
|
| 529 |
+
"execution_count": null,
|
| 530 |
+
"metadata": {},
|
| 531 |
+
"outputs": [
|
| 532 |
+
{
|
| 533 |
+
"name": "stdout",
|
| 534 |
+
"output_type": "stream",
|
| 535 |
+
"text": [
|
| 536 |
+
"Reading sample data from tbl/97a4aaf4_can1_obd2_s_m41_s01pid_m00...\n",
|
| 537 |
+
"============================================================\n"
|
| 538 |
+
]
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
"name": "stderr",
|
| 542 |
+
"output_type": "stream",
|
| 543 |
+
"text": [
|
| 544 |
+
"2026-01-25 17:32:42,022 - datalake.athena - INFO - Query started with execution ID: 2a7e2ed0-8c44-46e7-a5b4-1a57fab4938b\n",
|
| 545 |
+
"2026-01-25 17:32:43,601 - datalake.athena - INFO - Query 2a7e2ed0-8c44-46e7-a5b4-1a57fab4938b completed successfully\n",
|
| 546 |
+
"2026-01-25 17:32:43,731 - datalake.athena - INFO - Retrieved 78 rows from query 2a7e2ed0-8c44-46e7-a5b4-1a57fab4938b\n",
|
| 547 |
+
"2026-01-25 17:32:43,732 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 548 |
+
"2026-01-25 17:32:43,732 - datalake.query - INFO - Executing query for tbl/97a4aaf4_can1_obd2_s_m41_s01pid_m00\n",
|
| 549 |
+
"2026-01-25 17:32:43,859 - datalake.athena - INFO - Query started with execution ID: 02fe08ed-4f1c-4363-b167-c1a4a0196094\n",
|
| 550 |
+
"2026-01-25 17:32:48,300 - datalake.athena - INFO - Query 02fe08ed-4f1c-4363-b167-c1a4a0196094 completed successfully\n",
|
| 551 |
+
"2026-01-25 17:32:48,430 - datalake.athena - INFO - Retrieved 100 rows from query 02fe08ed-4f1c-4363-b167-c1a4a0196094\n"
|
| 552 |
+
]
|
| 553 |
+
},
|
| 554 |
+
{
|
| 555 |
+
"name": "stdout",
|
| 556 |
+
"output_type": "stream",
|
| 557 |
+
"text": [
|
| 558 |
+
"✓ Loaded 100 records\n",
|
| 559 |
+
"\n",
|
| 560 |
+
"Data shape: (100, 3)\n",
|
| 561 |
+
"\n",
|
| 562 |
+
"Columns: ['t', 's01pid00_pidssupported_01_20', 'date_created']\n",
|
| 563 |
+
"\n",
|
| 564 |
+
"First few rows:\n"
|
| 565 |
+
]
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"data": {
|
| 569 |
+
"text/html": [
|
| 570 |
+
"<div>\n",
|
| 571 |
+
"<style scoped>\n",
|
| 572 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 573 |
+
" vertical-align: middle;\n",
|
| 574 |
+
" }\n",
|
| 575 |
+
"\n",
|
| 576 |
+
" .dataframe tbody tr th {\n",
|
| 577 |
+
" vertical-align: top;\n",
|
| 578 |
+
" }\n",
|
| 579 |
+
"\n",
|
| 580 |
+
" .dataframe thead th {\n",
|
| 581 |
+
" text-align: right;\n",
|
| 582 |
+
" }\n",
|
| 583 |
+
"</style>\n",
|
| 584 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 585 |
+
" <thead>\n",
|
| 586 |
+
" <tr style=\"text-align: right;\">\n",
|
| 587 |
+
" <th></th>\n",
|
| 588 |
+
" <th>t</th>\n",
|
| 589 |
+
" <th>s01pid00_pidssupported_01_20</th>\n",
|
| 590 |
+
" <th>date_created</th>\n",
|
| 591 |
+
" </tr>\n",
|
| 592 |
+
" </thead>\n",
|
| 593 |
+
" <tbody>\n",
|
| 594 |
+
" <tr>\n",
|
| 595 |
+
" <th>0</th>\n",
|
| 596 |
+
" <td>2025-10-29 05:45:53.063</td>\n",
|
| 597 |
+
" <td>3.189744e+09</td>\n",
|
| 598 |
+
" <td>2025/10/29</td>\n",
|
| 599 |
+
" </tr>\n",
|
| 600 |
+
" <tr>\n",
|
| 601 |
+
" <th>1</th>\n",
|
| 602 |
+
" <td>2025-10-29 05:46:18.062</td>\n",
|
| 603 |
+
" <td>3.189744e+09</td>\n",
|
| 604 |
+
" <td>2025/10/29</td>\n",
|
| 605 |
+
" </tr>\n",
|
| 606 |
+
" <tr>\n",
|
| 607 |
+
" <th>2</th>\n",
|
| 608 |
+
" <td>2025-10-29 05:46:48.063</td>\n",
|
| 609 |
+
" <td>3.189744e+09</td>\n",
|
| 610 |
+
" <td>2025/10/29</td>\n",
|
| 611 |
+
" </tr>\n",
|
| 612 |
+
" <tr>\n",
|
| 613 |
+
" <th>3</th>\n",
|
| 614 |
+
" <td>2025-10-29 05:47:43.062</td>\n",
|
| 615 |
+
" <td>3.189744e+09</td>\n",
|
| 616 |
+
" <td>2025/10/29</td>\n",
|
| 617 |
+
" </tr>\n",
|
| 618 |
+
" <tr>\n",
|
| 619 |
+
" <th>4</th>\n",
|
| 620 |
+
" <td>2025-10-29 05:48:08.062</td>\n",
|
| 621 |
+
" <td>3.189744e+09</td>\n",
|
| 622 |
+
" <td>2025/10/29</td>\n",
|
| 623 |
+
" </tr>\n",
|
| 624 |
+
" <tr>\n",
|
| 625 |
+
" <th>5</th>\n",
|
| 626 |
+
" <td>2025-10-29 05:49:33.063</td>\n",
|
| 627 |
+
" <td>3.189744e+09</td>\n",
|
| 628 |
+
" <td>2025/10/29</td>\n",
|
| 629 |
+
" </tr>\n",
|
| 630 |
+
" <tr>\n",
|
| 631 |
+
" <th>6</th>\n",
|
| 632 |
+
" <td>2025-10-29 05:49:48.063</td>\n",
|
| 633 |
+
" <td>3.189744e+09</td>\n",
|
| 634 |
+
" <td>2025/10/29</td>\n",
|
| 635 |
+
" </tr>\n",
|
| 636 |
+
" <tr>\n",
|
| 637 |
+
" <th>7</th>\n",
|
| 638 |
+
" <td>2025-10-29 05:50:03.063</td>\n",
|
| 639 |
+
" <td>3.189744e+09</td>\n",
|
| 640 |
+
" <td>2025/10/29</td>\n",
|
| 641 |
+
" </tr>\n",
|
| 642 |
+
" <tr>\n",
|
| 643 |
+
" <th>8</th>\n",
|
| 644 |
+
" <td>2025-10-29 05:50:33.064</td>\n",
|
| 645 |
+
" <td>3.189744e+09</td>\n",
|
| 646 |
+
" <td>2025/10/29</td>\n",
|
| 647 |
+
" </tr>\n",
|
| 648 |
+
" <tr>\n",
|
| 649 |
+
" <th>9</th>\n",
|
| 650 |
+
" <td>2025-10-29 05:50:58.064</td>\n",
|
| 651 |
+
" <td>3.189744e+09</td>\n",
|
| 652 |
+
" <td>2025/10/29</td>\n",
|
| 653 |
+
" </tr>\n",
|
| 654 |
+
" </tbody>\n",
|
| 655 |
+
"</table>\n",
|
| 656 |
+
"</div>"
|
| 657 |
+
],
|
| 658 |
+
"text/plain": [
|
| 659 |
+
" t s01pid00_pidssupported_01_20 date_created\n",
|
| 660 |
+
"0 2025-10-29 05:45:53.063 3.189744e+09 2025/10/29\n",
|
| 661 |
+
"1 2025-10-29 05:46:18.062 3.189744e+09 2025/10/29\n",
|
| 662 |
+
"2 2025-10-29 05:46:48.063 3.189744e+09 2025/10/29\n",
|
| 663 |
+
"3 2025-10-29 05:47:43.062 3.189744e+09 2025/10/29\n",
|
| 664 |
+
"4 2025-10-29 05:48:08.062 3.189744e+09 2025/10/29\n",
|
| 665 |
+
"5 2025-10-29 05:49:33.063 3.189744e+09 2025/10/29\n",
|
| 666 |
+
"6 2025-10-29 05:49:48.063 3.189744e+09 2025/10/29\n",
|
| 667 |
+
"7 2025-10-29 05:50:03.063 3.189744e+09 2025/10/29\n",
|
| 668 |
+
"8 2025-10-29 05:50:33.064 3.189744e+09 2025/10/29\n",
|
| 669 |
+
"9 2025-10-29 05:50:58.064 3.189744e+09 2025/10/29"
|
| 670 |
+
]
|
| 671 |
+
},
|
| 672 |
+
"metadata": {},
|
| 673 |
+
"output_type": "display_data"
|
| 674 |
+
},
|
| 675 |
+
{
|
| 676 |
+
"name": "stdout",
|
| 677 |
+
"output_type": "stream",
|
| 678 |
+
"text": [
|
| 679 |
+
"\n",
|
| 680 |
+
"Data types:\n",
|
| 681 |
+
"t object\n",
|
| 682 |
+
"s01pid00_pidssupported_01_20 float64\n",
|
| 683 |
+
"date_created object\n",
|
| 684 |
+
"dtype: object\n",
|
| 685 |
+
"\n",
|
| 686 |
+
"Basic statistics:\n"
|
| 687 |
+
]
|
| 688 |
+
},
|
| 689 |
+
{
|
| 690 |
+
"data": {
|
| 691 |
+
"text/html": [
|
| 692 |
+
"<div>\n",
|
| 693 |
+
"<style scoped>\n",
|
| 694 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 695 |
+
" vertical-align: middle;\n",
|
| 696 |
+
" }\n",
|
| 697 |
+
"\n",
|
| 698 |
+
" .dataframe tbody tr th {\n",
|
| 699 |
+
" vertical-align: top;\n",
|
| 700 |
+
" }\n",
|
| 701 |
+
"\n",
|
| 702 |
+
" .dataframe thead th {\n",
|
| 703 |
+
" text-align: right;\n",
|
| 704 |
+
" }\n",
|
| 705 |
+
"</style>\n",
|
| 706 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 707 |
+
" <thead>\n",
|
| 708 |
+
" <tr style=\"text-align: right;\">\n",
|
| 709 |
+
" <th></th>\n",
|
| 710 |
+
" <th>s01pid00_pidssupported_01_20</th>\n",
|
| 711 |
+
" </tr>\n",
|
| 712 |
+
" </thead>\n",
|
| 713 |
+
" <tbody>\n",
|
| 714 |
+
" <tr>\n",
|
| 715 |
+
" <th>count</th>\n",
|
| 716 |
+
" <td>1.000000e+02</td>\n",
|
| 717 |
+
" </tr>\n",
|
| 718 |
+
" <tr>\n",
|
| 719 |
+
" <th>mean</th>\n",
|
| 720 |
+
" <td>3.189744e+09</td>\n",
|
| 721 |
+
" </tr>\n",
|
| 722 |
+
" <tr>\n",
|
| 723 |
+
" <th>std</th>\n",
|
| 724 |
+
" <td>0.000000e+00</td>\n",
|
| 725 |
+
" </tr>\n",
|
| 726 |
+
" <tr>\n",
|
| 727 |
+
" <th>min</th>\n",
|
| 728 |
+
" <td>3.189744e+09</td>\n",
|
| 729 |
+
" </tr>\n",
|
| 730 |
+
" <tr>\n",
|
| 731 |
+
" <th>25%</th>\n",
|
| 732 |
+
" <td>3.189744e+09</td>\n",
|
| 733 |
+
" </tr>\n",
|
| 734 |
+
" <tr>\n",
|
| 735 |
+
" <th>50%</th>\n",
|
| 736 |
+
" <td>3.189744e+09</td>\n",
|
| 737 |
+
" </tr>\n",
|
| 738 |
+
" <tr>\n",
|
| 739 |
+
" <th>75%</th>\n",
|
| 740 |
+
" <td>3.189744e+09</td>\n",
|
| 741 |
+
" </tr>\n",
|
| 742 |
+
" <tr>\n",
|
| 743 |
+
" <th>max</th>\n",
|
| 744 |
+
" <td>3.189744e+09</td>\n",
|
| 745 |
+
" </tr>\n",
|
| 746 |
+
" </tbody>\n",
|
| 747 |
+
"</table>\n",
|
| 748 |
+
"</div>"
|
| 749 |
+
],
|
| 750 |
+
"text/plain": [
|
| 751 |
+
" s01pid00_pidssupported_01_20\n",
|
| 752 |
+
"count 1.000000e+02\n",
|
| 753 |
+
"mean 3.189744e+09\n",
|
| 754 |
+
"std 0.000000e+00\n",
|
| 755 |
+
"min 3.189744e+09\n",
|
| 756 |
+
"25% 3.189744e+09\n",
|
| 757 |
+
"50% 3.189744e+09\n",
|
| 758 |
+
"75% 3.189744e+09\n",
|
| 759 |
+
"max 3.189744e+09"
|
| 760 |
+
]
|
| 761 |
+
},
|
| 762 |
+
"metadata": {},
|
| 763 |
+
"output_type": "display_data"
|
| 764 |
+
}
|
| 765 |
+
],
|
| 766 |
+
"source": [
|
| 767 |
+
"# Read a sample of data\n",
|
| 768 |
+
"if devices and messages:\n",
|
| 769 |
+
" device_id = devices[0]\n",
|
| 770 |
+
" message = messages[0]\n",
|
| 771 |
+
" \n",
|
| 772 |
+
" print(f\"Reading sample data from {device_id}/{message}...\")\n",
|
| 773 |
+
" print(\"=\" * 60)\n",
|
| 774 |
+
" \n",
|
| 775 |
+
" try:\n",
|
| 776 |
+
" df = query.read_device_message(\n",
|
| 777 |
+
" device_id=device_id,\n",
|
| 778 |
+
" message=message,\n",
|
| 779 |
+
" limit=100 # Limit for quick preview\n",
|
| 780 |
+
" )\n",
|
| 781 |
+
" \n",
|
| 782 |
+
" print(f\"✓ Loaded {len(df)} records\")\n",
|
| 783 |
+
" print(f\"\\nData shape: {df.shape}\")\n",
|
| 784 |
+
" print(f\"\\nColumns: {list(df.columns)}\")\n",
|
| 785 |
+
" print(f\"\\nFirst few rows:\")\n",
|
| 786 |
+
" display(df.head(10))\n",
|
| 787 |
+
" \n",
|
| 788 |
+
" print(f\"\\nData types:\")\n",
|
| 789 |
+
" print(df.dtypes)\n",
|
| 790 |
+
" \n",
|
| 791 |
+
" print(f\"\\nBasic statistics:\")\n",
|
| 792 |
+
" display(df.describe())\n",
|
| 793 |
+
" \n",
|
| 794 |
+
" except Exception as e:\n",
|
| 795 |
+
" print(f\"✗ Error reading data: {e}\")\n",
|
| 796 |
+
" import traceback\n",
|
| 797 |
+
" traceback.print_exc()"
|
| 798 |
+
]
|
| 799 |
+
},
|
| 800 |
+
{
|
| 801 |
+
"cell_type": "markdown",
|
| 802 |
+
"metadata": {},
|
| 803 |
+
"source": [
|
| 804 |
+
"## Query ALL Data (No Limits)\n",
|
| 805 |
+
"\n",
|
| 806 |
+
"To see all your data, remove the `limit` parameter or set it to `None`. \n",
|
| 807 |
+
"**Note:** This may take longer and use more memory for large datasets."
|
| 808 |
+
]
|
| 809 |
+
},
|
| 810 |
+
{
|
| 811 |
+
"cell_type": "code",
|
| 812 |
+
"execution_count": null,
|
| 813 |
+
"metadata": {},
|
| 814 |
+
"outputs": [],
|
| 815 |
+
"source": [
|
| 816 |
+
"# Query ALL data (no limit) - use with caution for large datasets\n",
|
| 817 |
+
"if 'first_device' in locals() and 'first_message' in locals() and first_message:\n",
|
| 818 |
+
" device_id = first_device\n",
|
| 819 |
+
" message = first_message\n",
|
| 820 |
+
" \n",
|
| 821 |
+
" print(f\"Querying ALL data from {device_id}/{message}...\")\n",
|
| 822 |
+
" print(\"=\" * 60)\n",
|
| 823 |
+
" print(\"⚠️ This may take a while for large datasets!\")\n",
|
| 824 |
+
" print()\n",
|
| 825 |
+
" \n",
|
| 826 |
+
" # Uncomment the lines below to query all data (remove limit)\n",
|
| 827 |
+
" # try:\n",
|
| 828 |
+
" # df_all = query.read_device_message(\n",
|
| 829 |
+
" # device_id=device_id,\n",
|
| 830 |
+
" # message=message,\n",
|
| 831 |
+
" # limit=None # No limit - gets ALL data\n",
|
| 832 |
+
" # )\n",
|
| 833 |
+
" # \n",
|
| 834 |
+
" # print(f\"✓ Loaded ALL {len(df_all)} records\")\n",
|
| 835 |
+
" # print(f\"\\nData shape: {df_all.shape}\")\n",
|
| 836 |
+
" # display(df_all.head(20))\n",
|
| 837 |
+
" # \n",
|
| 838 |
+
" # except Exception as e:\n",
|
| 839 |
+
" # print(f\"✗ Error reading all data: {e}\")\n",
|
| 840 |
+
" # import traceback\n",
|
| 841 |
+
" # traceback.print_exc()\n",
|
| 842 |
+
" \n",
|
| 843 |
+
" print(\"(Uncomment the code above to query all data)\")"
|
| 844 |
+
]
|
| 845 |
+
},
|
| 846 |
+
{
|
| 847 |
+
"cell_type": "code",
|
| 848 |
+
"execution_count": 11,
|
| 849 |
+
"metadata": {},
|
| 850 |
+
"outputs": [
|
| 851 |
+
{
|
| 852 |
+
"name": "stderr",
|
| 853 |
+
"output_type": "stream",
|
| 854 |
+
"text": [
|
| 855 |
+
"2026-01-25 17:32:55,234 - datalake.athena - INFO - Query started with execution ID: ee01954e-ad1e-4044-aba1-d5b9695cbaef\n",
|
| 856 |
+
"2026-01-25 17:32:56,362 - datalake.athena - INFO - Query ee01954e-ad1e-4044-aba1-d5b9695cbaef completed successfully\n",
|
| 857 |
+
"2026-01-25 17:32:56,510 - datalake.athena - INFO - Retrieved 78 rows from query ee01954e-ad1e-4044-aba1-d5b9695cbaef\n",
|
| 858 |
+
"2026-01-25 17:32:56,511 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 859 |
+
"2026-01-25 17:32:56,630 - datalake.athena - INFO - Query started with execution ID: 6b5ca88b-bfaa-4cb2-9aa2-95e7e8d0facc\n",
|
| 860 |
+
"2026-01-25 17:32:57,977 - datalake.athena - INFO - Query 6b5ca88b-bfaa-4cb2-9aa2-95e7e8d0facc completed successfully\n",
|
| 861 |
+
"2026-01-25 17:32:58,133 - datalake.athena - WARNING - No results returned for execution 6b5ca88b-bfaa-4cb2-9aa2-95e7e8d0facc\n",
|
| 862 |
+
"2026-01-25 17:32:58,134 - datalake.catalog - WARNING - No partitions found for tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m00\n"
|
| 863 |
+
]
|
| 864 |
+
},
|
| 865 |
+
{
|
| 866 |
+
"name": "stdout",
|
| 867 |
+
"output_type": "stream",
|
| 868 |
+
"text": [
|
| 869 |
+
"No partitions available for date filtering\n"
|
| 870 |
+
]
|
| 871 |
+
}
|
| 872 |
+
],
|
| 873 |
+
"source": [
|
| 874 |
+
"# Query with date range (if partitions available)\n",
|
| 875 |
+
"if devices and messages:\n",
|
| 876 |
+
" device_id = devices[0]\n",
|
| 877 |
+
" message = messages[0]\n",
|
| 878 |
+
" \n",
|
| 879 |
+
" try:\n",
|
| 880 |
+
" partitions = catalog.list_partitions(device_id, message)\n",
|
| 881 |
+
" if partitions:\n",
|
| 882 |
+
" start_date = partitions[0]\n",
|
| 883 |
+
" end_date = partitions[-1] if len(partitions) > 1 else partitions[0]\n",
|
| 884 |
+
" \n",
|
| 885 |
+
" print(f\"Querying data from {start_date} to {end_date}...\")\n",
|
| 886 |
+
" \n",
|
| 887 |
+
" df_date = query.read_date_range(\n",
|
| 888 |
+
" device_id=device_id,\n",
|
| 889 |
+
" message=message,\n",
|
| 890 |
+
" start_date=start_date,\n",
|
| 891 |
+
" end_date=end_date,\n",
|
| 892 |
+
" limit=1000\n",
|
| 893 |
+
" )\n",
|
| 894 |
+
" \n",
|
| 895 |
+
" print(f\"✓ Loaded {len(df_date)} records\")\n",
|
| 896 |
+
" display(df_date.head())\n",
|
| 897 |
+
" else:\n",
|
| 898 |
+
" print(\"No partitions available for date filtering\")\n",
|
| 899 |
+
" except Exception as e:\n",
|
| 900 |
+
" print(f\"Error querying date range: {e}\")"
|
| 901 |
+
]
|
| 902 |
+
},
|
| 903 |
+
{
|
| 904 |
+
"cell_type": "markdown",
|
| 905 |
+
"metadata": {},
|
| 906 |
+
"source": [
|
| 907 |
+
"## Time Series Analysis\n",
|
| 908 |
+
"\n",
|
| 909 |
+
"Analyze signals over time."
|
| 910 |
+
]
|
| 911 |
+
},
|
| 912 |
+
{
|
| 913 |
+
"cell_type": "code",
|
| 914 |
+
"execution_count": 12,
|
| 915 |
+
"metadata": {},
|
| 916 |
+
"outputs": [
|
| 917 |
+
{
|
| 918 |
+
"name": "stdout",
|
| 919 |
+
"output_type": "stream",
|
| 920 |
+
"text": [
|
| 921 |
+
"Available signal columns (2):\n",
|
| 922 |
+
" - s01pid00_pidssupported_01_20\n",
|
| 923 |
+
" - date_created\n"
|
| 924 |
+
]
|
| 925 |
+
}
|
| 926 |
+
],
|
| 927 |
+
"source": [
|
| 928 |
+
"# Get available signal columns\n",
|
| 929 |
+
"if devices and messages:\n",
|
| 930 |
+
" device_id = devices[0]\n",
|
| 931 |
+
" message = messages[0]\n",
|
| 932 |
+
" \n",
|
| 933 |
+
" schema = catalog.get_schema(device_id, message)\n",
|
| 934 |
+
" if schema:\n",
|
| 935 |
+
" # Find signal columns (exclude timestamp and date)\n",
|
| 936 |
+
" signal_cols = [\n",
|
| 937 |
+
" col for col in schema.keys() \n",
|
| 938 |
+
" if col not in ['t', 'date', 'timestamp']\n",
|
| 939 |
+
" ]\n",
|
| 940 |
+
" \n",
|
| 941 |
+
" print(f\"Available signal columns ({len(signal_cols)}):\")\n",
|
| 942 |
+
" for col in signal_cols[:10]:\n",
|
| 943 |
+
" print(f\" - {col}\")\n",
|
| 944 |
+
" if len(signal_cols) > 10:\n",
|
| 945 |
+
" print(f\" ... and {len(signal_cols) - 10} more\")"
|
| 946 |
+
]
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"cell_type": "code",
|
| 950 |
+
"execution_count": null,
|
| 951 |
+
"metadata": {},
|
| 952 |
+
"outputs": [],
|
| 953 |
+
"source": [
|
| 954 |
+
"# Query time series for a specific signal\n",
|
| 955 |
+
"if devices and messages and 'signal_cols' in locals() and signal_cols:\n",
|
| 956 |
+
" device_id = devices[0]\n",
|
| 957 |
+
" message = messages[0]\n",
|
| 958 |
+
" signal_name = signal_cols[0] # Use first signal\n",
|
| 959 |
+
" \n",
|
| 960 |
+
" print(f\"Querying time series for {signal_name}...\")\n",
|
| 961 |
+
" print(\"=\" * 60)\n",
|
| 962 |
+
" \n",
|
| 963 |
+
" try:\n",
|
| 964 |
+
" df_ts = query.time_series_query(\n",
|
| 965 |
+
" device_id=device_id,\n",
|
| 966 |
+
" message=message,\n",
|
| 967 |
+
" signal_name=signal_name,\n",
|
| 968 |
+
" limit=10000 # Adjust based on your needs\n",
|
| 969 |
+
" )\n",
|
| 970 |
+
" \n",
|
| 971 |
+
" if not df_ts.empty:\n",
|
| 972 |
+
" # Convert timestamp to datetime\n",
|
| 973 |
+
" if 't' in df_ts.columns:\n",
|
| 974 |
+
" df_ts['timestamp'] = pd.to_datetime(df_ts['t'], unit='us')\n",
|
| 975 |
+
" \n",
|
| 976 |
+
" print(f\"✓ Loaded {len(df_ts)} records\")\n",
|
| 977 |
+
" print(f\"\\nTime range: {df_ts['timestamp'].min()} to {df_ts['timestamp'].max()}\")\n",
|
| 978 |
+
" \n",
|
| 979 |
+
" # Display sample\n",
|
| 980 |
+
" display(df_ts[['timestamp', signal_name]].head(10))\n",
|
| 981 |
+
" \n",
|
| 982 |
+
" # Statistics\n",
|
| 983 |
+
" print(f\"\\nStatistics for {signal_name}:\")\n",
|
| 984 |
+
" print(f\" Mean: {df_ts[signal_name].mean():.2f}\")\n",
|
| 985 |
+
" print(f\" Min: {df_ts[signal_name].min():.2f}\")\n",
|
| 986 |
+
" print(f\" Max: {df_ts[signal_name].max():.2f}\")\n",
|
| 987 |
+
" print(f\" Std: {df_ts[signal_name].std():.2f}\")\n",
|
| 988 |
+
" else:\n",
|
| 989 |
+
" print(\"No data returned\")\n",
|
| 990 |
+
" \n",
|
| 991 |
+
" except Exception as e:\n",
|
| 992 |
+
" print(f\"✗ Error querying time series: {e}\")\n",
|
| 993 |
+
" import traceback\n",
|
| 994 |
+
" traceback.print_exc()"
|
| 995 |
+
]
|
| 996 |
+
},
|
| 997 |
+
{
|
| 998 |
+
"cell_type": "code",
|
| 999 |
+
"execution_count": null,
|
| 1000 |
+
"metadata": {},
|
| 1001 |
+
"outputs": [],
|
| 1002 |
+
"source": [
|
| 1003 |
+
"# Plot time series (if data available)\n",
|
| 1004 |
+
"if 'df_ts' in locals() and not df_ts.empty and 'timestamp' in df_ts.columns:\n",
|
| 1005 |
+
" try:\n",
|
| 1006 |
+
" plt.figure(figsize=(14, 6))\n",
|
| 1007 |
+
" plt.plot(df_ts['timestamp'], df_ts[signal_name], linewidth=0.5, alpha=0.7)\n",
|
| 1008 |
+
" plt.title(f\"Time Series: {signal_name}\", fontsize=14, fontweight='bold')\n",
|
| 1009 |
+
" plt.xlabel('Time', fontsize=12)\n",
|
| 1010 |
+
" plt.ylabel(signal_name, fontsize=12)\n",
|
| 1011 |
+
" plt.grid(True, alpha=0.3)\n",
|
| 1012 |
+
" plt.xticks(rotation=45)\n",
|
| 1013 |
+
" plt.tight_layout()\n",
|
| 1014 |
+
" plt.show()\n",
|
| 1015 |
+
" \n",
|
| 1016 |
+
" # Histogram\n",
|
| 1017 |
+
" plt.figure(figsize=(10, 6))\n",
|
| 1018 |
+
" plt.hist(df_ts[signal_name], bins=50, edgecolor='black', alpha=0.7)\n",
|
| 1019 |
+
" plt.title(f\"Distribution: {signal_name}\", fontsize=14, fontweight='bold')\n",
|
| 1020 |
+
" plt.xlabel(signal_name, fontsize=12)\n",
|
| 1021 |
+
" plt.ylabel('Frequency', fontsize=12)\n",
|
| 1022 |
+
" plt.grid(True, alpha=0.3)\n",
|
| 1023 |
+
" plt.tight_layout()\n",
|
| 1024 |
+
" plt.show()\n",
|
| 1025 |
+
" \n",
|
| 1026 |
+
" except Exception as e:\n",
|
| 1027 |
+
" print(f\"Error plotting: {e}\")"
|
| 1028 |
+
]
|
| 1029 |
+
},
|
| 1030 |
+
{
|
| 1031 |
+
"cell_type": "markdown",
|
| 1032 |
+
"metadata": {},
|
| 1033 |
+
"source": [
|
| 1034 |
+
"## Custom SQL Queries\n",
|
| 1035 |
+
"\n",
|
| 1036 |
+
"Execute custom SQL queries for advanced analysis."
|
| 1037 |
+
]
|
| 1038 |
+
},
|
| 1039 |
+
{
|
| 1040 |
+
"cell_type": "code",
|
| 1041 |
+
"execution_count": null,
|
| 1042 |
+
"metadata": {},
|
| 1043 |
+
"outputs": [],
|
| 1044 |
+
"source": [
|
| 1045 |
+
"# Example: Get record counts per device/message\n",
|
| 1046 |
+
"if devices and messages:\n",
|
| 1047 |
+
" device_id = devices[0]\n",
|
| 1048 |
+
" message = messages[0]\n",
|
| 1049 |
+
" table_name = catalog.get_table_name(device_id, message)\n",
|
| 1050 |
+
" \n",
|
| 1051 |
+
" custom_sql = f\"\"\"\n",
|
| 1052 |
+
" SELECT \n",
|
| 1053 |
+
" COUNT(*) as record_count,\n",
|
| 1054 |
+
" MIN(t) as min_timestamp,\n",
|
| 1055 |
+
" MAX(t) as max_timestamp\n",
|
| 1056 |
+
" FROM {config.database_name}.{table_name}\n",
|
| 1057 |
+
" \"\"\"\n",
|
| 1058 |
+
" \n",
|
| 1059 |
+
" try:\n",
|
| 1060 |
+
" df_stats = query.execute_sql(custom_sql)\n",
|
| 1061 |
+
" print(f\"Statistics for {device_id}/{message}:\")\n",
|
| 1062 |
+
" display(df_stats)\n",
|
| 1063 |
+
" except Exception as e:\n",
|
| 1064 |
+
" print(f\"Error executing custom SQL: {e}\")\n",
|
| 1065 |
+
" import traceback\n",
|
| 1066 |
+
" traceback.print_exc()"
|
| 1067 |
+
]
|
| 1068 |
+
},
|
| 1069 |
+
{
|
| 1070 |
+
"cell_type": "code",
|
| 1071 |
+
"execution_count": null,
|
| 1072 |
+
"metadata": {},
|
| 1073 |
+
"outputs": [],
|
| 1074 |
+
"source": [
|
| 1075 |
+
"# Example: Aggregation query\n",
|
| 1076 |
+
"if devices and messages and 'signal_cols' in locals() and signal_cols:\n",
|
| 1077 |
+
" device_id = devices[0]\n",
|
| 1078 |
+
" message = messages[0]\n",
|
| 1079 |
+
" signal_name = signal_cols[0]\n",
|
| 1080 |
+
" \n",
|
| 1081 |
+
" try:\n",
|
| 1082 |
+
" df_agg = query.aggregate(\n",
|
| 1083 |
+
" device_id=device_id,\n",
|
| 1084 |
+
" message=message,\n",
|
| 1085 |
+
" aggregation=f\"\"\"\n",
|
| 1086 |
+
" COUNT(*) as count,\n",
|
| 1087 |
+
" AVG({signal_name}) as avg_{signal_name},\n",
|
| 1088 |
+
" MIN({signal_name}) as min_{signal_name},\n",
|
| 1089 |
+
" MAX({signal_name}) as max_{signal_name},\n",
|
| 1090 |
+
" STDDEV({signal_name}) as std_{signal_name}\n",
|
| 1091 |
+
" \"\"\",\n",
|
| 1092 |
+
" )\n",
|
| 1093 |
+
" \n",
|
| 1094 |
+
" print(f\"Aggregation for {signal_name}:\")\n",
|
| 1095 |
+
" display(df_agg)\n",
|
| 1096 |
+
" \n",
|
| 1097 |
+
" except Exception as e:\n",
|
| 1098 |
+
" print(f\"Error in aggregation: {e}\")\n",
|
| 1099 |
+
" import traceback\n",
|
| 1100 |
+
" traceback.print_exc()"
|
| 1101 |
+
]
|
| 1102 |
+
},
|
| 1103 |
+
{
|
| 1104 |
+
"cell_type": "markdown",
|
| 1105 |
+
"metadata": {},
|
| 1106 |
+
"source": [
|
| 1107 |
+
"## Summary\n",
|
| 1108 |
+
"\n",
|
| 1109 |
+
"You've successfully:\n",
|
| 1110 |
+
"1. ✓ Connected to Athena\n",
|
| 1111 |
+
"2. ✓ Explored the data lake structure\n",
|
| 1112 |
+
"3. ✓ Queried sample data\n",
|
| 1113 |
+
"4. ✓ Analyzed time series\n",
|
| 1114 |
+
"5. ✓ Executed custom SQL queries\n",
|
| 1115 |
+
"\n",
|
| 1116 |
+
"### Next Steps\n",
|
| 1117 |
+
"\n",
|
| 1118 |
+
"- Modify the queries to explore your specific data\n",
|
| 1119 |
+
"- Add more visualizations\n",
|
| 1120 |
+
"- Perform statistical analysis\n",
|
| 1121 |
+
"- Export data for further analysis\n",
|
| 1122 |
+
"\n",
|
| 1123 |
+
"### Useful Commands\n",
|
| 1124 |
+
"\n",
|
| 1125 |
+
"```python\n",
|
| 1126 |
+
"# List all devices\n",
|
| 1127 |
+
"devices = catalog.list_devices()\n",
|
| 1128 |
+
"\n",
|
| 1129 |
+
"# List messages for a device\n",
|
| 1130 |
+
"messages = catalog.list_messages('device_id')\n",
|
| 1131 |
+
"\n",
|
| 1132 |
+
"# Get schema\n",
|
| 1133 |
+
"schema = catalog.get_schema('device_id', 'message_name')\n",
|
| 1134 |
+
"\n",
|
| 1135 |
+
"# Query data\n",
|
| 1136 |
+
"df = query.read_device_message('device_id', 'message_name', limit=1000)\n",
|
| 1137 |
+
"\n",
|
| 1138 |
+
"# Custom SQL\n",
|
| 1139 |
+
"df = query.execute_sql('SELECT * FROM database.table LIMIT 100')\n",
|
| 1140 |
+
"```"
|
| 1141 |
+
]
|
| 1142 |
+
}
|
| 1143 |
+
],
|
| 1144 |
+
"metadata": {
|
| 1145 |
+
"kernelspec": {
|
| 1146 |
+
"display_name": "venv",
|
| 1147 |
+
"language": "python",
|
| 1148 |
+
"name": "python3"
|
| 1149 |
+
},
|
| 1150 |
+
"language_info": {
|
| 1151 |
+
"codemirror_mode": {
|
| 1152 |
+
"name": "ipython",
|
| 1153 |
+
"version": 3
|
| 1154 |
+
},
|
| 1155 |
+
"file_extension": ".py",
|
| 1156 |
+
"mimetype": "text/x-python",
|
| 1157 |
+
"name": "python",
|
| 1158 |
+
"nbconvert_exporter": "python",
|
| 1159 |
+
"pygments_lexer": "ipython3",
|
| 1160 |
+
"version": "3.10.18"
|
| 1161 |
+
}
|
| 1162 |
+
},
|
| 1163 |
+
"nbformat": 4,
|
| 1164 |
+
"nbformat_minor": 2
|
| 1165 |
+
}
|
src/images/analysis.png
ADDED
|
src/images/logo.png
ADDED
|
src/images/oxon.jpeg
ADDED
|
src/requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas>=1.3.0
|
| 2 |
+
boto3>=1.20.0
|
| 3 |
+
matplotlib>=3.5.0
|
| 4 |
+
seaborn>=0.12.0
|
| 5 |
+
jupyter>=1.0.0
|
| 6 |
+
streamlit>=1.28.0
|
| 7 |
+
pyyaml>=6.0
|
| 8 |
+
plotly>=5.0.0
|
| 9 |
+
pillow>=9.0.0
|
| 10 |
+
ydata-profiling>=4.0.0
|
src/setup.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Setup script for CANedge Data Lake Python SDK.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from setuptools import setup, find_packages
|
| 6 |
+
|
| 7 |
+
with open("README.md", "r", encoding="utf-8") as fh:
|
| 8 |
+
long_description = fh.read()
|
| 9 |
+
|
| 10 |
+
setup(
|
| 11 |
+
name="canedge-datalake",
|
| 12 |
+
version="0.1.0",
|
| 13 |
+
author="CSS Electronics",
|
| 14 |
+
description="Production-ready Python package for querying and analyzing CAN/LIN data lakes",
|
| 15 |
+
long_description=long_description,
|
| 16 |
+
long_description_content_type="text/markdown",
|
| 17 |
+
url="https://github.com/CSS-Electronics/canedge-datalake",
|
| 18 |
+
packages=find_packages(),
|
| 19 |
+
classifiers=[
|
| 20 |
+
"Development Status :: 4 - Beta",
|
| 21 |
+
"Intended Audience :: Developers",
|
| 22 |
+
"Topic :: Scientific/Engineering",
|
| 23 |
+
"License :: OSI Approved :: MIT License",
|
| 24 |
+
"Programming Language :: Python :: 3",
|
| 25 |
+
"Programming Language :: Python :: 3.10",
|
| 26 |
+
"Programming Language :: Python :: 3.11",
|
| 27 |
+
"Programming Language :: Python :: 3.12",
|
| 28 |
+
],
|
| 29 |
+
python_requires=">=3.10",
|
| 30 |
+
install_requires=[
|
| 31 |
+
"pandas>=1.3.0",
|
| 32 |
+
"pyarrow>=8.0.0",
|
| 33 |
+
"boto3>=1.20.0",
|
| 34 |
+
],
|
| 35 |
+
extras_require={
|
| 36 |
+
"dev": [
|
| 37 |
+
"pytest>=7.0.0",
|
| 38 |
+
"black>=22.0.0",
|
| 39 |
+
"mypy>=0.950",
|
| 40 |
+
"ruff>=0.1.0",
|
| 41 |
+
],
|
| 42 |
+
},
|
| 43 |
+
)
|
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,1115 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import streamlit as st
|
| 5 |
|
|
|
|
| 6 |
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OXON Technologies - Professional Streamlit Dashboard
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
A comprehensive dashboard for analyzing device data from AWS Athena data lake.
|
| 5 |
"""
|
|
|
|
| 6 |
|
| 7 |
+
import streamlit as st
|
| 8 |
+
from warnings import filterwarnings
|
| 9 |
+
import base64
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from PIL import Image
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import numpy as np
|
| 14 |
+
import yaml
|
| 15 |
+
import re
|
| 16 |
+
import plotly.graph_objects as go
|
| 17 |
+
from typing import Dict, Optional, List, Tuple
|
| 18 |
+
|
| 19 |
+
from ydata_profiling import ProfileReport
|
| 20 |
+
import plotly.express as px
|
| 21 |
+
|
| 22 |
+
from datalake.config import DataLakeConfig
|
| 23 |
+
from datalake.athena import AthenaQuery
|
| 24 |
+
from datalake.catalog import DataLakeCatalog
|
| 25 |
+
from datalake.query import DataLakeQuery
|
| 26 |
+
from datalake.batch import BatchProcessor
|
| 27 |
+
|
| 28 |
+
from utils.correlation import CorrelationMatrixGenerator
|
| 29 |
+
from utils.dimension_reduction import DimensionReduction
|
| 30 |
+
from utils.feature_class import DetectFeatureClasses
|
| 31 |
+
|
| 32 |
+
# Ignore warnings
|
| 33 |
+
filterwarnings("ignore")
|
| 34 |
+
|
| 35 |
+
# ============================================================================
|
| 36 |
+
# Configuration Management
|
| 37 |
+
# ============================================================================
|
| 38 |
+
|
| 39 |
+
def load_config(config_path: str = "config.yaml") -> Dict:
|
| 40 |
+
"""
|
| 41 |
+
Load configuration from YAML file.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
config_path: Path to the configuration YAML file
|
| 45 |
+
|
| 46 |
+
Returns:
|
| 47 |
+
Dictionary containing configuration settings
|
| 48 |
+
|
| 49 |
+
Raises:
|
| 50 |
+
FileNotFoundError: If config file doesn't exist
|
| 51 |
+
yaml.YAMLError: If config file is invalid YAML
|
| 52 |
+
"""
|
| 53 |
+
config_file = Path(config_path)
|
| 54 |
+
if not config_file.exists():
|
| 55 |
+
raise FileNotFoundError(f"Configuration file not found: {config_path}")
|
| 56 |
+
|
| 57 |
+
with open(config_file, 'r') as f:
|
| 58 |
+
config = yaml.safe_load(f)
|
| 59 |
+
|
| 60 |
+
return config
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def initialize_aws_services(config: Dict) -> Tuple[DataLakeConfig, AthenaQuery, DataLakeCatalog, DataLakeQuery, BatchProcessor]:
|
| 64 |
+
"""
|
| 65 |
+
Initialize AWS services using configuration.
|
| 66 |
+
|
| 67 |
+
Args:
|
| 68 |
+
config: Configuration dictionary with AWS credentials
|
| 69 |
+
|
| 70 |
+
Returns:
|
| 71 |
+
Tuple of (config, athena, catalog, query, processor)
|
| 72 |
+
|
| 73 |
+
Raises:
|
| 74 |
+
KeyError: If required configuration keys are missing
|
| 75 |
+
Exception: If AWS service initialization fails
|
| 76 |
+
"""
|
| 77 |
+
aws_config = config.get('aws', {})
|
| 78 |
+
|
| 79 |
+
required_keys = ['database_name', 'workgroup', 's3_output_location', 'region',
|
| 80 |
+
'access_key_id', 'secret_access_key']
|
| 81 |
+
missing_keys = [key for key in required_keys if key not in aws_config]
|
| 82 |
+
|
| 83 |
+
if missing_keys:
|
| 84 |
+
raise KeyError(f"Missing required AWS configuration keys: {missing_keys}")
|
| 85 |
+
|
| 86 |
+
data_lake_config = DataLakeConfig.from_credentials(
|
| 87 |
+
database_name=aws_config['database_name'],
|
| 88 |
+
workgroup=aws_config['workgroup'],
|
| 89 |
+
s3_output_location=aws_config['s3_output_location'],
|
| 90 |
+
region=aws_config['region'],
|
| 91 |
+
access_key_id=aws_config['access_key_id'],
|
| 92 |
+
secret_access_key=aws_config['secret_access_key'],
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
athena = AthenaQuery(data_lake_config)
|
| 96 |
+
catalog = DataLakeCatalog(athena, data_lake_config)
|
| 97 |
+
query = DataLakeQuery(athena, catalog)
|
| 98 |
+
processor = BatchProcessor(query)
|
| 99 |
+
|
| 100 |
+
return data_lake_config, athena, catalog, query, processor
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ============================================================================
|
| 104 |
+
# Session State Management
|
| 105 |
+
# ============================================================================
|
| 106 |
+
|
| 107 |
+
def initialize_session_state():
|
| 108 |
+
"""Initialize all session state variables with proper defaults."""
|
| 109 |
+
# Configuration
|
| 110 |
+
if 'app_config' not in st.session_state:
|
| 111 |
+
try:
|
| 112 |
+
st.session_state['app_config'] = load_config()
|
| 113 |
+
except Exception as e:
|
| 114 |
+
st.session_state['app_config'] = None
|
| 115 |
+
st.session_state['config_error'] = str(e)
|
| 116 |
+
|
| 117 |
+
# AWS Services (only initialize when needed)
|
| 118 |
+
if 'aws_initialized' not in st.session_state:
|
| 119 |
+
st.session_state['aws_initialized'] = False
|
| 120 |
+
|
| 121 |
+
if 'aws_error' not in st.session_state:
|
| 122 |
+
st.session_state['aws_error'] = None
|
| 123 |
+
|
| 124 |
+
# User selections
|
| 125 |
+
if 'selected_device' not in st.session_state:
|
| 126 |
+
st.session_state['selected_device'] = None
|
| 127 |
+
|
| 128 |
+
if 'selected_message' not in st.session_state:
|
| 129 |
+
st.session_state['selected_message'] = None
|
| 130 |
+
|
| 131 |
+
if 'message_mapping' not in st.session_state:
|
| 132 |
+
st.session_state['message_mapping'] = None
|
| 133 |
+
|
| 134 |
+
# Date range filter
|
| 135 |
+
if 'date_range_enabled' not in st.session_state:
|
| 136 |
+
st.session_state['date_range_enabled'] = False
|
| 137 |
+
|
| 138 |
+
# Selected dates (what user picks in the UI)
|
| 139 |
+
if 'date_range_start' not in st.session_state:
|
| 140 |
+
st.session_state['date_range_start'] = None
|
| 141 |
+
|
| 142 |
+
if 'date_range_end' not in st.session_state:
|
| 143 |
+
st.session_state['date_range_end'] = None
|
| 144 |
+
|
| 145 |
+
# Applied dates (what's actually being used for filtering)
|
| 146 |
+
if 'applied_date_range_start' not in st.session_state:
|
| 147 |
+
st.session_state['applied_date_range_start'] = None
|
| 148 |
+
|
| 149 |
+
if 'applied_date_range_end' not in st.session_state:
|
| 150 |
+
st.session_state['applied_date_range_end'] = None
|
| 151 |
+
|
| 152 |
+
# Data cache
|
| 153 |
+
if 'device_list' not in st.session_state:
|
| 154 |
+
st.session_state['device_list'] = None
|
| 155 |
+
|
| 156 |
+
if 'message_list' not in st.session_state:
|
| 157 |
+
st.session_state['message_list'] = None
|
| 158 |
+
|
| 159 |
+
if 'current_data' not in st.session_state:
|
| 160 |
+
st.session_state['current_data'] = None
|
| 161 |
+
|
| 162 |
+
# Correlations tab
|
| 163 |
+
if 'correlations_run_clicked' not in st.session_state:
|
| 164 |
+
st.session_state['correlations_run_clicked'] = False
|
| 165 |
+
|
| 166 |
+
if 'correlations_data' not in st.session_state:
|
| 167 |
+
st.session_state['correlations_data'] = None
|
| 168 |
+
|
| 169 |
+
if 'correlation_matrix' not in st.session_state:
|
| 170 |
+
st.session_state['correlation_matrix'] = None
|
| 171 |
+
|
| 172 |
+
if 'feature_clusters' not in st.session_state:
|
| 173 |
+
st.session_state['feature_clusters'] = None
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def initialize_aws_if_needed():
|
| 177 |
+
"""
|
| 178 |
+
Initialize AWS services if not already initialized.
|
| 179 |
+
Returns True if successful, False otherwise.
|
| 180 |
+
"""
|
| 181 |
+
if st.session_state['aws_initialized']:
|
| 182 |
+
return True
|
| 183 |
+
|
| 184 |
+
if st.session_state['app_config'] is None:
|
| 185 |
+
return False
|
| 186 |
+
|
| 187 |
+
try:
|
| 188 |
+
config, athena, catalog, query, processor = initialize_aws_services(
|
| 189 |
+
st.session_state['app_config']
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
st.session_state['config'] = config
|
| 193 |
+
st.session_state['athena'] = athena
|
| 194 |
+
st.session_state['catalog'] = catalog
|
| 195 |
+
st.session_state['query'] = query
|
| 196 |
+
st.session_state['processor'] = processor
|
| 197 |
+
st.session_state['aws_initialized'] = True
|
| 198 |
+
st.session_state['aws_error'] = None
|
| 199 |
+
|
| 200 |
+
return True
|
| 201 |
+
except Exception as e:
|
| 202 |
+
st.session_state['aws_error'] = str(e)
|
| 203 |
+
st.session_state['aws_initialized'] = False
|
| 204 |
+
return False
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# ============================================================================
|
| 208 |
+
# UI Components
|
| 209 |
+
# ============================================================================
|
| 210 |
+
|
| 211 |
+
def get_base64_image(image_path: str) -> Optional[str]:
|
| 212 |
+
"""
|
| 213 |
+
Convert image to base64 string.
|
| 214 |
+
|
| 215 |
+
Args:
|
| 216 |
+
image_path: Path to the image file
|
| 217 |
+
|
| 218 |
+
Returns:
|
| 219 |
+
Base64 encoded string or None if file not found
|
| 220 |
+
"""
|
| 221 |
+
try:
|
| 222 |
+
image_file = Path(image_path)
|
| 223 |
+
if not image_file.exists():
|
| 224 |
+
return None
|
| 225 |
+
|
| 226 |
+
with open(image_file, "rb") as f:
|
| 227 |
+
return base64.b64encode(f.read()).decode()
|
| 228 |
+
except Exception:
|
| 229 |
+
return None
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def display_header(logo_path: str, title: str):
|
| 233 |
+
"""
|
| 234 |
+
Display header with logo and title.
|
| 235 |
+
|
| 236 |
+
Args:
|
| 237 |
+
logo_path: Path to logo image
|
| 238 |
+
title: Header title text
|
| 239 |
+
"""
|
| 240 |
+
logo_base64 = get_base64_image(logo_path)
|
| 241 |
+
|
| 242 |
+
if logo_base64:
|
| 243 |
+
st.markdown(
|
| 244 |
+
f"""
|
| 245 |
+
<div style="display: flex; align-items: center;">
|
| 246 |
+
<img src="data:image/png;base64,{logo_base64}" alt="Logo"
|
| 247 |
+
style="height: 200px; margin-right: 10px;">
|
| 248 |
+
<h1 style="display: inline; margin: 0;">{title} 🔍</h1>
|
| 249 |
+
</div>
|
| 250 |
+
""",
|
| 251 |
+
unsafe_allow_html=True,
|
| 252 |
+
)
|
| 253 |
+
else:
|
| 254 |
+
st.title(f"{title} 🔍")
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def display_sidebar():
|
| 258 |
+
"""Display sidebar with device selection."""
|
| 259 |
+
with st.sidebar:
|
| 260 |
+
# Logo
|
| 261 |
+
logo_path = st.session_state['app_config'].get('dashboard', {}).get('logo_path', 'images/logo.png')
|
| 262 |
+
try:
|
| 263 |
+
st.image(Image.open(logo_path), width='stretch')
|
| 264 |
+
except Exception:
|
| 265 |
+
st.write("OXON Technologies")
|
| 266 |
+
|
| 267 |
+
st.title("OXON Technologies")
|
| 268 |
+
st.write("Welcome to the OXON Technologies dashboard. "
|
| 269 |
+
"Select a device ID and click **Go!** to begin analysis.")
|
| 270 |
+
|
| 271 |
+
# Check if AWS services are initialized
|
| 272 |
+
if not st.session_state['aws_initialized']:
|
| 273 |
+
st.warning("⚠️ AWS services not initialized. Please check configuration.")
|
| 274 |
+
return
|
| 275 |
+
|
| 276 |
+
# Load device list if not cached
|
| 277 |
+
if st.session_state['device_list'] is None:
|
| 278 |
+
try:
|
| 279 |
+
with st.spinner("Loading devices..."):
|
| 280 |
+
st.session_state['device_list'] = st.session_state['catalog'].list_devices()
|
| 281 |
+
except Exception as e:
|
| 282 |
+
st.error(f"Error loading devices: {str(e)}")
|
| 283 |
+
return
|
| 284 |
+
|
| 285 |
+
devices_list = st.session_state['device_list']
|
| 286 |
+
|
| 287 |
+
if not devices_list:
|
| 288 |
+
st.warning("No devices found in the data lake.")
|
| 289 |
+
return
|
| 290 |
+
|
| 291 |
+
# Device selection
|
| 292 |
+
current_index = 0
|
| 293 |
+
if st.session_state['selected_device'] in devices_list:
|
| 294 |
+
current_index = devices_list.index(st.session_state['selected_device'])
|
| 295 |
+
|
| 296 |
+
selected_device = st.selectbox(
|
| 297 |
+
"Device ID",
|
| 298 |
+
devices_list,
|
| 299 |
+
index=current_index,
|
| 300 |
+
key="sidebar_device_select"
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
# Apply device selection only when user clicks the button
|
| 304 |
+
if st.button("Go!", key="device_go_btn", width='stretch'):
|
| 305 |
+
st.session_state['selected_device'] = selected_device
|
| 306 |
+
st.session_state['selected_message'] = None
|
| 307 |
+
st.session_state['message_list'] = None
|
| 308 |
+
st.session_state['message_mapping'] = None
|
| 309 |
+
st.session_state['current_data'] = None
|
| 310 |
+
st.session_state['date_range_enabled'] = False
|
| 311 |
+
st.session_state['date_range_start'] = None
|
| 312 |
+
st.session_state['date_range_end'] = None
|
| 313 |
+
st.session_state['applied_date_range_start'] = None
|
| 314 |
+
st.session_state['applied_date_range_end'] = None
|
| 315 |
+
st.session_state['correlations_run_clicked'] = False
|
| 316 |
+
st.session_state['correlations_data'] = None
|
| 317 |
+
st.session_state['correlation_matrix'] = None
|
| 318 |
+
st.session_state['feature_clusters'] = None
|
| 319 |
+
st.rerun()
|
| 320 |
+
|
| 321 |
+
# Show selected device info only after user has confirmed
|
| 322 |
+
if st.session_state['selected_device']:
|
| 323 |
+
st.success(f"✓ Selected: {st.session_state['selected_device']}")
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# ============================================================================
|
| 327 |
+
# Message Processing
|
| 328 |
+
# ============================================================================
|
| 329 |
+
|
| 330 |
+
def build_message_mapping(messages_list: List[str], mapping_config: Dict) -> Tuple[Dict[str, str], List[str]]:
|
| 331 |
+
"""
|
| 332 |
+
Build message mapping dictionary from raw messages.
|
| 333 |
+
|
| 334 |
+
Args:
|
| 335 |
+
messages_list: List of raw message names
|
| 336 |
+
mapping_config: Configuration dictionary with message mappings
|
| 337 |
+
|
| 338 |
+
Returns:
|
| 339 |
+
Tuple of (messages_mapping_dict, lost_messages_list)
|
| 340 |
+
"""
|
| 341 |
+
pattern = re.compile(r"s(?P<s>\d{2})pid.*m(?P<m>[0-9a-fA-F]{2})$")
|
| 342 |
+
|
| 343 |
+
messages_mapping_dict = {}
|
| 344 |
+
lost_messages_list = []
|
| 345 |
+
|
| 346 |
+
for message in messages_list:
|
| 347 |
+
|
| 348 |
+
# Do not change name for messages that are not can1
|
| 349 |
+
if not message.startswith('can1'):
|
| 350 |
+
messages_mapping_dict[message] = message
|
| 351 |
+
continue
|
| 352 |
+
|
| 353 |
+
message_id_parts = pattern.search(message)
|
| 354 |
+
if not message_id_parts:
|
| 355 |
+
continue
|
| 356 |
+
|
| 357 |
+
message_id = (message_id_parts.group("s") + message_id_parts.group("m")).upper()
|
| 358 |
+
|
| 359 |
+
if message_id in mapping_config:
|
| 360 |
+
message_name = mapping_config[message_id]['name']
|
| 361 |
+
messages_mapping_dict[message_name] = message
|
| 362 |
+
else:
|
| 363 |
+
lost_messages_list.append(message)
|
| 364 |
+
|
| 365 |
+
return messages_mapping_dict, lost_messages_list
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def load_message_list(device_id: str) -> Optional[List[str]]:
|
| 369 |
+
"""
|
| 370 |
+
Load message list for a device.
|
| 371 |
+
|
| 372 |
+
Args:
|
| 373 |
+
device_id: Device ID to load messages for
|
| 374 |
+
|
| 375 |
+
Returns:
|
| 376 |
+
List of message names or None if error
|
| 377 |
+
"""
|
| 378 |
+
try:
|
| 379 |
+
return st.session_state['catalog'].list_messages(device_id)
|
| 380 |
+
except Exception as e:
|
| 381 |
+
st.error(f"Error loading messages: {str(e)}")
|
| 382 |
+
return None
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
# ============================================================================
|
| 386 |
+
# Tab Components
|
| 387 |
+
# ============================================================================
|
| 388 |
+
|
| 389 |
+
def render_message_viewer_tab():
|
| 390 |
+
"""Render the Message Viewer tab."""
|
| 391 |
+
# Check prerequisites
|
| 392 |
+
if not st.session_state['aws_initialized']:
|
| 393 |
+
st.error("AWS services not initialized. Please check configuration.")
|
| 394 |
+
return
|
| 395 |
+
|
| 396 |
+
if not st.session_state['selected_device']:
|
| 397 |
+
st.info("👈 Please select a device from the sidebar and click **Go!** to begin.")
|
| 398 |
+
return
|
| 399 |
+
|
| 400 |
+
device_id = st.session_state['selected_device']
|
| 401 |
+
|
| 402 |
+
# Load message list if not cached
|
| 403 |
+
if st.session_state['message_list'] is None:
|
| 404 |
+
with st.spinner(f"Loading messages for device {device_id}..."):
|
| 405 |
+
st.session_state['message_list'] = load_message_list(device_id)
|
| 406 |
+
|
| 407 |
+
if st.session_state['message_list'] is None:
|
| 408 |
+
return
|
| 409 |
+
|
| 410 |
+
messages_list = st.session_state['message_list']
|
| 411 |
+
|
| 412 |
+
if not messages_list:
|
| 413 |
+
st.warning(f"No messages found for device {device_id}.")
|
| 414 |
+
return
|
| 415 |
+
|
| 416 |
+
# Get message mapping configuration
|
| 417 |
+
mapping_config = st.session_state['app_config'].get('message_mapping', {})
|
| 418 |
+
|
| 419 |
+
# Build message mapping
|
| 420 |
+
if st.session_state['message_mapping'] is None:
|
| 421 |
+
messages_mapping_dict, lost_messages_list = build_message_mapping(
|
| 422 |
+
messages_list, mapping_config
|
| 423 |
+
)
|
| 424 |
+
st.session_state['message_mapping'] = messages_mapping_dict
|
| 425 |
+
|
| 426 |
+
if lost_messages_list:
|
| 427 |
+
st.warning(
|
| 428 |
+
f"The following messages were not found in the mapping: "
|
| 429 |
+
f"{', '.join(lost_messages_list[:10])}"
|
| 430 |
+
f"{'...' if len(lost_messages_list) > 10 else ''}"
|
| 431 |
+
)
|
| 432 |
+
else:
|
| 433 |
+
messages_mapping_dict = st.session_state['message_mapping']
|
| 434 |
+
|
| 435 |
+
if not messages_mapping_dict:
|
| 436 |
+
st.warning("No valid messages found after mapping.")
|
| 437 |
+
return
|
| 438 |
+
|
| 439 |
+
# Message selection
|
| 440 |
+
current_index = 0
|
| 441 |
+
if st.session_state['selected_message']:
|
| 442 |
+
# Find the message name that corresponds to selected_message
|
| 443 |
+
for name, msg in messages_mapping_dict.items():
|
| 444 |
+
if msg == st.session_state['selected_message']:
|
| 445 |
+
if name in list(messages_mapping_dict.keys()):
|
| 446 |
+
current_index = list(messages_mapping_dict.keys()).index(name)
|
| 447 |
+
break
|
| 448 |
+
|
| 449 |
+
st.markdown('<div style="text-align: center;"><h2>Message Viewer</h2></div>', unsafe_allow_html=True)
|
| 450 |
+
st.divider()
|
| 451 |
+
|
| 452 |
+
selected_message_name = st.selectbox(
|
| 453 |
+
"Select Message",
|
| 454 |
+
list(messages_mapping_dict.keys()),
|
| 455 |
+
index=current_index,
|
| 456 |
+
key="message_selectbox"
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
message_clicked = st.button("Show!", key="message_show_btn", width='stretch')
|
| 460 |
+
|
| 461 |
+
selected_message = messages_mapping_dict[selected_message_name]
|
| 462 |
+
|
| 463 |
+
# Apply message selection only when user clicks the button
|
| 464 |
+
if message_clicked:
|
| 465 |
+
st.session_state['selected_message'] = selected_message
|
| 466 |
+
st.session_state['current_data'] = None
|
| 467 |
+
st.rerun()
|
| 468 |
+
|
| 469 |
+
if st.session_state['selected_message']:
|
| 470 |
+
st.info(f"📊 Selected message: `{st.session_state['selected_message']}` ({selected_message_name})")
|
| 471 |
+
|
| 472 |
+
# Date range selection (optional filter)
|
| 473 |
+
st.divider()
|
| 474 |
+
date_range_enabled = st.checkbox(
|
| 475 |
+
"Filter by Date Range",
|
| 476 |
+
value=st.session_state.get('date_range_enabled', False),
|
| 477 |
+
key="date_range_checkbox",
|
| 478 |
+
help="Enable to filter data by date range"
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
+
if date_range_enabled:
|
| 482 |
+
# Get min/max dates from cached data if available
|
| 483 |
+
min_date = None
|
| 484 |
+
max_date = None
|
| 485 |
+
if st.session_state.get('current_data') is not None:
|
| 486 |
+
try:
|
| 487 |
+
df_temp = st.session_state['current_data']
|
| 488 |
+
if 'timestamp' in df_temp.columns:
|
| 489 |
+
min_date = df_temp['timestamp'].min().date()
|
| 490 |
+
max_date = df_temp['timestamp'].max().date()
|
| 491 |
+
except Exception:
|
| 492 |
+
pass
|
| 493 |
+
|
| 494 |
+
col_start, col_end = st.columns([1, 1])
|
| 495 |
+
|
| 496 |
+
with col_start:
|
| 497 |
+
date_start = st.date_input(
|
| 498 |
+
"Start Date",
|
| 499 |
+
value=st.session_state.get('date_range_start') or min_date,
|
| 500 |
+
min_value=min_date,
|
| 501 |
+
max_value=max_date,
|
| 502 |
+
key="date_range_start_input",
|
| 503 |
+
help="Select start date for filtering"
|
| 504 |
+
)
|
| 505 |
+
|
| 506 |
+
with col_end:
|
| 507 |
+
date_end = st.date_input(
|
| 508 |
+
"End Date",
|
| 509 |
+
value=st.session_state.get('date_range_end') or max_date,
|
| 510 |
+
min_value=min_date,
|
| 511 |
+
max_value=max_date,
|
| 512 |
+
key="date_range_end_input",
|
| 513 |
+
help="Select end date for filtering"
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
apply_filter_clicked = st.button(
|
| 517 |
+
"Apply Filter",
|
| 518 |
+
key="apply_date_filter_btn",
|
| 519 |
+
use_container_width=True
|
| 520 |
+
)
|
| 521 |
+
|
| 522 |
+
# Update selected dates in session state
|
| 523 |
+
st.session_state['date_range_start'] = date_start
|
| 524 |
+
st.session_state['date_range_end'] = date_end
|
| 525 |
+
|
| 526 |
+
# Apply filter only when button is clicked
|
| 527 |
+
if apply_filter_clicked:
|
| 528 |
+
# Validate date range before applying
|
| 529 |
+
if date_start > date_end:
|
| 530 |
+
st.error("⚠️ Start date must be before or equal to end date.")
|
| 531 |
+
else:
|
| 532 |
+
st.session_state['applied_date_range_start'] = date_start
|
| 533 |
+
st.session_state['applied_date_range_end'] = date_end
|
| 534 |
+
st.rerun()
|
| 535 |
+
|
| 536 |
+
# Show current applied filter status
|
| 537 |
+
if st.session_state.get('applied_date_range_start') and st.session_state.get('applied_date_range_end'):
|
| 538 |
+
st.success(
|
| 539 |
+
f"📅 **Applied filter:** {st.session_state['applied_date_range_start']} to "
|
| 540 |
+
f"{st.session_state['applied_date_range_end']}"
|
| 541 |
+
)
|
| 542 |
+
elif date_start and date_end:
|
| 543 |
+
if date_start <= date_end:
|
| 544 |
+
st.info("ℹ️ Select dates and click **Apply Filter** to filter the data.")
|
| 545 |
+
else:
|
| 546 |
+
st.error("⚠️ Start date must be before or equal to end date.")
|
| 547 |
+
else:
|
| 548 |
+
# Clear applied date range when disabled
|
| 549 |
+
if st.session_state.get('date_range_enabled'):
|
| 550 |
+
st.session_state['applied_date_range_start'] = None
|
| 551 |
+
st.session_state['applied_date_range_end'] = None
|
| 552 |
+
st.session_state['date_range_start'] = None
|
| 553 |
+
st.session_state['date_range_end'] = None
|
| 554 |
+
|
| 555 |
+
# Update enabled state
|
| 556 |
+
st.session_state['date_range_enabled'] = date_range_enabled
|
| 557 |
+
|
| 558 |
+
render_message_data(device_id, st.session_state['selected_message'])
|
| 559 |
+
else:
|
| 560 |
+
st.info("Select a message and click **Show!** to load data.")
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
def render_message_data(device_id: str, message: str):
    """
    Render data table, profile report, and daily-aggregated plot for a selected message.

    Loads (and caches in ``st.session_state['current_data']``) the message data for the
    device, applies the user-selected date-range filter, then renders overview metrics,
    the raw data, a pandas-profiling report, and a Plotly daily time-series chart.

    Args:
        device_id: Device ID
        message: Message name
    """
    # Load data if not cached (cache avoids re-querying Athena on every rerun)
    if st.session_state['current_data'] is None:
        with st.spinner("Loading data..."):
            try:
                df = st.session_state['query'].read_device_message(
                    device_id=device_id,
                    message=message,
                )

                if df is None or df.empty:
                    st.warning("No data found for the selected message.")
                    return

                # Process data: parse timestamps and sort chronologically
                df['t'] = pd.to_datetime(df['t'])
                df = df.sort_values(by='t').reset_index(drop=True)
                df = df.rename(columns={'t': 'timestamp'})

                st.session_state['current_data'] = df
            except Exception as e:
                st.error(f"Error loading data: {str(e)}")
                return

    df = st.session_state['current_data'].copy()
    df = df.drop(columns=['date_created'], errors='ignore')

    if df is None or df.empty:
        return

    # Apply date range filter if enabled and applied dates are set
    original_row_count = len(df)
    if (st.session_state.get('date_range_enabled') and
            st.session_state.get('applied_date_range_start') and
            st.session_state.get('applied_date_range_end')):

        start_date = pd.to_datetime(st.session_state['applied_date_range_start'])
        end_date = pd.to_datetime(st.session_state['applied_date_range_end'])
        # Include the entire end date by using an EXCLUSIVE bound at the start of
        # the next day.  (The previous replace(hour=23, minute=59, second=59)
        # approach silently dropped timestamps in the final second of the end
        # day, e.g. 23:59:59.500.)
        end_exclusive = end_date + pd.Timedelta(days=1)

        df = df[(df['timestamp'] >= start_date) & (df['timestamp'] < end_exclusive)].copy()

        if len(df) == 0:
            st.warning(
                f"⚠️ No data found in the selected date range "
                f"({st.session_state['applied_date_range_start']} to {st.session_state['applied_date_range_end']})."
            )
            st.info("Try selecting a different date range or disable the filter to see all data.")
            return
        elif len(df) < original_row_count:
            st.info(f"📊 Showing {len(df):,} of {original_row_count:,} records (filtered by date range).")

    # Display statistics
    st.divider()
    st.markdown('<div style="text-align: center;"><h2>Overview</h2></div>', unsafe_allow_html=True)
    st.divider()
    col1, col2, col3, col4 = st.columns([1, 2, 1, 1])

    with col1:
        st.metric("Total Records", len(df))
    with col2:
        st.metric("Date Range", f"{df['timestamp'].min().date()} to {df['timestamp'].max().date()}")
    with col3:
        st.metric("Data Columns", len(df.columns) - 1)  # Exclude timestamp
    with col4:
        st.metric("Time Span", f"{(df['timestamp'].max() - df['timestamp'].min()).days} days")

    # Display data section
    st.divider()
    st.markdown('<div style="text-align: center;"><h2>Data & Profile Report</h2></div>', unsafe_allow_html=True)
    st.divider()

    col1, col2 = st.columns([1, 2])
    with col1:
        try:
            st.dataframe(df.set_index('timestamp'), width='stretch', height=700)
        except Exception as e:  # dataframe was too large
            st.warning(f"Dataframe was too large to display: {str(e)}")
            st.info("Dataframe was too large to display. Please use the profile report to analyze the data.")
    with col2:
        try:
            # low_categorical_threshold=0 keeps numeric columns numeric even
            # when they have few distinct values
            pr = ProfileReport(df, title="Data Profile", explorative=False, vars={"num": {"low_categorical_threshold": 0}})
            st.components.v1.html(pr.to_html(), scrolling=True, height=700)
        except Exception as e:
            st.warning(f"Profile report could not be generated: {e}")

    # Display plot section
    st.divider()
    st.markdown('<div style="text-align: center;"><h2>Visualization</h2></div>', unsafe_allow_html=True)
    st.divider()

    try:
        # Prepare aggregated data: daily means, missing days filled with 0
        daily_aggregated_df = df.groupby(
            pd.Grouper(key='timestamp', freq='D')
        ).mean().reset_index().fillna(0)

        # Create plot
        fig = go.Figure()

        data_columns = [col for col in daily_aggregated_df.columns
                        if col not in ['timestamp']]

        for column in data_columns:
            fig.add_trace(
                go.Scatter(
                    x=daily_aggregated_df['timestamp'],
                    y=daily_aggregated_df[column],
                    name=column,
                    mode='lines+markers'
                )
            )

        # Red vertical line at the configured dosing date with legend entry "Dosing Stage"
        dosing_date = st.session_state['app_config'].get('dashboard', {}).get('dosing_stage_date', '2025-12-16')
        try:
            dosing_datetime = pd.to_datetime(dosing_date)
            if data_columns:
                y_min = daily_aggregated_df[data_columns].min().min()
                y_max = daily_aggregated_df[data_columns].max().max()
                if y_min == y_max:
                    # Avoid a zero-height line when all values are identical
                    y_min, y_max = y_min - 0.1, y_max + 0.1
            else:
                y_min, y_max = 0, 1
            # Add vertical line as a trace so it appears in the legend as "Dosing Stage"
            fig.add_trace(
                go.Scatter(
                    x=[dosing_datetime, dosing_datetime],
                    y=[y_min, y_max],
                    mode='lines',
                    name='Dosing Stage',
                    line=dict(color='red', width=2)
                )
            )
        except Exception:
            # Dosing line is decorative; never fail the whole chart over it
            pass

        # Update layout with legend
        fig.update_layout(
            title="Daily Aggregated Data",
            xaxis_title="Date",
            yaxis_title="Value",
            hovermode='x unified',
            width=800,
            height=700,
            showlegend=True,
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1,
                title_text=""
            )
        )
        st.plotly_chart(fig, width='stretch')

    except Exception as e:
        st.error(f"Error creating visualization: {str(e)}")
| 732 |
+
|
| 733 |
+
def load_all_device_messages(device_id: str) -> Optional[pd.DataFrame]:
    """
    Load all messages for a device, aggregate daily, and merge on timestamp.

    Messages whose name starts with 'can9' are excluded.  Each remaining message
    is loaded, aggregated to daily means, its columns prefixed with the message
    name, and the per-message frames are outer-merged on the daily timestamp.

    Args:
        device_id: Device ID to load messages for

    Returns:
        Merged DataFrame with all messages aggregated daily, or None if error
    """
    try:
        messages_list = st.session_state['catalog'].list_messages(device_id)
        if not messages_list:
            return None

        # Filter excluded messages up front so progress is computed over the
        # messages actually loaded.  (Previously skipped 'can9' messages still
        # counted toward the total, so the progress bar never reached 100% and
        # the "N/M" status counter was wrong.)
        messages_to_load = [m for m in messages_list if not m.startswith('can9')]
        if not messages_to_load:
            return None

        aggregated_dfs = []
        failed_messages = []

        progress_bar = st.progress(0)
        status_text = st.empty()

        total_messages = len(messages_to_load)

        for idx, message in enumerate(messages_to_load):
            status_text.text(f"Loading message {idx + 1}/{total_messages}: {message}")
            progress_bar.progress((idx + 1) / total_messages)

            try:
                # Load message data
                df = st.session_state['query'].read_device_message(
                    device_id=device_id,
                    message=message,
                )

                if df is None or df.empty:
                    failed_messages.append(message)
                    continue

                # Process data: parse timestamps and sort chronologically
                df['t'] = pd.to_datetime(df['t'])
                df = df.sort_values(by='t').reset_index(drop=True)
                df = df.rename(columns={'t': 'timestamp'})

                # Drop date_created column
                df = df.drop(columns=['date_created'], errors='ignore')

                # Aggregate daily by mean
                daily_df = df.groupby(
                    pd.Grouper(key='timestamp', freq='D')
                ).mean().reset_index()

                # Remove rows with all NaN (days with no data)
                daily_df = daily_df.dropna(how='all', subset=[col for col in daily_df.columns if col != 'timestamp'])

                if daily_df.empty:
                    failed_messages.append(message)
                    continue

                # Rename columns to include the message name (except timestamp)
                # so column names stay unique after merging multiple messages:
                # message_name__column_name
                rename_dict = {
                    col: f"{message}__{col}"
                    for col in daily_df.columns
                    if col != 'timestamp'
                }
                daily_df = daily_df.rename(columns=rename_dict)

                aggregated_dfs.append(daily_df)

            except Exception as e:
                failed_messages.append(f"{message} ({str(e)})")
                continue

        progress_bar.empty()
        status_text.empty()

        if not aggregated_dfs:
            if failed_messages:
                st.warning(f"Failed to load all messages. Errors: {', '.join(failed_messages[:5])}")
            return None

        if failed_messages:
            st.warning(f"Failed to load {len(failed_messages)} message(s). Continuing with {len(aggregated_dfs)} messages.")

        # Merge all dataframes on timestamp
        merged_df = aggregated_dfs[0]
        for df in aggregated_dfs[1:]:
            merged_df = pd.merge(
                merged_df,
                df,
                on='timestamp',
                how='outer'  # Keep all days from all messages
            )

        # Sort by timestamp
        merged_df = merged_df.sort_values(by='timestamp').reset_index(drop=True)

        # Fill NaN with 0 for numeric columns
        numeric_cols = merged_df.select_dtypes(include=[np.number]).columns
        merged_df[numeric_cols] = merged_df[numeric_cols].fillna(0)

        return merged_df

    except Exception as e:
        st.error(f"Error loading device messages: {str(e)}")
        return None
+
|
| 845 |
+
|
| 846 |
+
def _reset_correlations():
    """Clear correlations run state and caches (used by Start over button)."""
    st.session_state['correlations_run_clicked'] = False
    # Drop every cached artifact of the correlation workflow so the next run
    # recomputes from scratch.
    for cache_key in ('correlations_data', 'correlation_matrix', 'feature_clusters'):
        st.session_state[cache_key] = None
+
|
| 853 |
+
|
| 854 |
+
def render_correlations_tab():
    """Render the Correlations tab with correlation matrix and feature clusters.

    Workflow: (1) classify features by type, (2) build a mixed-type correlation
    matrix, (3) cluster features into cohorts by correlation band, (4) summarize.
    Expensive intermediate results are cached in session state so Streamlit
    reruns do not repeat them; the "Start over" button clears the caches.
    """
    from collections import Counter  # stdlib frequency counting

    # Check prerequisites
    if not st.session_state['aws_initialized']:
        st.error("AWS services not initialized. Please check configuration.")
        return

    if not st.session_state['selected_device']:
        st.info("👈 Please select a device from the sidebar and click **Go!** to begin.")
        return

    device_id = st.session_state['selected_device']

    st.markdown('<div style="text-align: center;"><h2>Correlation Analysis</h2></div>', unsafe_allow_html=True)
    st.divider()

    # Run button: calculations start only after user presses it
    if not st.session_state.get('correlations_run_clicked'):
        st.info(
            "This analysis loads **all messages** for the selected device, aggregates them daily, "
            "and computes correlations and feature cohorts. Click the button below to start."
        )
        if st.button("Run Correlation Analysis", key="run_correlations_btn", type="primary", use_container_width=True):
            st.session_state['correlations_run_clicked'] = True
            st.rerun()
        return

    # Load all device messages if not cached
    if st.session_state['correlations_data'] is None:
        with st.spinner(f"Loading all messages for device {device_id}..."):
            st.session_state['correlations_data'] = load_all_device_messages(device_id)

    if st.session_state['correlations_data'] is None or st.session_state['correlations_data'].empty:
        st.error("No data available for correlation analysis.")
        if st.button("Start over", key="correlations_start_over_btn"):
            _reset_correlations()
            st.rerun()
        return

    df = st.session_state['correlations_data'].copy()

    # Remove timestamp column for correlation analysis
    df_features = df.drop(columns=['timestamp'])

    if df_features.empty:
        st.error("No features available for correlation analysis.")
        return

    st.info(f"📊 Analyzing {len(df_features.columns)} features from {len(df)} days of data.")

    # Detect feature classes
    st.subheader("1. Feature Classification")
    with st.spinner("Classifying features..."):
        try:
            detector = DetectFeatureClasses(df_features, categorical_threshold=0.5, string_data_policy='drop')
            feature_classes, dropped_features = detector.feature_classes()

            if dropped_features:
                st.warning(f"Dropped {len(dropped_features)} non-numeric features: {', '.join(dropped_features[:5])}")
                df_features = df_features.drop(columns=dropped_features)

            # Display feature class summary (Counter replaces the manual
            # dict-accumulation loop; missing classes default to 0)
            class_counts = Counter(feature_classes.values())

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Continuous", class_counts.get('Continuous', 0))
            with col2:
                st.metric("Binary", class_counts.get('Binary', 0))
            with col3:
                st.metric("Categorical", class_counts.get('Categorical', 0))

        except Exception as e:
            st.error(f"Error classifying features: {str(e)}")
            return

    # Generate correlation matrix
    st.subheader("2. Correlation Matrix")
    if st.session_state['correlation_matrix'] is None:
        with st.spinner("Generating correlation matrix (this may take a while)..."):
            try:
                corr_generator = CorrelationMatrixGenerator(
                    df=df_features,
                    feature_classes=feature_classes,
                    continuous_vs_continuous_method='pearson'
                )
                st.session_state['correlation_matrix'] = corr_generator.generate_matrix()
            except Exception as e:
                st.error(f"Error generating correlation matrix: {str(e)}")
                return

    corr_matrix = st.session_state['correlation_matrix']

    # Display interactive heatmap
    st.markdown("**Interactive Correlation Heatmap**")
    try:
        # Create heatmap using plotly; figure size scales with feature count
        fig = px.imshow(
            corr_matrix,
            color_continuous_scale='RdBu',
            aspect='auto',
            labels=dict(x="Feature", y="Feature", color="Correlation"),
            title="Feature Correlation Matrix"
        )
        fig.update_layout(
            height=max(800, len(corr_matrix) * 40),
            width=max(800, len(corr_matrix) * 40)
        )
        st.plotly_chart(fig, use_container_width=True)
    except Exception as e:
        st.error(f"Error displaying heatmap: {str(e)}")

    # Find feature clusters using dimension reduction
    st.subheader("3. Feature Clusters (Cohorts)")
    if st.session_state['feature_clusters'] is None:
        with st.spinner("Finding feature clusters..."):
            try:
                dim_reduction = DimensionReduction(
                    dataframe=df_features,
                    feature_classes=feature_classes,
                    method='pearson',
                    projection_dimension=1
                )

                # Find clusters at different correlation thresholds; store (lower, upper) with each band for correct labeling
                st.session_state['feature_clusters'] = [
                    ((0.95, 1.0), dim_reduction.find_clusters(lower_bound=0.95, upper_bound=1.0)),
                    ((0.90, 0.95), dim_reduction.find_clusters(lower_bound=0.90, upper_bound=0.95)),
                    ((0.85, 0.90), dim_reduction.find_clusters(lower_bound=0.85, upper_bound=0.90)),
                    ((0.80, 0.85), dim_reduction.find_clusters(lower_bound=0.80, upper_bound=0.85)),
                    ((0.75, 0.80), dim_reduction.find_clusters(lower_bound=0.75, upper_bound=0.80)),
                    ((0.70, 0.75), dim_reduction.find_clusters(lower_bound=0.70, upper_bound=0.75)),
                ]
            except Exception as e:
                st.error(f"Error finding clusters: {str(e)}")
                return

    cluster_bands = st.session_state['feature_clusters']

    # Display clusters with band-bound labels so captions match the shown matrices
    for (lower, upper), cluster_list in cluster_bands:
        band_label = f"[{lower}, {upper}]"
        if cluster_list:
            st.markdown(f"**Cohorts with pairwise correlation in {band_label}**")
            for idx, cluster in enumerate(cluster_list):
                with st.expander(f"Cohort {idx + 1}: {len(cluster)} features (all pairs in {band_label})"):
                    for feature in cluster:
                        st.write(f" • {feature}")
                    if len(cluster) > 1:
                        st.markdown("**Pairwise correlations (values lie in " + band_label + "):**")
                        cluster_corr = corr_matrix.loc[cluster, cluster]
                        st.dataframe(cluster_corr, use_container_width=True)
                        # Sanity check: ensure displayed matrix matches the band
                        vals = cluster_corr.values
                        off_diag = vals[~np.eye(len(cluster), dtype=bool)]
                        if off_diag.size > 0:
                            in_range = np.sum((off_diag >= lower) & (off_diag <= upper)) == off_diag.size
                            if in_range:
                                st.caption(f"All off-diagonal values in {band_label}.")
                            else:
                                st.caption(f"Note: some values fall outside {band_label} (may include NaNs or rounding).")
        else:
            st.info(f"No cohorts found with pairwise correlation in {band_label}.")

    # Summary statistics
    st.subheader("4. Summary")
    total_clusters = sum(len(cluster_list) for (_, cluster_list) in cluster_bands)
    total_features_in_clusters = sum(
        len(cluster) for (_, cluster_list) in cluster_bands for cluster in cluster_list
    )

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Cohorts Found", total_clusters)
    with col2:
        st.metric("Features in Cohorts", total_features_in_clusters)

    st.divider()
    if st.button("Start over", key="correlations_start_over_bottom", use_container_width=True):
        _reset_correlations()
        st.rerun()
| 1037 |
+
|
| 1038 |
+
|
| 1039 |
+
def render_placeholder_tab():
    """Show a placeholder notice for tabs whose features are not built yet."""
    placeholder_message = "🚧 This feature is under development."
    st.info(placeholder_message)
| 1042 |
+
|
| 1043 |
+
|
| 1044 |
+
# ============================================================================
|
| 1045 |
+
# Main Application
|
| 1046 |
+
# ============================================================================
|
| 1047 |
+
|
| 1048 |
+
def main():
    """Main application entry point.

    Sets up the Streamlit page, validates configuration and AWS connectivity,
    then renders the header, sidebar, and the main content tabs.
    """
    # Initialize session state
    initialize_session_state()

    # Get dashboard configuration (falls back to an empty dict when the config
    # failed to load, so page setup below still works with defaults)
    dashboard_config = (st.session_state['app_config'] or {}).get('dashboard', {})

    # Set page config FIRST: st.set_page_config must be the first Streamlit
    # command of a script run.  Previously st.error/st.stop could execute
    # before it on the config-error path, raising StreamlitAPIException
    # instead of showing the error message.
    st.set_page_config(
        page_title=dashboard_config.get('page_title', 'OXON Technologies'),
        page_icon=dashboard_config.get('page_icon', ':mag:'),
        layout=dashboard_config.get('layout', 'wide')
    )

    # Load configuration
    if st.session_state['app_config'] is None:
        st.error(
            f"❌ Configuration Error: {st.session_state.get('config_error', 'Unknown error')}\n\n"
            "Please ensure `config.yaml` exists and is properly formatted."
        )
        st.stop()

    # Initialize AWS services
    if not initialize_aws_if_needed():
        if st.session_state['aws_error']:
            st.error(
                f"❌ AWS Initialization Error: {st.session_state['aws_error']}\n\n"
                "Please check your AWS credentials in `config.yaml`."
            )
            st.stop()

    # Custom sidebar styling
    sidebar_color = dashboard_config.get('sidebar_background_color', '#74b9ff')
    st.markdown(
        f"""
        <style>
        section[data-testid="stSidebar"] {{
            background-color: {sidebar_color};
        }}
        </style>
        """,
        unsafe_allow_html=True,
    )

    # Display header
    header_logo = dashboard_config.get('header_logo_path', 'images/analysis.png')
    header_title = dashboard_config.get('page_title', 'Analytical Dashboard')
    display_header(header_logo, header_title)

    # Display sidebar
    display_sidebar()

    # Main content tabs
    tabs = st.tabs(['Message Viewer', 'Correlations', 'To be Implemented'])

    with tabs[0]:
        render_message_viewer_tab()

    with tabs[1]:
        render_correlations_tab()

    with tabs[2]:
        render_placeholder_tab()


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/test_connection.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Quick test script to verify Athena connection and basic functionality.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from datalake.config import DataLakeConfig
|
| 6 |
+
from datalake.athena import AthenaQuery
|
| 7 |
+
from datalake.catalog import DataLakeCatalog
|
| 8 |
+
|
| 9 |
+
def main():
    """Test basic connection and functionality against the Athena data lake.

    AWS credentials are read from the AWS_ACCESS_KEY_ID and
    AWS_SECRET_ACCESS_KEY environment variables — they must never be
    hard-coded in source control.
    """
    import os

    print("Testing Athena Connection...")
    print("=" * 60)

    # SECURITY: the previous revision embedded a long-lived AWS access key and
    # secret directly in this file.  Pull credentials from the environment
    # instead; rotate any key that was ever committed.
    access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
    secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
    if not access_key_id or not secret_access_key:
        print("✗ AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables are not set.")
        return

    # Load config with explicit credentials
    config = DataLakeConfig.from_credentials(
        database_name="dbparquetdatalake05",
        workgroup="athenaworkgroup-datalake05",
        s3_output_location="s3://canedge-raw-data-parquet/athena-results/",
        region="eu-north-1",
        access_key_id=access_key_id,
        secret_access_key=secret_access_key,
    )

    print(f"✓ Configuration loaded")
    print(f"  Database: {config.database_name}")
    print(f"  Workgroup: {config.workgroup}")
    print(f"  Region: {config.region}")
    print(f"  S3 Output: {config.s3_output_location}")
    print()

    # Initialize Athena
    try:
        athena = AthenaQuery(config)
        print("✓ Athena client initialized")
    except Exception as e:
        print(f"✗ Failed to initialize Athena client: {e}")
        return

    # Test simple query
    try:
        print("Testing simple query...")
        test_query = f"SHOW TABLES IN {config.database_name}"
        df = athena.query_to_dataframe(test_query, timeout=60)
        print(f"✓ Query executed successfully")
        print(f"  Found {len(df)} tables")
        if not df.empty:
            print(f"  Sample tables: {list(df.iloc[:, 0])[:5]}")
    except Exception as e:
        print(f"✗ Query failed: {e}")
        import traceback
        traceback.print_exc()
        return

    # Test catalog
    try:
        print("\nTesting catalog...")
        catalog = DataLakeCatalog(athena, config)
        tables = catalog.list_tables()
        print(f"✓ Catalog initialized")
        print(f"  Total tables: {len(tables)}")

        if tables:
            devices = catalog.list_devices()
            print(f"  Devices found: {len(devices)}")
            if devices:
                print(f"  Sample devices: {devices[:3]}")
    except Exception as e:
        print(f"✗ Catalog test failed: {e}")
        import traceback
        traceback.print_exc()
        return

    print("\n" + "=" * 60)
    print("✓ All tests passed! Connection is working.")
    print("=" * 60)

if __name__ == "__main__":
    main()
src/utils/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .correlation import CorrelationMatrixGenerator
|
| 2 |
+
from .dimension_reduction import DimensionReduction
|
| 3 |
+
from .feature_class import DetectFeatureClasses
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
'CorrelationMatrixGenerator',
|
| 7 |
+
'DimensionReduction',
|
| 8 |
+
'DetectFeatureClasses'
|
| 9 |
+
]
|
src/utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (355 Bytes). View file
|
|
|
src/utils/__pycache__/correlation.cpython-310.pyc
ADDED
|
Binary file (7.66 kB). View file
|
|
|
src/utils/__pycache__/dimension_reduction.cpython-310.pyc
ADDED
|
Binary file (7.96 kB). View file
|
|
|
src/utils/__pycache__/feature_class.cpython-310.pyc
ADDED
|
Binary file (4.57 kB). View file
|
|
|
src/utils/correlation.py
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Correlation matrix generation module for mixed data types.
|
| 3 |
+
|
| 4 |
+
This module provides the CorrelationMatrixGenerator class which computes
|
| 5 |
+
correlation/association matrices for DataFrames containing mixed data types
|
| 6 |
+
(Continuous, Binary, Categorical). It automatically selects appropriate
|
| 7 |
+
correlation measures based on feature type pairs.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from scipy.stats import chi2_contingency, pointbiserialr
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CorrelationMatrixGenerator:
|
| 17 |
+
"""
|
| 18 |
+
A class to generate a correlation/association matrix for a pandas DataFrame,
|
| 19 |
+
handling different data types appropriately. It supports Continuous, Binary, and Categorical data types.
|
| 20 |
+
Parameters:
|
| 21 |
+
----------
|
| 22 |
+
df : pd.DataFrame
|
| 23 |
+
The input DataFrame containing features for correlation analysis.
|
| 24 |
+
feature_classes : dict
|
| 25 |
+
A dictionary mapping column names to their data types ('Continuous', 'Binary', 'Categorical').
|
| 26 |
+
continuous_vs_continuous_method : str, optional
|
| 27 |
+
Method to use for estimating the correlation coefficient of two continuous data types. Default is 'pearson'.
|
| 28 |
+
Methods:
|
| 29 |
+
-------
|
| 30 |
+
generate_matrix() -> pd.DataFrame
|
| 31 |
+
Generates and returns a symmetric correlation/association matrix for the DataFrame.
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
def __init__(self, df, feature_classes, continuous_vs_continuous_method='pearson'):
|
| 35 |
+
|
| 36 |
+
"""
|
| 37 |
+
Initialize with a DataFrame and a dictionary mapping column names to data types.
|
| 38 |
+
|
| 39 |
+
Parameters:
|
| 40 |
+
df : pandas.DataFrame
|
| 41 |
+
The DataFrame containing your data.
|
| 42 |
+
feature_classes : dict
|
| 43 |
+
A dictionary where keys are column names in df and values are their data types.
|
| 44 |
+
Valid types are 'Continuous', 'Binary', or 'Categorical'.
|
| 45 |
+
continuous_vs_continuous_method : str
|
| 46 |
+
Method to use for estimating the correlation coefficient of two continuous data
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
self.df = df
|
| 50 |
+
self.feature_classes = feature_classes
|
| 51 |
+
self.continuous_vs_continuous_method = continuous_vs_continuous_method
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@staticmethod
|
| 55 |
+
def recode_binary(series):
|
| 56 |
+
"""
|
| 57 |
+
Ensure a binary series is coded as 0 and 1.
|
| 58 |
+
|
| 59 |
+
If the series is already numeric with values {0,1}, it is returned as is.
|
| 60 |
+
Otherwise, it maps the two unique values to 0 and 1.
|
| 61 |
+
|
| 62 |
+
Parameters
|
| 63 |
+
----------
|
| 64 |
+
series : pd.Series
|
| 65 |
+
A binary series to recode.
|
| 66 |
+
|
| 67 |
+
Returns
|
| 68 |
+
-------
|
| 69 |
+
pd.Series
|
| 70 |
+
Binary series with values {0, 1}.
|
| 71 |
+
|
| 72 |
+
Raises
|
| 73 |
+
------
|
| 74 |
+
ValueError
|
| 75 |
+
If the series does not appear to be binary (has more than 2 unique values).
|
| 76 |
+
"""
|
| 77 |
+
# Check if already numeric and in {0, 1}
|
| 78 |
+
if pd.api.types.is_numeric_dtype(series):
|
| 79 |
+
unique_vals = series.dropna().unique()
|
| 80 |
+
if set(unique_vals) <= {0, 1}:
|
| 81 |
+
return series
|
| 82 |
+
# Map two unique values to {0, 1}
|
| 83 |
+
unique_vals = series.dropna().unique()
|
| 84 |
+
if len(unique_vals) == 2:
|
| 85 |
+
mapping = {unique_vals[0]: 0, unique_vals[1]: 1}
|
| 86 |
+
return series.map(mapping)
|
| 87 |
+
else:
|
| 88 |
+
raise ValueError("Series does not appear to be binary")
|
| 89 |
+
|
| 90 |
+
@staticmethod
|
| 91 |
+
def cramers_v(x, y):
|
| 92 |
+
"""
|
| 93 |
+
Calculate Cramér's V statistic for a categorical-categorical association.
|
| 94 |
+
|
| 95 |
+
Cramér's V is a measure of association between two nominal variables,
|
| 96 |
+
ranging from 0 (no association) to 1 (perfect association).
|
| 97 |
+
|
| 98 |
+
Parameters
|
| 99 |
+
----------
|
| 100 |
+
x, y : array-like
|
| 101 |
+
Two categorical variables.
|
| 102 |
+
|
| 103 |
+
Returns
|
| 104 |
+
-------
|
| 105 |
+
float
|
| 106 |
+
Cramér's V statistic, or np.nan if computation is not possible.
|
| 107 |
+
"""
|
| 108 |
+
contingency_table = pd.crosstab(x, y)
|
| 109 |
+
chi2 = chi2_contingency(contingency_table)[0]
|
| 110 |
+
n = contingency_table.values.sum()
|
| 111 |
+
min_dim = min(contingency_table.shape) - 1
|
| 112 |
+
if n == 0 or min_dim == 0:
|
| 113 |
+
return np.nan
|
| 114 |
+
return np.sqrt(chi2 / (n * min_dim))
|
| 115 |
+
|
| 116 |
+
@staticmethod
|
| 117 |
+
def anova_eta(categories, measurements):
|
| 118 |
+
"""
|
| 119 |
+
Compute the eta (η) as an effect size measure derived from one-way ANOVA.
|
| 120 |
+
It indicates the proportion of variance in the continuous variable (measurements)
|
| 121 |
+
explained by the categorical grouping (categories). Higher values indicate a stronger effect.
|
| 122 |
+
|
| 123 |
+
Parameters:
|
| 124 |
+
categories : array-like (categorical grouping)
|
| 125 |
+
measurements : array-like (continuous values)
|
| 126 |
+
|
| 127 |
+
Returns:
|
| 128 |
+
eta : float, between 0 and 1 representing the effect size.
|
| 129 |
+
"""
|
| 130 |
+
|
| 131 |
+
# Factorize the categorical variable
|
| 132 |
+
factors, _ = pd.factorize(categories)
|
| 133 |
+
categories_count = np.max(factors) + 1
|
| 134 |
+
overall_mean = np.mean(measurements)
|
| 135 |
+
ss_between = 0.0 # Sum of Squares
|
| 136 |
+
|
| 137 |
+
for i in range(categories_count):
|
| 138 |
+
group = measurements[factors == i]
|
| 139 |
+
n_i = len(group)
|
| 140 |
+
if n_i == 0:
|
| 141 |
+
continue
|
| 142 |
+
group_mean = np.mean(group)
|
| 143 |
+
ss_between += n_i * ((group_mean - overall_mean) ** 2)
|
| 144 |
+
|
| 145 |
+
ss_total = np.sum((measurements - overall_mean) ** 2)
|
| 146 |
+
|
| 147 |
+
if ss_total == 0:
|
| 148 |
+
return np.nan
|
| 149 |
+
|
| 150 |
+
eta = np.sqrt(ss_between / ss_total)
|
| 151 |
+
|
| 152 |
+
return eta
|
| 153 |
+
|
| 154 |
+
def compute_pairwise_correlation(self, series1, type1, series2, type2):
    """
    Compute the association between two series, dispatching on their types.

    Parameters
    ----------
    series1, series2 : pandas.Series
        The two features to compare.
    type1, type2 : str
        One of 'Continuous', 'Binary', 'Categorical' for each series.

    Returns
    -------
    float
        An association measure, or np.nan when the pair cannot be handled
        (unknown type combination, or a 'Binary' series that fails 0/1
        recoding).
    """
    kinds = {type1, type2}

    # --- Same-type pairs ---

    # Continuous vs Continuous: the configured correlation method.
    if kinds == {'Continuous'}:
        return series1.corr(series2, method=self.continuous_vs_continuous_method)

    # Binary vs Binary: phi coefficient == Pearson on 0/1-recoded series.
    if kinds == {'Binary'}:
        try:
            left = self.recode_binary(series1)
            right = self.recode_binary(series2)
        except Exception:
            return np.nan
        return left.corr(right, method='pearson')

    # Categorical vs Categorical, and Binary vs Categorical:
    # both treated as nominal -> Cramér's V.
    if kinds == {'Categorical'} or kinds == {'Binary', 'Categorical'}:
        return self.cramers_v(series1, series2)

    # --- Mixed-type pairs ---

    # Binary vs Continuous: point-biserial correlation coefficient.
    if kinds == {'Continuous', 'Binary'}:
        binary = series1 if type1 == 'Binary' else series2
        continuous = series2 if type1 == 'Binary' else series1
        try:
            binary = self.recode_binary(binary)
        except Exception:
            return np.nan
        corr, _ = pointbiserialr(binary, continuous)
        return corr

    # Categorical vs Continuous: ANOVA-based effect size (η),
    # with the categorical series always passed first.
    if kinds == {'Continuous', 'Categorical'}:
        if type1 == 'Categorical':
            return self.anova_eta(series1, series2)
        return self.anova_eta(series2, series1)

    # Unknown type combination.
    return np.nan
|
| 212 |
+
|
| 213 |
+
def generate_matrix(self):
    """
    Build a symmetric association matrix over all configured features.

    Every feature pair is scored with the measure appropriate to its type
    combination (see compute_pairwise_correlation); each value is written
    to both (i, j) and (j, i) so the result is symmetric, and the diagonal
    is fixed at 1.0.

    Returns
    -------
    pd.DataFrame
        Symmetric matrix indexed by feature name on both axes, rounded to
        4 decimal places.
    """
    names = list(self.feature_classes.keys())
    matrix = pd.DataFrame(index=names, columns=names, dtype=float)

    # tqdm wraps only the outer loop for a per-feature progress bar.
    for row_pos, row_name in tqdm(list(enumerate(names))):
        for col_pos, col_name in enumerate(names):
            if row_pos == col_pos:
                # A feature is perfectly associated with itself.
                matrix.loc[row_name, col_name] = 1.0
                continue
            if not pd.isna(matrix.loc[row_name, col_name]):
                # Already filled via the symmetric write below.
                continue
            value = self.compute_pairwise_correlation(
                self.df[row_name], self.feature_classes[row_name],
                self.df[col_name], self.feature_classes[col_name],
            )
            matrix.loc[row_name, col_name] = value
            matrix.loc[col_name, row_name] = value  # keep symmetry

    return matrix.round(4)
|
src/utils/dimension_reduction.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from itertools import chain
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.decomposition import PCA
|
| 6 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 7 |
+
|
| 8 |
+
from utils.correlation import CorrelationMatrixGenerator
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DimensionReduction:
    """
    Correlation-driven clustering of features with a STRICT pairwise constraint:
    every pair of features in a cluster must have correlation within
    [lower_bound, upper_bound].

    Clusters are found as (maximal) cliques in the graph where an edge connects
    two features iff their correlation lies in the requested band. Each cluster
    is then replaced in the working DataFrame by its PCA projection.
    """

    def __init__(self, dataframe, feature_classes, method="pearson", projection_dimension=1):
        """
        Parameters
        ----------
        dataframe : pd.DataFrame
            Input data; a copy is taken, the caller's frame is not mutated.
        feature_classes : dict
            Column name -> 'Continuous' / 'Binary' / 'Categorical', forwarded
            to CorrelationMatrixGenerator.
        method : str, optional
            Correlation method used for continuous-vs-continuous pairs.
        projection_dimension : int, optional
            Number of PCA components used to replace each cluster (>= 1).

        Raises
        ------
        TypeError
            If the generated correlation matrix is not a pandas DataFrame.
        ValueError
            If projection_dimension < 1.
        """
        self.dataframe = dataframe.copy()

        self.correlation_matrix = CorrelationMatrixGenerator(
            df=self.dataframe,
            feature_classes=feature_classes,
            continuous_vs_continuous_method=method
        ).generate_matrix()

        if not isinstance(self.correlation_matrix, pd.DataFrame):
            raise TypeError("CorrelationMatrixGenerator.generate_matrix() must return a pandas.DataFrame")

        if projection_dimension < 1:
            raise ValueError("projection_dimension must be >= 1")
        self.k = int(projection_dimension)

    # ---------------------------
    # Strict clique-based clustering
    # ---------------------------
    def _cluster_features(self, lower_bound, upper_bound):
        """
        Return DISJOINT clusters where every pair is within [lower_bound, upper_bound].
        Implemented via maximal cliques (Bron-Kerbosch with pivoting) followed by
        greedy disjoint selection.
        """
        if not (0 <= lower_bound <= upper_bound <= 1):
            raise ValueError("Bounds must satisfy 0 <= lower_bound <= upper_bound <= 1")

        cm = self.correlation_matrix

        # Use only features present in both the columns and the index.
        features = [c for c in cm.columns if c in cm.index]
        if not features:
            return []

        def in_band(x):
            return pd.notna(x) and (lower_bound <= x <= upper_bound)

        # Build adjacency sets: an edge means the pair's correlation is in band.
        adj = {f: set() for f in features}
        for f in features:
            row = cm.loc[f, features]
            for g, val in row.items():
                if g == f:
                    continue
                if in_band(val):
                    adj[f].add(g)

        # Bron-Kerbosch with pivot to enumerate maximal cliques (size >= 2).
        cliques = []

        def bron_kerbosch(R, P, X):
            if not P and not X:
                if len(R) >= 2:
                    cliques.append(set(R))
                return

            # P | X is non-empty here (the empty case returned above), so the
            # pivot is always well-defined; the original dead `else` branch
            # has been removed. Pick the vertex with the most neighbours in P
            # to reduce branching.
            u = max(P | X, key=lambda v: len(adj[v] & P))
            for v in list(P - adj[u]):
                bron_kerbosch(R | {v}, P & adj[v], X & adj[v])
                P.remove(v)
                X.add(v)

        bron_kerbosch(set(), set(features), set())

        if not cliques:
            return []

        # Score cliques: prefer larger, then higher average correlation,
        # with a deterministic name-based tie-break.
        def avg_corr(clique_set):
            cols = sorted(clique_set)
            sub = cm.loc[cols, cols].to_numpy(dtype=float)
            tri = sub[np.triu_indices_from(sub, k=1)]
            tri = tri[~np.isnan(tri)]
            return float(tri.mean()) if tri.size else -np.inf

        cliques_sorted = sorted(
            cliques,
            key=lambda c: (-len(c), -avg_corr(c), tuple(sorted(c)))
        )

        # Greedily produce DISJOINT clusters (otherwise PCA/drop would conflict).
        used = set()
        final_clusters = []
        for c in cliques_sorted:
            # A subset of a clique is still a clique, so the pairwise
            # constraint remains valid after removing already-used features.
            remaining = sorted(list(set(c) - used))
            if len(remaining) >= 2:
                final_clusters.append(remaining)
                used.update(remaining)

        return final_clusters

    @staticmethod
    def _solve_conflict(clusters_dictionary):
        """
        Safe conflict resolver across correlation bands: later keys win and
        their features are removed from earlier clusters (in place).
        Removing elements from a clique keeps it a clique, so the pairwise
        constraint is preserved.
        """
        keys = list(clusters_dictionary.keys())
        used = set()

        for key in reversed(keys):  # later bands win
            cleaned = []
            for cluster in clusters_dictionary[key]:
                remaining = [f for f in cluster if f not in used]
                if len(remaining) >= 2:
                    cleaned.append(remaining)
                    used.update(remaining)
            clusters_dictionary[key] = cleaned

        return clusters_dictionary

    def find_clusters(self, lower_bound, upper_bound):
        """Public wrapper around the strict clique-based clustering."""
        return self._cluster_features(lower_bound=lower_bound, upper_bound=upper_bound)

    # ---------------------------
    # PCA projection / replacement
    # ---------------------------
    def _assign_pca_components(self, cluster_index, comps, index):
        """
        Assign PCA components into group.* columns, supporting k==1 and k>1.
        """
        if comps.ndim != 2:
            raise ValueError("PCA output must be 2D")

        k_eff = comps.shape[1]
        if k_eff == 1:
            self.dataframe[f"group.{cluster_index}"] = pd.Series(comps[:, 0], index=index)
        else:
            for c in range(k_eff):
                self.dataframe[f"group.{cluster_index}.{c}"] = pd.Series(comps[:, c], index=index)

    def _project_cluster(self, cluster_index, cols, scale):
        """
        Replace one cluster of columns by its PCA projection and drop them.

        Extracted helper: reduce_dimension and reduce_dimension_by_grouping
        previously carried two duplicated copies of this logic.

        Returns True if the cluster was projected, False if it was skipped
        because fewer than two of its columns are still present.

        Raises
        ------
        TypeError
            If the cluster contains non-numeric columns.
        """
        # Guard: only keep columns still present in the frame.
        cols = [c for c in cols if c in self.dataframe.columns]
        if len(cols) < 2:
            return False

        subset = self.dataframe[cols]

        # PCA needs a numeric matrix; non-numerics must be encoded upstream.
        if not all(pd.api.types.is_numeric_dtype(subset[c]) for c in subset.columns):
            raise TypeError(
                f"Non-numeric columns found in cluster {cluster_index}: {cols}. "
                "Encode them before PCA or restrict clustering to numeric features."
            )

        X = subset.to_numpy()
        if scale:
            X = MinMaxScaler().fit_transform(X)

        pca = PCA(n_components=min(self.k, X.shape[1]))
        comps = pca.fit_transform(X)

        self._assign_pca_components(cluster_index, comps, index=subset.index)
        self.dataframe.drop(columns=cols, inplace=True)
        return True

    def reduce_dimension(self, lower_bound=0.95, upper_bound=1.0, scale=True):
        """
        Cluster features whose pairwise correlations lie in
        [lower_bound, upper_bound] and replace each cluster by its PCA
        projection.

        Returns
        -------
        pd.DataFrame
            The reduced working DataFrame.
        """
        clusters = self._cluster_features(lower_bound=lower_bound, upper_bound=upper_bound)
        for cluster_index, cols in enumerate(clusters):
            self._project_cluster(cluster_index, cols, scale)
        return self.dataframe

    def reduce_dimension_by_grouping(self, threshold=0.8, group_count=4, scale=True):
        """
        Split [threshold, 1.0] into group_count correlation bands, cluster
        within each band, resolve cross-band conflicts (later bands win), and
        replace each surviving cluster by its PCA projection.

        Returns
        -------
        tuple
            (reduced DataFrame, list of final clusters).
        """
        clusters = {}
        steps = np.round(np.linspace(threshold, 1.0, group_count + 1), 4)

        for i in range(len(steps) - 1):
            lb, ub = float(steps[i]), float(steps[i + 1])
            clusters[(lb, ub)] = self._cluster_features(lower_bound=lb, upper_bound=ub)

        clusters = self._solve_conflict(clusters_dictionary=clusters)
        final_clusters = list(chain(*clusters.values()))

        for cluster_index, cols in enumerate(final_clusters):
            self._project_cluster(cluster_index, cols, scale)

        return self.dataframe, final_clusters
|
src/utils/feature_class.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Feature classification module for detecting data types in DataFrames.
|
| 3 |
+
|
| 4 |
+
This module provides the DetectFeatureClasses class which automatically
|
| 5 |
+
classifies features as Binary, Categorical, or Continuous based on their
|
| 6 |
+
statistical properties.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DetectFeatureClasses:
    """
    Detect feature classes ('Binary', 'Categorical', 'Continuous') in a
    pandas DataFrame.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input DataFrame containing features to be classified.
    categorical_threshold : float, optional
        Relative tolerance used to decide whether a column's unique-value
        count is "close to" the row count (np.isclose rtol). Columns whose
        unique count is NOT close to the row count are treated as
        categorical. Default is 0.5.
    string_data_policy : str, optional
        How to handle columns that cannot be converted to float:
        'drop' reports them as excess columns, 'ignore' leaves them
        unclassified. Default is 'drop'.

    Methods
    -------
    feature_classes() -> (dict, list)
        Classify features and return (name -> class mapping, dropped names).
    """

    def __init__(self, dataframe, categorical_threshold=0.5, string_data_policy='drop'):
        """
        Initialize with the DataFrame and classification parameters.

        Parameters
        ----------
        dataframe : pd.DataFrame
            The input DataFrame containing features to be classified.
        categorical_threshold : float, optional
            Relative tolerance for the categorical test. Default is 0.5.
        string_data_policy : str, optional
            'drop' or 'ignore' for non-numeric, non-categorical columns.
            Default is 'drop'.
        """
        self.dataframe = dataframe
        self.categorical_threshold = categorical_threshold
        self.string_data_policy = string_data_policy

    def _binaries(self):
        """
        Identify binary features: columns with at most 2 unique values
        (NaN counts as a value here, matching pandas' unique()).

        Returns
        -------
        list
            Column names classified as binary.
        """
        return [column for column in self.dataframe.columns
                if len(self.dataframe[column].unique()) <= 2]

    def _categorical(self):
        """
        Identify categorical features: columns whose unique-value count is
        NOT close to the total row count (np.isclose with
        rtol=categorical_threshold).

        Returns
        -------
        list
            Column names classified as categorical.
        """
        categorical_columns = []
        n_rows = len(self.dataframe)
        for column in self.dataframe.columns:
            n_unique = len(self.dataframe[column].unique())
            # BUG FIX: np.isclose returns a numpy.bool_, and
            # `numpy.bool_(False) is False` is always False, so the original
            # `... is False` test never fired and NO column was ever
            # classified as categorical (string categoricals were then
            # silently dropped by feature_classes). Use `not ...` instead.
            if not np.isclose(n_unique, n_rows, rtol=self.categorical_threshold):
                categorical_columns.append(column)
        return categorical_columns

    def feature_classes(self):
        """
        Classify every column as 'Binary', 'Categorical', or 'Continuous'.

        Binary takes precedence over categorical; anything else is tried as
        float (note: the conversion mutates self.dataframe in place) and, on
        failure, handled according to string_data_policy.

        Returns
        -------
        dict
            Feature name -> class ('Binary', 'Categorical', 'Continuous').
        list
            Features dropped under the 'drop' string-data policy.
        """
        binary_columns = self._binaries()
        categorical_columns = self._categorical()
        features_class_types = {}
        excess_columns = []

        for feature in self.dataframe.columns:
            if feature in binary_columns:
                features_class_types[feature] = 'Binary'
            elif feature in categorical_columns:
                features_class_types[feature] = 'Categorical'
            else:
                # Try float conversion to decide 'Continuous'.
                try:
                    self.dataframe[feature] = self.dataframe[feature].astype(float)
                    features_class_types[feature] = 'Continuous'
                except ValueError:
                    if self.string_data_policy == 'drop':
                        excess_columns.append(feature)
                    # 'ignore' policy: leave the column as-is, unclassified.

        return features_class_types, excess_columns
|
| 119 |
+
|
src/workshop.ipynb
ADDED
|
@@ -0,0 +1,1448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"id": "3ebfe4e7",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"✓ Libraries imported successfully\n"
|
| 14 |
+
]
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"source": [
|
| 18 |
+
"# Import required libraries\n",
|
| 19 |
+
"import pandas as pd\n",
|
| 20 |
+
"import matplotlib.pyplot as plt\n",
|
| 21 |
+
"import seaborn as sns\n",
|
| 22 |
+
"from datalake.config import DataLakeConfig\n",
|
| 23 |
+
"from datalake.athena import AthenaQuery\n",
|
| 24 |
+
"from datalake.catalog import DataLakeCatalog\n",
|
| 25 |
+
"from datalake.query import DataLakeQuery\n",
|
| 26 |
+
"from datalake.batch import BatchProcessor\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"# Set up plotting\n",
|
| 29 |
+
"%matplotlib inline\n",
|
| 30 |
+
"plt.style.use('seaborn-v0_8')\n",
|
| 31 |
+
"sns.set_palette(\"husl\")\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"print(\"✓ Libraries imported successfully\")"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": 3,
|
| 39 |
+
"id": "f03eaae2",
|
| 40 |
+
"metadata": {},
|
| 41 |
+
"outputs": [
|
| 42 |
+
{
|
| 43 |
+
"name": "stdout",
|
| 44 |
+
"output_type": "stream",
|
| 45 |
+
"text": [
|
| 46 |
+
"✓ Configuration loaded\n",
|
| 47 |
+
" Database: dbparquetdatalake05\n",
|
| 48 |
+
" Workgroup: athenaworkgroup-datalake05\n",
|
| 49 |
+
" Region: eu-north-1\n"
|
| 50 |
+
]
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"source": [
|
| 54 |
+
"# Configure connection with your credentials\n",
|
| 55 |
+
"config = DataLakeConfig.from_credentials(\n",
|
| 56 |
+
" database_name=\"dbparquetdatalake05\",\n",
|
| 57 |
+
" workgroup=\"athenaworkgroup-datalake05\",\n",
|
| 58 |
+
" s3_output_location=\"s3://canedge-raw-data-parquet/athena-results/\",\n",
|
| 59 |
+
" region=\"eu-north-1\",\n",
|
| 60 |
+
" access_key_id=\"AKIARJQJFFVASPMSGNNY\",\n",
|
| 61 |
+
" secret_access_key=\"Z6ISPZJvvcv13JZKYyuUxiMRZvDrvfoWs4YTUBnh\",\n",
|
| 62 |
+
")\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"print(f\"✓ Configuration loaded\")\n",
|
| 65 |
+
"print(f\" Database: {config.database_name}\")\n",
|
| 66 |
+
"print(f\" Workgroup: {config.workgroup}\")\n",
|
| 67 |
+
"print(f\" Region: {config.region}\")"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"cell_type": "code",
|
| 72 |
+
"execution_count": 4,
|
| 73 |
+
"id": "9e8ceaf6",
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"outputs": [
|
| 76 |
+
{
|
| 77 |
+
"name": "stderr",
|
| 78 |
+
"output_type": "stream",
|
| 79 |
+
"text": [
|
| 80 |
+
"2026-01-26 23:23:13,728 - datalake.athena - INFO - Initialized Athena client for database: dbparquetdatalake05\n",
|
| 81 |
+
"2026-01-26 23:23:13,729 - datalake.catalog - INFO - Initialized catalog for database: dbparquetdatalake05\n",
|
| 82 |
+
"2026-01-26 23:23:13,729 - datalake.query - INFO - Initialized DataLakeQuery\n",
|
| 83 |
+
"2026-01-26 23:23:13,730 - datalake.batch - INFO - Initialized BatchProcessor\n"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "stdout",
|
| 88 |
+
"output_type": "stream",
|
| 89 |
+
"text": [
|
| 90 |
+
"✓ Athena client and catalog initialized\n"
|
| 91 |
+
]
|
| 92 |
+
}
|
| 93 |
+
],
|
| 94 |
+
"source": [
|
| 95 |
+
"# Initialize Athena and catalog\n",
|
| 96 |
+
"athena = AthenaQuery(config)\n",
|
| 97 |
+
"catalog = DataLakeCatalog(athena, config)\n",
|
| 98 |
+
"query = DataLakeQuery(athena, catalog)\n",
|
| 99 |
+
"processor = BatchProcessor(query)\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"print(\"✓ Athena client and catalog initialized\")"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"cell_type": "code",
|
| 106 |
+
"execution_count": 5,
|
| 107 |
+
"id": "0e3d813f",
|
| 108 |
+
"metadata": {},
|
| 109 |
+
"outputs": [
|
| 110 |
+
{
|
| 111 |
+
"name": "stderr",
|
| 112 |
+
"output_type": "stream",
|
| 113 |
+
"text": [
|
| 114 |
+
"2026-01-26 23:23:14,057 - datalake.athena - INFO - Query started with execution ID: beffdb49-e31a-48bf-8dbf-8c06ae7960cc\n",
|
| 115 |
+
"2026-01-26 23:23:15,190 - datalake.athena - INFO - Query beffdb49-e31a-48bf-8dbf-8c06ae7960cc completed successfully\n",
|
| 116 |
+
"2026-01-26 23:23:15,490 - datalake.athena - INFO - Retrieved 77 rows from S3 for query beffdb49-e31a-48bf-8dbf-8c06ae7960cc\n"
|
| 117 |
+
]
|
| 118 |
+
}
|
| 119 |
+
],
|
| 120 |
+
"source": [
|
| 121 |
+
"test_query = f\"SHOW TABLES IN {config.database_name}\"\n",
|
| 122 |
+
"df_tables = athena.query_to_dataframe(test_query, timeout=60)"
|
| 123 |
+
]
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"cell_type": "code",
|
| 127 |
+
"execution_count": 6,
|
| 128 |
+
"id": "fca55b3b",
|
| 129 |
+
"metadata": {},
|
| 130 |
+
"outputs": [
|
| 131 |
+
{
|
| 132 |
+
"name": "stderr",
|
| 133 |
+
"output_type": "stream",
|
| 134 |
+
"text": [
|
| 135 |
+
"2026-01-26 23:23:15,601 - datalake.athena - INFO - Query started with execution ID: bd81d8c7-2371-431b-a6ed-0208bc4b4f1c\n",
|
| 136 |
+
"2026-01-26 23:23:16,798 - datalake.athena - INFO - Query bd81d8c7-2371-431b-a6ed-0208bc4b4f1c completed successfully\n",
|
| 137 |
+
"2026-01-26 23:23:16,920 - datalake.athena - INFO - Retrieved 78 rows from S3 for query bd81d8c7-2371-431b-a6ed-0208bc4b4f1c\n",
|
| 138 |
+
"2026-01-26 23:23:16,921 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 139 |
+
"2026-01-26 23:23:16,922 - datalake.catalog - INFO - Found 3 device(s)\n"
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"name": "stdout",
|
| 144 |
+
"output_type": "stream",
|
| 145 |
+
"text": [
|
| 146 |
+
"Found 3 device(s):\n",
|
| 147 |
+
" - 97a4aaf4\n",
|
| 148 |
+
" - b8280fd1\n",
|
| 149 |
+
" - f1da612a\n"
|
| 150 |
+
]
|
| 151 |
+
}
|
| 152 |
+
],
|
| 153 |
+
"source": [
|
| 154 |
+
"# Discover devices\n",
|
| 155 |
+
"devices = catalog.list_devices()\n",
|
| 156 |
+
"print(f\"Found {len(devices)} device(s):\")\n",
|
| 157 |
+
"for device in devices:\n",
|
| 158 |
+
" print(f\" - {device}\")"
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"cell_type": "code",
|
| 163 |
+
"execution_count": 13,
|
| 164 |
+
"id": "103ddb07",
|
| 165 |
+
"metadata": {},
|
| 166 |
+
"outputs": [
|
| 167 |
+
{
|
| 168 |
+
"data": {
|
| 169 |
+
"text/plain": [
|
| 170 |
+
"{1: 'out of memory',\n",
|
| 171 |
+
" 2: 'syntax error',\n",
|
| 172 |
+
" 3: 'no element found',\n",
|
| 173 |
+
" 4: 'not well-formed (invalid token)',\n",
|
| 174 |
+
" 5: 'unclosed token',\n",
|
| 175 |
+
" 6: 'partial character',\n",
|
| 176 |
+
" 7: 'mismatched tag',\n",
|
| 177 |
+
" 8: 'duplicate attribute',\n",
|
| 178 |
+
" 9: 'junk after document element',\n",
|
| 179 |
+
" 10: 'illegal parameter entity reference',\n",
|
| 180 |
+
" 11: 'undefined entity',\n",
|
| 181 |
+
" 12: 'recursive entity reference',\n",
|
| 182 |
+
" 13: 'asynchronous entity',\n",
|
| 183 |
+
" 14: 'reference to invalid character number',\n",
|
| 184 |
+
" 15: 'reference to binary entity',\n",
|
| 185 |
+
" 16: 'reference to external entity in attribute',\n",
|
| 186 |
+
" 17: 'XML or text declaration not at start of entity',\n",
|
| 187 |
+
" 18: 'unknown encoding',\n",
|
| 188 |
+
" 19: 'encoding specified in XML declaration is incorrect',\n",
|
| 189 |
+
" 20: 'unclosed CDATA section',\n",
|
| 190 |
+
" 21: 'error in processing external entity reference',\n",
|
| 191 |
+
" 22: 'document is not standalone',\n",
|
| 192 |
+
" 23: 'unexpected parser state - please send a bug report',\n",
|
| 193 |
+
" 24: 'entity declared in parameter entity',\n",
|
| 194 |
+
" 25: 'requested feature requires XML_DTD support in Expat',\n",
|
| 195 |
+
" 26: 'cannot change setting once parsing has begun',\n",
|
| 196 |
+
" 27: 'unbound prefix',\n",
|
| 197 |
+
" 28: 'must not undeclare prefix',\n",
|
| 198 |
+
" 29: 'incomplete markup in parameter entity',\n",
|
| 199 |
+
" 30: 'XML declaration not well-formed',\n",
|
| 200 |
+
" 31: 'text declaration not well-formed',\n",
|
| 201 |
+
" 32: 'illegal character(s) in public id',\n",
|
| 202 |
+
" 33: 'parser suspended',\n",
|
| 203 |
+
" 34: 'parser not suspended',\n",
|
| 204 |
+
" 35: 'parsing aborted',\n",
|
| 205 |
+
" 36: 'parsing finished',\n",
|
| 206 |
+
" 37: 'cannot suspend in external parameter entity'}"
|
| 207 |
+
]
|
| 208 |
+
},
|
| 209 |
+
"execution_count": 13,
|
| 210 |
+
"metadata": {},
|
| 211 |
+
"output_type": "execute_result"
|
| 212 |
+
}
|
| 213 |
+
],
|
| 214 |
+
"source": [
|
| 215 |
+
"messages"
|
| 216 |
+
]
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"cell_type": "code",
|
| 220 |
+
"execution_count": 19,
|
| 221 |
+
"id": "fbc4938b",
|
| 222 |
+
"metadata": {},
|
| 223 |
+
"outputs": [
|
| 224 |
+
{
|
| 225 |
+
"name": "stdout",
|
| 226 |
+
"output_type": "stream",
|
| 227 |
+
"text": [
|
| 228 |
+
"['0100', '0103', '0104', '0105', '0106', '0107', '010c', '010d', '010e', '010f', '0110', '0111', '011f', '012e', '012f', '0133', '0134', '0135', '0143', '0144', '0149', '0155', '0156', '015c']\n"
|
| 229 |
+
]
|
| 230 |
+
}
|
| 231 |
+
],
|
| 232 |
+
"source": [
|
| 233 |
+
"import re\n",
|
| 234 |
+
"\n",
|
| 235 |
+
"pattern = re.compile(r\"s(?P<s>\\d{2})pid.*m(?P<m>[0-9a-fA-F]{2})$\")\n",
|
| 236 |
+
"\n",
|
| 237 |
+
"strings = [\n",
|
| 238 |
+
" \"can1_obd2_s_m41_s01pid_m00\",\n",
|
| 239 |
+
" \"can1_obd2_s_m41_s01pid_m03\",\n",
|
| 240 |
+
" \"can1_obd2_s_m41_s01pid_m04\",\n",
|
| 241 |
+
" \"can1_obd2_s_m41_s01pid_m05\",\n",
|
| 242 |
+
" \"can1_obd2_s_m41_s01pid_m06\",\n",
|
| 243 |
+
" \"can1_obd2_s_m41_s01pid_m07\",\n",
|
| 244 |
+
" \"can1_obd2_s_m41_s01pid_m0c\",\n",
|
| 245 |
+
"]\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"out = []\n",
|
| 248 |
+
"for x in messages:\n",
|
| 249 |
+
" if x.startswith('can1') is False:\n",
|
| 250 |
+
" continue\n",
|
| 251 |
+
" m = pattern.search(x)\n",
|
| 252 |
+
" out.append((m.group(\"s\") + m.group(\"m\")))\n",
|
| 253 |
+
"\n",
|
| 254 |
+
"print(out)\n",
|
| 255 |
+
"# [('01', '00'), ('01', '03'), ('01', '04'), ('01', '05'), ('01', '06'), ('01', '07'), ('01', '0c')]\n"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"cell_type": "code",
|
| 260 |
+
"execution_count": 7,
|
| 261 |
+
"id": "41a79e1e",
|
| 262 |
+
"metadata": {},
|
| 263 |
+
"outputs": [
|
| 264 |
+
{
|
| 265 |
+
"data": {
|
| 266 |
+
"text/plain": [
|
| 267 |
+
"['97a4aaf4', 'b8280fd1', 'f1da612a']"
|
| 268 |
+
]
|
| 269 |
+
},
|
| 270 |
+
"execution_count": 7,
|
| 271 |
+
"metadata": {},
|
| 272 |
+
"output_type": "execute_result"
|
| 273 |
+
}
|
| 274 |
+
],
|
| 275 |
+
"source": [
|
| 276 |
+
"devices"
|
| 277 |
+
]
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"cell_type": "code",
|
| 281 |
+
"execution_count": 16,
|
| 282 |
+
"id": "ffe04714",
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"outputs": [
|
| 285 |
+
{
|
| 286 |
+
"name": "stdout",
|
| 287 |
+
"output_type": "stream",
|
| 288 |
+
"text": [
|
| 289 |
+
"\n",
|
| 290 |
+
"Exploring device: 97a4aaf4\n",
|
| 291 |
+
"============================================================\n"
|
| 292 |
+
]
|
| 293 |
+
},
|
| 294 |
+
{
|
| 295 |
+
"name": "stderr",
|
| 296 |
+
"output_type": "stream",
|
| 297 |
+
"text": [
|
| 298 |
+
"2026-01-26 23:34:44,692 - datalake.athena - INFO - Query started with execution ID: 442a7cfd-68ed-46eb-98ee-b964a3e3cb6d\n",
|
| 299 |
+
"2026-01-26 23:34:45,808 - datalake.athena - INFO - Query 442a7cfd-68ed-46eb-98ee-b964a3e3cb6d completed successfully\n",
|
| 300 |
+
"2026-01-26 23:34:46,146 - datalake.athena - INFO - Retrieved 78 rows from S3 for query 442a7cfd-68ed-46eb-98ee-b964a3e3cb6d\n",
|
| 301 |
+
"2026-01-26 23:34:46,146 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 302 |
+
"2026-01-26 23:34:46,146 - datalake.catalog - INFO - Found 35 messages for device 97a4aaf4\n"
|
| 303 |
+
]
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"name": "stdout",
|
| 307 |
+
"output_type": "stream",
|
| 308 |
+
"text": [
|
| 309 |
+
"Found 35 message(s):\n",
|
| 310 |
+
" - can1_obd2_s_m41_s01pid_m00\n",
|
| 311 |
+
" - can1_obd2_s_m41_s01pid_m03\n",
|
| 312 |
+
" - can1_obd2_s_m41_s01pid_m04\n",
|
| 313 |
+
" - can1_obd2_s_m41_s01pid_m05\n",
|
| 314 |
+
" - can1_obd2_s_m41_s01pid_m06\n",
|
| 315 |
+
" - can1_obd2_s_m41_s01pid_m07\n",
|
| 316 |
+
" - can1_obd2_s_m41_s01pid_m0c\n",
|
| 317 |
+
" - can1_obd2_s_m41_s01pid_m0d\n",
|
| 318 |
+
" - can1_obd2_s_m41_s01pid_m0e\n",
|
| 319 |
+
" - can1_obd2_s_m41_s01pid_m0f\n",
|
| 320 |
+
" - can1_obd2_s_m41_s01pid_m10\n",
|
| 321 |
+
" - can1_obd2_s_m41_s01pid_m11\n",
|
| 322 |
+
" - can1_obd2_s_m41_s01pid_m1f\n",
|
| 323 |
+
" - can1_obd2_s_m41_s01pid_m2e\n",
|
| 324 |
+
" - can1_obd2_s_m41_s01pid_m2f\n",
|
| 325 |
+
" - can1_obd2_s_m41_s01pid_m33\n",
|
| 326 |
+
" - can1_obd2_s_m41_s01pid_m34\n",
|
| 327 |
+
" - can1_obd2_s_m41_s01pid_m35\n",
|
| 328 |
+
" - can1_obd2_s_m41_s01pid_m43\n",
|
| 329 |
+
" - can1_obd2_s_m41_s01pid_m44\n",
|
| 330 |
+
" - can1_obd2_s_m41_s01pid_m49\n",
|
| 331 |
+
" - can1_obd2_s_m41_s01pid_m55\n",
|
| 332 |
+
" - can1_obd2_s_m41_s01pid_m56\n",
|
| 333 |
+
" - can1_obd2_s_m41_s01pid_m5c\n",
|
| 334 |
+
" - can9_gnssaltitude\n",
|
| 335 |
+
" - can9_gnssdistance\n",
|
| 336 |
+
" - can9_gnsspos\n",
|
| 337 |
+
" - can9_gnssspeed\n",
|
| 338 |
+
" - can9_gnssstatus\n",
|
| 339 |
+
" - can9_gnsstime\n",
|
| 340 |
+
" - can9_heartbeat\n",
|
| 341 |
+
" - can9_imudata\n",
|
| 342 |
+
" - can9_timecalendar\n",
|
| 343 |
+
" - can9_timeexternal\n",
|
| 344 |
+
" - messages\n"
|
| 345 |
+
]
|
| 346 |
+
}
|
| 347 |
+
],
|
| 348 |
+
"source": [
|
| 349 |
+
"if devices:\n",
|
| 350 |
+
" device_id = devices[0]\n",
|
| 351 |
+
" print(f\"\\nExploring device: {device_id}\")\n",
|
| 352 |
+
" print(\"=\" * 60)\n",
|
| 353 |
+
" \n",
|
| 354 |
+
" messages = catalog.list_messages(device_id)\n",
|
| 355 |
+
" print(f\"Found {len(messages)} message(s):\")\n",
|
| 356 |
+
" for message in messages:\n",
|
| 357 |
+
" print(f\" - {message}\")"
|
| 358 |
+
]
|
| 359 |
+
},
|
| 360 |
+
{
|
| 361 |
+
"cell_type": "code",
|
| 362 |
+
"execution_count": 7,
|
| 363 |
+
"id": "a7bae557",
|
| 364 |
+
"metadata": {},
|
| 365 |
+
"outputs": [
|
| 366 |
+
{
|
| 367 |
+
"name": "stderr",
|
| 368 |
+
"output_type": "stream",
|
| 369 |
+
"text": [
|
| 370 |
+
"2026-01-26 15:14:27,675 - datalake.athena - INFO - Query started with execution ID: 62096cc6-be14-49f4-ae61-80efc006dbc2\n"
|
| 371 |
+
]
|
| 372 |
+
},
|
| 373 |
+
{
|
| 374 |
+
"name": "stdout",
|
| 375 |
+
"output_type": "stream",
|
| 376 |
+
"text": [
|
| 377 |
+
"\n",
|
| 378 |
+
"Schema for 97a4aaf4/can1_obd2_s_m41_s01pid_m00:\n",
|
| 379 |
+
"============================================================\n"
|
| 380 |
+
]
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"name": "stderr",
|
| 384 |
+
"output_type": "stream",
|
| 385 |
+
"text": [
|
| 386 |
+
"2026-01-26 15:14:28,793 - datalake.athena - INFO - Query 62096cc6-be14-49f4-ae61-80efc006dbc2 completed successfully\n",
|
| 387 |
+
"2026-01-26 15:14:28,947 - datalake.athena - INFO - Retrieved 78 rows from S3 for query 62096cc6-be14-49f4-ae61-80efc006dbc2\n",
|
| 388 |
+
"2026-01-26 15:14:28,948 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 389 |
+
"2026-01-26 15:14:29,052 - datalake.athena - INFO - Query started with execution ID: 93497778-7bd4-4976-bf95-7009ab18b6df\n",
|
| 390 |
+
"2026-01-26 15:14:30,176 - datalake.athena - INFO - Query 93497778-7bd4-4976-bf95-7009ab18b6df completed successfully\n",
|
| 391 |
+
"2026-01-26 15:14:30,296 - datalake.athena - INFO - Retrieved 3 rows from S3 for query 93497778-7bd4-4976-bf95-7009ab18b6df\n",
|
| 392 |
+
"2026-01-26 15:14:30,297 - datalake.catalog - INFO - Schema for 97a4aaf4/can1_obd2_s_m41_s01pid_m00: 3 columns\n"
|
| 393 |
+
]
|
| 394 |
+
},
|
| 395 |
+
{
|
| 396 |
+
"name": "stdout",
|
| 397 |
+
"output_type": "stream",
|
| 398 |
+
"text": [
|
| 399 |
+
" Column Type\n",
|
| 400 |
+
" t timestamp(3)\n",
|
| 401 |
+
"s01pid00_pidssupported_01_20 double\n",
|
| 402 |
+
" date_created varchar\n",
|
| 403 |
+
"\n",
|
| 404 |
+
"Total columns: 3\n"
|
| 405 |
+
]
|
| 406 |
+
}
|
| 407 |
+
],
|
| 408 |
+
"source": [
|
| 409 |
+
"# Get schema for first device/message combination\n",
|
| 410 |
+
"if devices and messages:\n",
|
| 411 |
+
" device_id = devices[0]\n",
|
| 412 |
+
" message = messages[0]\n",
|
| 413 |
+
" \n",
|
| 414 |
+
" print(f\"\\nSchema for {device_id}/{message}:\")\n",
|
| 415 |
+
" print(\"=\" * 60)\n",
|
| 416 |
+
" \n",
|
| 417 |
+
" schema = catalog.get_schema(device_id, message)\n",
|
| 418 |
+
" if schema:\n",
|
| 419 |
+
" schema_df = pd.DataFrame([\n",
|
| 420 |
+
" {\"Column\": col, \"Type\": dtype}\n",
|
| 421 |
+
" for col, dtype in schema.items()\n",
|
| 422 |
+
" ])\n",
|
| 423 |
+
" print(schema_df.to_string(index=False))\n",
|
| 424 |
+
" print(f\"\\nTotal columns: {len(schema)}\")"
|
| 425 |
+
]
|
| 426 |
+
},
|
| 427 |
+
{
|
| 428 |
+
"cell_type": "code",
|
| 429 |
+
"execution_count": 8,
|
| 430 |
+
"id": "f3b16b2d",
|
| 431 |
+
"metadata": {},
|
| 432 |
+
"outputs": [
|
| 433 |
+
{
|
| 434 |
+
"name": "stderr",
|
| 435 |
+
"output_type": "stream",
|
| 436 |
+
"text": [
|
| 437 |
+
"2026-01-26 15:14:30,406 - datalake.athena - INFO - Query started with execution ID: 4fb3b506-612a-458e-a0e7-709b60a9f91e\n"
|
| 438 |
+
]
|
| 439 |
+
},
|
| 440 |
+
{
|
| 441 |
+
"name": "stdout",
|
| 442 |
+
"output_type": "stream",
|
| 443 |
+
"text": [
|
| 444 |
+
"\n",
|
| 445 |
+
"Partitions (dates) for 97a4aaf4/can1_obd2_s_m41_s01pid_m00:\n",
|
| 446 |
+
"============================================================\n"
|
| 447 |
+
]
|
| 448 |
+
},
|
| 449 |
+
{
|
| 450 |
+
"name": "stderr",
|
| 451 |
+
"output_type": "stream",
|
| 452 |
+
"text": [
|
| 453 |
+
"2026-01-26 15:14:31,530 - datalake.athena - INFO - Query 4fb3b506-612a-458e-a0e7-709b60a9f91e completed successfully\n",
|
| 454 |
+
"2026-01-26 15:14:31,648 - datalake.athena - INFO - Retrieved 78 rows from S3 for query 4fb3b506-612a-458e-a0e7-709b60a9f91e\n",
|
| 455 |
+
"2026-01-26 15:14:31,649 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 456 |
+
"2026-01-26 15:14:31,755 - datalake.athena - INFO - Query started with execution ID: 4ce87717-db17-4fa5-b33d-ef63fb4a89fe\n",
|
| 457 |
+
"2026-01-26 15:14:36,039 - datalake.athena - INFO - Query 4ce87717-db17-4fa5-b33d-ef63fb4a89fe completed successfully\n",
|
| 458 |
+
"2026-01-26 15:14:36,162 - datalake.athena - INFO - Retrieved 13 rows from S3 for query 4ce87717-db17-4fa5-b33d-ef63fb4a89fe\n",
|
| 459 |
+
"2026-01-26 15:14:36,164 - datalake.catalog - INFO - Found 13 partitions for tbl_97a4aaf4_can1_obd2_s_m41_s01pid_m00\n"
|
| 460 |
+
]
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"name": "stdout",
|
| 464 |
+
"output_type": "stream",
|
| 465 |
+
"text": [
|
| 466 |
+
"Found 13 partition(s):\n",
|
| 467 |
+
" Date range: 2025-10-21 to 2025-11-11\n",
|
| 468 |
+
"\n",
|
| 469 |
+
" All dates:\n",
|
| 470 |
+
" - 2025-10-21\n",
|
| 471 |
+
" - 2025-10-27\n",
|
| 472 |
+
" - 2025-10-28\n",
|
| 473 |
+
" - 2025-10-29\n",
|
| 474 |
+
" - 2025-10-30\n",
|
| 475 |
+
" - 2025-10-31\n",
|
| 476 |
+
" - 2025-11-03\n",
|
| 477 |
+
" - 2025-11-04\n",
|
| 478 |
+
" - 2025-11-05\n",
|
| 479 |
+
" - 2025-11-06\n",
|
| 480 |
+
" - 2025-11-07\n",
|
| 481 |
+
" - 2025-11-10\n",
|
| 482 |
+
" - 2025-11-11\n"
|
| 483 |
+
]
|
| 484 |
+
}
|
| 485 |
+
],
|
| 486 |
+
"source": [
|
| 487 |
+
"# Check available partitions (dates)\n",
|
| 488 |
+
"if devices and messages:\n",
|
| 489 |
+
" device_id = devices[0]\n",
|
| 490 |
+
" message = messages[0]\n",
|
| 491 |
+
" \n",
|
| 492 |
+
" print(f\"\\nPartitions (dates) for {device_id}/{message}:\")\n",
|
| 493 |
+
" print(\"=\" * 60)\n",
|
| 494 |
+
" \n",
|
| 495 |
+
" try:\n",
|
| 496 |
+
" partitions = catalog.list_partitions(device_id, message)\n",
|
| 497 |
+
" if partitions:\n",
|
| 498 |
+
" print(f\"Found {len(partitions)} partition(s):\")\n",
|
| 499 |
+
" print(f\" Date range: {partitions[0]} to {partitions[-1]}\")\n",
|
| 500 |
+
" print(f\"\\n All dates:\")\n",
|
| 501 |
+
" for date in partitions[:20]: # Show first 20\n",
|
| 502 |
+
" print(f\" - {date}\")\n",
|
| 503 |
+
" if len(partitions) > 20:\n",
|
| 504 |
+
" print(f\" ... and {len(partitions) - 20} more\")\n",
|
| 505 |
+
" else:\n",
|
| 506 |
+
" print(\"No partitions found (table may not be partitioned)\")\n",
|
| 507 |
+
" except Exception as e:\n",
|
| 508 |
+
" print(f\"Could not list partitions: {e}\")"
|
| 509 |
+
]
|
| 510 |
+
},
|
| 511 |
+
{
|
| 512 |
+
"cell_type": "code",
|
| 513 |
+
"execution_count": 9,
|
| 514 |
+
"id": "66579956",
|
| 515 |
+
"metadata": {},
|
| 516 |
+
"outputs": [
|
| 517 |
+
{
|
| 518 |
+
"data": {
|
| 519 |
+
"text/plain": [
|
| 520 |
+
"'97a4aaf4'"
|
| 521 |
+
]
|
| 522 |
+
},
|
| 523 |
+
"execution_count": 9,
|
| 524 |
+
"metadata": {},
|
| 525 |
+
"output_type": "execute_result"
|
| 526 |
+
}
|
| 527 |
+
],
|
| 528 |
+
"source": [
|
| 529 |
+
"device_id"
|
| 530 |
+
]
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"cell_type": "code",
|
| 534 |
+
"execution_count": 10,
|
| 535 |
+
"id": "3411bcdb",
|
| 536 |
+
"metadata": {},
|
| 537 |
+
"outputs": [
|
| 538 |
+
{
|
| 539 |
+
"name": "stdout",
|
| 540 |
+
"output_type": "stream",
|
| 541 |
+
"text": [
|
| 542 |
+
"['can1_obd2_s_m41_s01pid_m00', 'can1_obd2_s_m41_s01pid_m03', 'can1_obd2_s_m41_s01pid_m04', 'can1_obd2_s_m41_s01pid_m05', 'can1_obd2_s_m41_s01pid_m06', 'can1_obd2_s_m41_s01pid_m07', 'can1_obd2_s_m41_s01pid_m0c', 'can1_obd2_s_m41_s01pid_m0d', 'can1_obd2_s_m41_s01pid_m0e', 'can1_obd2_s_m41_s01pid_m0f', 'can1_obd2_s_m41_s01pid_m10', 'can1_obd2_s_m41_s01pid_m11', 'can1_obd2_s_m41_s01pid_m1f', 'can1_obd2_s_m41_s01pid_m2e', 'can1_obd2_s_m41_s01pid_m2f', 'can1_obd2_s_m41_s01pid_m33', 'can1_obd2_s_m41_s01pid_m34', 'can1_obd2_s_m41_s01pid_m35', 'can1_obd2_s_m41_s01pid_m43', 'can1_obd2_s_m41_s01pid_m44', 'can1_obd2_s_m41_s01pid_m49', 'can1_obd2_s_m41_s01pid_m55', 'can1_obd2_s_m41_s01pid_m56', 'can1_obd2_s_m41_s01pid_m5c', 'can9_gnssaltitude', 'can9_gnssdistance', 'can9_gnsspos', 'can9_gnssspeed', 'can9_gnssstatus', 'can9_gnsstime', 'can9_heartbeat', 'can9_imudata', 'can9_timecalendar', 'can9_timeexternal', 'messages']\n"
|
| 543 |
+
]
|
| 544 |
+
}
|
| 545 |
+
],
|
| 546 |
+
"source": [
|
| 547 |
+
"print(messages)"
|
| 548 |
+
]
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"cell_type": "code",
|
| 552 |
+
"execution_count": 20,
|
| 553 |
+
"id": "b98df0e7",
|
| 554 |
+
"metadata": {},
|
| 555 |
+
"outputs": [
|
| 556 |
+
{
|
| 557 |
+
"name": "stdout",
|
| 558 |
+
"output_type": "stream",
|
| 559 |
+
"text": [
|
| 560 |
+
"Reading sample data from 97a4aaf4/can1_obd2_s_m41_s01pid_m49...\n",
|
| 561 |
+
"============================================================\n"
|
| 562 |
+
]
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"name": "stderr",
|
| 566 |
+
"output_type": "stream",
|
| 567 |
+
"text": [
|
| 568 |
+
"2026-01-26 23:48:28,200 - datalake.athena - INFO - Query started with execution ID: 2501a646-a908-47a8-95df-3870e5696e62\n",
|
| 569 |
+
"2026-01-26 23:48:29,314 - datalake.athena - INFO - Query 2501a646-a908-47a8-95df-3870e5696e62 completed successfully\n",
|
| 570 |
+
"2026-01-26 23:48:29,604 - datalake.athena - INFO - Retrieved 78 rows from S3 for query 2501a646-a908-47a8-95df-3870e5696e62\n",
|
| 571 |
+
"2026-01-26 23:48:29,604 - datalake.catalog - INFO - Found 78 tables in database\n",
|
| 572 |
+
"2026-01-26 23:48:29,604 - datalake.query - INFO - Executing query for 97a4aaf4/can1_obd2_s_m41_s01pid_m49\n",
|
| 573 |
+
"2026-01-26 23:48:29,718 - datalake.athena - INFO - Query started with execution ID: b9f04fc6-6408-4054-8d9a-bf77c0bcf28d\n",
|
| 574 |
+
"2026-01-26 23:48:35,706 - datalake.athena - INFO - Query b9f04fc6-6408-4054-8d9a-bf77c0bcf28d completed successfully\n",
|
| 575 |
+
"2026-01-26 23:48:41,916 - datalake.athena - INFO - Retrieved 652001 rows from S3 for query b9f04fc6-6408-4054-8d9a-bf77c0bcf28d\n"
|
| 576 |
+
]
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"name": "stdout",
|
| 580 |
+
"output_type": "stream",
|
| 581 |
+
"text": [
|
| 582 |
+
"✓ Loaded 652001 records\n",
|
| 583 |
+
"\n",
|
| 584 |
+
"Data shape: (652001, 3)\n",
|
| 585 |
+
"\n",
|
| 586 |
+
"Columns: ['t', 's01pid49_absthrottleposd', 'date_created']\n",
|
| 587 |
+
"\n",
|
| 588 |
+
"First few rows:\n"
|
| 589 |
+
]
|
| 590 |
+
},
|
| 591 |
+
{
|
| 592 |
+
"data": {
|
| 593 |
+
"text/html": [
|
| 594 |
+
"<div>\n",
|
| 595 |
+
"<style scoped>\n",
|
| 596 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 597 |
+
" vertical-align: middle;\n",
|
| 598 |
+
" }\n",
|
| 599 |
+
"\n",
|
| 600 |
+
" .dataframe tbody tr th {\n",
|
| 601 |
+
" vertical-align: top;\n",
|
| 602 |
+
" }\n",
|
| 603 |
+
"\n",
|
| 604 |
+
" .dataframe thead th {\n",
|
| 605 |
+
" text-align: right;\n",
|
| 606 |
+
" }\n",
|
| 607 |
+
"</style>\n",
|
| 608 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 609 |
+
" <thead>\n",
|
| 610 |
+
" <tr style=\"text-align: right;\">\n",
|
| 611 |
+
" <th></th>\n",
|
| 612 |
+
" <th>t</th>\n",
|
| 613 |
+
" <th>s01pid49_absthrottleposd</th>\n",
|
| 614 |
+
" <th>date_created</th>\n",
|
| 615 |
+
" </tr>\n",
|
| 616 |
+
" </thead>\n",
|
| 617 |
+
" <tbody>\n",
|
| 618 |
+
" <tr>\n",
|
| 619 |
+
" <th>0</th>\n",
|
| 620 |
+
" <td>2026-01-02 03:04:02.441</td>\n",
|
| 621 |
+
" <td>15.686275</td>\n",
|
| 622 |
+
" <td>2026/01/02</td>\n",
|
| 623 |
+
" </tr>\n",
|
| 624 |
+
" <tr>\n",
|
| 625 |
+
" <th>1</th>\n",
|
| 626 |
+
" <td>2025-12-03 03:10:02.217</td>\n",
|
| 627 |
+
" <td>15.686275</td>\n",
|
| 628 |
+
" <td>2025/12/03</td>\n",
|
| 629 |
+
" </tr>\n",
|
| 630 |
+
" <tr>\n",
|
| 631 |
+
" <th>2</th>\n",
|
| 632 |
+
" <td>2025-12-31 03:00:00.162</td>\n",
|
| 633 |
+
" <td>15.686275</td>\n",
|
| 634 |
+
" <td>2025/12/31</td>\n",
|
| 635 |
+
" </tr>\n",
|
| 636 |
+
" <tr>\n",
|
| 637 |
+
" <th>3</th>\n",
|
| 638 |
+
" <td>2025-12-19 04:00:00.157</td>\n",
|
| 639 |
+
" <td>30.980392</td>\n",
|
| 640 |
+
" <td>2025/12/19</td>\n",
|
| 641 |
+
" </tr>\n",
|
| 642 |
+
" <tr>\n",
|
| 643 |
+
" <th>4</th>\n",
|
| 644 |
+
" <td>2025-12-22 04:00:00.661</td>\n",
|
| 645 |
+
" <td>15.686275</td>\n",
|
| 646 |
+
" <td>2025/12/22</td>\n",
|
| 647 |
+
" </tr>\n",
|
| 648 |
+
" <tr>\n",
|
| 649 |
+
" <th>5</th>\n",
|
| 650 |
+
" <td>2026-01-13 06:00:00.339</td>\n",
|
| 651 |
+
" <td>15.686275</td>\n",
|
| 652 |
+
" <td>2026/01/13</td>\n",
|
| 653 |
+
" </tr>\n",
|
| 654 |
+
" <tr>\n",
|
| 655 |
+
" <th>6</th>\n",
|
| 656 |
+
" <td>2025-12-19 07:00:00.010</td>\n",
|
| 657 |
+
" <td>38.823529</td>\n",
|
| 658 |
+
" <td>2025/12/19</td>\n",
|
| 659 |
+
" </tr>\n",
|
| 660 |
+
" <tr>\n",
|
| 661 |
+
" <th>7</th>\n",
|
| 662 |
+
" <td>2025-12-19 04:00:01.156</td>\n",
|
| 663 |
+
" <td>33.333333</td>\n",
|
| 664 |
+
" <td>2025/12/19</td>\n",
|
| 665 |
+
" </tr>\n",
|
| 666 |
+
" <tr>\n",
|
| 667 |
+
" <th>8</th>\n",
|
| 668 |
+
" <td>2025-12-19 04:00:02.157</td>\n",
|
| 669 |
+
" <td>35.294118</td>\n",
|
| 670 |
+
" <td>2025/12/19</td>\n",
|
| 671 |
+
" </tr>\n",
|
| 672 |
+
" <tr>\n",
|
| 673 |
+
" <th>9</th>\n",
|
| 674 |
+
" <td>2025-12-19 07:00:01.009</td>\n",
|
| 675 |
+
" <td>34.901961</td>\n",
|
| 676 |
+
" <td>2025/12/19</td>\n",
|
| 677 |
+
" </tr>\n",
|
| 678 |
+
" </tbody>\n",
|
| 679 |
+
"</table>\n",
|
| 680 |
+
"</div>"
|
| 681 |
+
],
|
| 682 |
+
"text/plain": [
|
| 683 |
+
" t s01pid49_absthrottleposd date_created\n",
|
| 684 |
+
"0 2026-01-02 03:04:02.441 15.686275 2026/01/02\n",
|
| 685 |
+
"1 2025-12-03 03:10:02.217 15.686275 2025/12/03\n",
|
| 686 |
+
"2 2025-12-31 03:00:00.162 15.686275 2025/12/31\n",
|
| 687 |
+
"3 2025-12-19 04:00:00.157 30.980392 2025/12/19\n",
|
| 688 |
+
"4 2025-12-22 04:00:00.661 15.686275 2025/12/22\n",
|
| 689 |
+
"5 2026-01-13 06:00:00.339 15.686275 2026/01/13\n",
|
| 690 |
+
"6 2025-12-19 07:00:00.010 38.823529 2025/12/19\n",
|
| 691 |
+
"7 2025-12-19 04:00:01.156 33.333333 2025/12/19\n",
|
| 692 |
+
"8 2025-12-19 04:00:02.157 35.294118 2025/12/19\n",
|
| 693 |
+
"9 2025-12-19 07:00:01.009 34.901961 2025/12/19"
|
| 694 |
+
]
|
| 695 |
+
},
|
| 696 |
+
"metadata": {},
|
| 697 |
+
"output_type": "display_data"
|
| 698 |
+
},
|
| 699 |
+
{
|
| 700 |
+
"name": "stdout",
|
| 701 |
+
"output_type": "stream",
|
| 702 |
+
"text": [
|
| 703 |
+
"\n",
|
| 704 |
+
"Data types:\n",
|
| 705 |
+
"t object\n",
|
| 706 |
+
"s01pid49_absthrottleposd float64\n",
|
| 707 |
+
"date_created object\n",
|
| 708 |
+
"dtype: object\n",
|
| 709 |
+
"\n",
|
| 710 |
+
"Basic statistics:\n"
|
| 711 |
+
]
|
| 712 |
+
},
|
| 713 |
+
{
|
| 714 |
+
"data": {
|
| 715 |
+
"text/html": [
|
| 716 |
+
"<div>\n",
|
| 717 |
+
"<style scoped>\n",
|
| 718 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 719 |
+
" vertical-align: middle;\n",
|
| 720 |
+
" }\n",
|
| 721 |
+
"\n",
|
| 722 |
+
" .dataframe tbody tr th {\n",
|
| 723 |
+
" vertical-align: top;\n",
|
| 724 |
+
" }\n",
|
| 725 |
+
"\n",
|
| 726 |
+
" .dataframe thead th {\n",
|
| 727 |
+
" text-align: right;\n",
|
| 728 |
+
" }\n",
|
| 729 |
+
"</style>\n",
|
| 730 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 731 |
+
" <thead>\n",
|
| 732 |
+
" <tr style=\"text-align: right;\">\n",
|
| 733 |
+
" <th></th>\n",
|
| 734 |
+
" <th>s01pid49_absthrottleposd</th>\n",
|
| 735 |
+
" </tr>\n",
|
| 736 |
+
" </thead>\n",
|
| 737 |
+
" <tbody>\n",
|
| 738 |
+
" <tr>\n",
|
| 739 |
+
" <th>count</th>\n",
|
| 740 |
+
" <td>652001.000000</td>\n",
|
| 741 |
+
" </tr>\n",
|
| 742 |
+
" <tr>\n",
|
| 743 |
+
" <th>mean</th>\n",
|
| 744 |
+
" <td>21.921143</td>\n",
|
| 745 |
+
" </tr>\n",
|
| 746 |
+
" <tr>\n",
|
| 747 |
+
" <th>std</th>\n",
|
| 748 |
+
" <td>8.487119</td>\n",
|
| 749 |
+
" </tr>\n",
|
| 750 |
+
" <tr>\n",
|
| 751 |
+
" <th>min</th>\n",
|
| 752 |
+
" <td>15.686275</td>\n",
|
| 753 |
+
" </tr>\n",
|
| 754 |
+
" <tr>\n",
|
| 755 |
+
" <th>25%</th>\n",
|
| 756 |
+
" <td>15.686275</td>\n",
|
| 757 |
+
" </tr>\n",
|
| 758 |
+
" <tr>\n",
|
| 759 |
+
" <th>50%</th>\n",
|
| 760 |
+
" <td>15.686275</td>\n",
|
| 761 |
+
" </tr>\n",
|
| 762 |
+
" <tr>\n",
|
| 763 |
+
" <th>75%</th>\n",
|
| 764 |
+
" <td>29.019608</td>\n",
|
| 765 |
+
" </tr>\n",
|
| 766 |
+
" <tr>\n",
|
| 767 |
+
" <th>max</th>\n",
|
| 768 |
+
" <td>58.431373</td>\n",
|
| 769 |
+
" </tr>\n",
|
| 770 |
+
" </tbody>\n",
|
| 771 |
+
"</table>\n",
|
| 772 |
+
"</div>"
|
| 773 |
+
],
|
| 774 |
+
"text/plain": [
|
| 775 |
+
" s01pid49_absthrottleposd\n",
|
| 776 |
+
"count 652001.000000\n",
|
| 777 |
+
"mean 21.921143\n",
|
| 778 |
+
"std 8.487119\n",
|
| 779 |
+
"min 15.686275\n",
|
| 780 |
+
"25% 15.686275\n",
|
| 781 |
+
"50% 15.686275\n",
|
| 782 |
+
"75% 29.019608\n",
|
| 783 |
+
"max 58.431373"
|
| 784 |
+
]
|
| 785 |
+
},
|
| 786 |
+
"metadata": {},
|
| 787 |
+
"output_type": "display_data"
|
| 788 |
+
}
|
| 789 |
+
],
|
| 790 |
+
"source": [
|
| 791 |
+
"# Read a sample of data\n",
|
| 792 |
+
"if devices and messages:\n",
|
| 793 |
+
" device_id = devices[0]\n",
|
| 794 |
+
" # message = messages[0]\n",
|
| 795 |
+
" message = 'can1_obd2_s_m41_s01pid_m49'\n",
|
| 796 |
+
" \n",
|
| 797 |
+
" print(f\"Reading sample data from {device_id}/{message}...\")\n",
|
| 798 |
+
" print(\"=\" * 60)\n",
|
| 799 |
+
" \n",
|
| 800 |
+
" try:\n",
|
| 801 |
+
" df = query.read_device_message(\n",
|
| 802 |
+
" device_id=device_id,\n",
|
| 803 |
+
" message=message,\n",
|
| 804 |
+
" )\n",
|
| 805 |
+
" \n",
|
| 806 |
+
" print(f\"✓ Loaded {len(df)} records\")\n",
|
| 807 |
+
" print(f\"\\nData shape: {df.shape}\")\n",
|
| 808 |
+
" print(f\"\\nColumns: {list(df.columns)}\")\n",
|
| 809 |
+
" print(f\"\\nFirst few rows:\")\n",
|
| 810 |
+
" display(df.head(10))\n",
|
| 811 |
+
" \n",
|
| 812 |
+
" print(f\"\\nData types:\")\n",
|
| 813 |
+
" print(df.dtypes)\n",
|
| 814 |
+
" \n",
|
| 815 |
+
" print(f\"\\nBasic statistics:\")\n",
|
| 816 |
+
" display(df.describe())\n",
|
| 817 |
+
" \n",
|
| 818 |
+
" except Exception as e:\n",
|
| 819 |
+
" print(f\"✗ Error reading data: {e}\")\n",
|
| 820 |
+
" import traceback\n",
|
| 821 |
+
" traceback.print_exc()"
|
| 822 |
+
]
|
| 823 |
+
},
|
| 824 |
+
{
|
| 825 |
+
"cell_type": "code",
|
| 826 |
+
"execution_count": 21,
|
| 827 |
+
"id": "31396a98",
|
| 828 |
+
"metadata": {},
|
| 829 |
+
"outputs": [],
|
| 830 |
+
"source": [
|
| 831 |
+
"df['t'] = pd.to_datetime(df['t']) "
|
| 832 |
+
]
|
| 833 |
+
},
|
| 834 |
+
{
|
| 835 |
+
"cell_type": "code",
|
| 836 |
+
"execution_count": 25,
|
| 837 |
+
"id": "8fa88ee6",
|
| 838 |
+
"metadata": {},
|
| 839 |
+
"outputs": [
|
| 840 |
+
{
|
| 841 |
+
"data": {
|
| 842 |
+
"text/html": [
|
| 843 |
+
"<div>\n",
|
| 844 |
+
"<style scoped>\n",
|
| 845 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 846 |
+
" vertical-align: middle;\n",
|
| 847 |
+
" }\n",
|
| 848 |
+
"\n",
|
| 849 |
+
" .dataframe tbody tr th {\n",
|
| 850 |
+
" vertical-align: top;\n",
|
| 851 |
+
" }\n",
|
| 852 |
+
"\n",
|
| 853 |
+
" .dataframe thead th {\n",
|
| 854 |
+
" text-align: right;\n",
|
| 855 |
+
" }\n",
|
| 856 |
+
"</style>\n",
|
| 857 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 858 |
+
" <thead>\n",
|
| 859 |
+
" <tr style=\"text-align: right;\">\n",
|
| 860 |
+
" <th></th>\n",
|
| 861 |
+
" <th>t</th>\n",
|
| 862 |
+
" <th>s01pid49_absthrottleposd</th>\n",
|
| 863 |
+
" </tr>\n",
|
| 864 |
+
" </thead>\n",
|
| 865 |
+
" <tbody>\n",
|
| 866 |
+
" <tr>\n",
|
| 867 |
+
" <th>0</th>\n",
|
| 868 |
+
" <td>2025-11-22</td>\n",
|
| 869 |
+
" <td>15.709290</td>\n",
|
| 870 |
+
" </tr>\n",
|
| 871 |
+
" <tr>\n",
|
| 872 |
+
" <th>1</th>\n",
|
| 873 |
+
" <td>2025-11-23</td>\n",
|
| 874 |
+
" <td>NaN</td>\n",
|
| 875 |
+
" </tr>\n",
|
| 876 |
+
" <tr>\n",
|
| 877 |
+
" <th>2</th>\n",
|
| 878 |
+
" <td>2025-11-24</td>\n",
|
| 879 |
+
" <td>21.347658</td>\n",
|
| 880 |
+
" </tr>\n",
|
| 881 |
+
" <tr>\n",
|
| 882 |
+
" <th>3</th>\n",
|
| 883 |
+
" <td>2025-11-25</td>\n",
|
| 884 |
+
" <td>22.176305</td>\n",
|
| 885 |
+
" </tr>\n",
|
| 886 |
+
" <tr>\n",
|
| 887 |
+
" <th>4</th>\n",
|
| 888 |
+
" <td>2025-11-26</td>\n",
|
| 889 |
+
" <td>22.074130</td>\n",
|
| 890 |
+
" </tr>\n",
|
| 891 |
+
" <tr>\n",
|
| 892 |
+
" <th>5</th>\n",
|
| 893 |
+
" <td>2025-11-27</td>\n",
|
| 894 |
+
" <td>22.379063</td>\n",
|
| 895 |
+
" </tr>\n",
|
| 896 |
+
" <tr>\n",
|
| 897 |
+
" <th>6</th>\n",
|
| 898 |
+
" <td>2025-11-28</td>\n",
|
| 899 |
+
" <td>22.611687</td>\n",
|
| 900 |
+
" </tr>\n",
|
| 901 |
+
" <tr>\n",
|
| 902 |
+
" <th>7</th>\n",
|
| 903 |
+
" <td>2025-11-29</td>\n",
|
| 904 |
+
" <td>NaN</td>\n",
|
| 905 |
+
" </tr>\n",
|
| 906 |
+
" <tr>\n",
|
| 907 |
+
" <th>8</th>\n",
|
| 908 |
+
" <td>2025-11-30</td>\n",
|
| 909 |
+
" <td>NaN</td>\n",
|
| 910 |
+
" </tr>\n",
|
| 911 |
+
" <tr>\n",
|
| 912 |
+
" <th>9</th>\n",
|
| 913 |
+
" <td>2025-12-01</td>\n",
|
| 914 |
+
" <td>NaN</td>\n",
|
| 915 |
+
" </tr>\n",
|
| 916 |
+
" <tr>\n",
|
| 917 |
+
" <th>10</th>\n",
|
| 918 |
+
" <td>2025-12-02</td>\n",
|
| 919 |
+
" <td>NaN</td>\n",
|
| 920 |
+
" </tr>\n",
|
| 921 |
+
" <tr>\n",
|
| 922 |
+
" <th>11</th>\n",
|
| 923 |
+
" <td>2025-12-03</td>\n",
|
| 924 |
+
" <td>22.212069</td>\n",
|
| 925 |
+
" </tr>\n",
|
| 926 |
+
" <tr>\n",
|
| 927 |
+
" <th>12</th>\n",
|
| 928 |
+
" <td>2025-12-04</td>\n",
|
| 929 |
+
" <td>21.593356</td>\n",
|
| 930 |
+
" </tr>\n",
|
| 931 |
+
" <tr>\n",
|
| 932 |
+
" <th>13</th>\n",
|
| 933 |
+
" <td>2025-12-05</td>\n",
|
| 934 |
+
" <td>22.048014</td>\n",
|
| 935 |
+
" </tr>\n",
|
| 936 |
+
" <tr>\n",
|
| 937 |
+
" <th>14</th>\n",
|
| 938 |
+
" <td>2025-12-06</td>\n",
|
| 939 |
+
" <td>NaN</td>\n",
|
| 940 |
+
" </tr>\n",
|
| 941 |
+
" <tr>\n",
|
| 942 |
+
" <th>15</th>\n",
|
| 943 |
+
" <td>2025-12-07</td>\n",
|
| 944 |
+
" <td>NaN</td>\n",
|
| 945 |
+
" </tr>\n",
|
| 946 |
+
" <tr>\n",
|
| 947 |
+
" <th>16</th>\n",
|
| 948 |
+
" <td>2025-12-08</td>\n",
|
| 949 |
+
" <td>21.288014</td>\n",
|
| 950 |
+
" </tr>\n",
|
| 951 |
+
" <tr>\n",
|
| 952 |
+
" <th>17</th>\n",
|
| 953 |
+
" <td>2025-12-09</td>\n",
|
| 954 |
+
" <td>22.105263</td>\n",
|
| 955 |
+
" </tr>\n",
|
| 956 |
+
" <tr>\n",
|
| 957 |
+
" <th>18</th>\n",
|
| 958 |
+
" <td>2025-12-10</td>\n",
|
| 959 |
+
" <td>22.144666</td>\n",
|
| 960 |
+
" </tr>\n",
|
| 961 |
+
" <tr>\n",
|
| 962 |
+
" <th>19</th>\n",
|
| 963 |
+
" <td>2025-12-11</td>\n",
|
| 964 |
+
" <td>21.774071</td>\n",
|
| 965 |
+
" </tr>\n",
|
| 966 |
+
" <tr>\n",
|
| 967 |
+
" <th>20</th>\n",
|
| 968 |
+
" <td>2025-12-12</td>\n",
|
| 969 |
+
" <td>21.957367</td>\n",
|
| 970 |
+
" </tr>\n",
|
| 971 |
+
" <tr>\n",
|
| 972 |
+
" <th>21</th>\n",
|
| 973 |
+
" <td>2025-12-13</td>\n",
|
| 974 |
+
" <td>NaN</td>\n",
|
| 975 |
+
" </tr>\n",
|
| 976 |
+
" <tr>\n",
|
| 977 |
+
" <th>22</th>\n",
|
| 978 |
+
" <td>2025-12-14</td>\n",
|
| 979 |
+
" <td>NaN</td>\n",
|
| 980 |
+
" </tr>\n",
|
| 981 |
+
" <tr>\n",
|
| 982 |
+
" <th>23</th>\n",
|
| 983 |
+
" <td>2025-12-15</td>\n",
|
| 984 |
+
" <td>20.411036</td>\n",
|
| 985 |
+
" </tr>\n",
|
| 986 |
+
" <tr>\n",
|
| 987 |
+
" <th>24</th>\n",
|
| 988 |
+
" <td>2025-12-16</td>\n",
|
| 989 |
+
" <td>21.394285</td>\n",
|
| 990 |
+
" </tr>\n",
|
| 991 |
+
" <tr>\n",
|
| 992 |
+
" <th>25</th>\n",
|
| 993 |
+
" <td>2025-12-17</td>\n",
|
| 994 |
+
" <td>21.644342</td>\n",
|
| 995 |
+
" </tr>\n",
|
| 996 |
+
" <tr>\n",
|
| 997 |
+
" <th>26</th>\n",
|
| 998 |
+
" <td>2025-12-18</td>\n",
|
| 999 |
+
" <td>22.130631</td>\n",
|
| 1000 |
+
" </tr>\n",
|
| 1001 |
+
" <tr>\n",
|
| 1002 |
+
" <th>27</th>\n",
|
| 1003 |
+
" <td>2025-12-19</td>\n",
|
| 1004 |
+
" <td>21.194253</td>\n",
|
| 1005 |
+
" </tr>\n",
|
| 1006 |
+
" <tr>\n",
|
| 1007 |
+
" <th>28</th>\n",
|
| 1008 |
+
" <td>2025-12-20</td>\n",
|
| 1009 |
+
" <td>NaN</td>\n",
|
| 1010 |
+
" </tr>\n",
|
| 1011 |
+
" <tr>\n",
|
| 1012 |
+
" <th>29</th>\n",
|
| 1013 |
+
" <td>2025-12-21</td>\n",
|
| 1014 |
+
" <td>NaN</td>\n",
|
| 1015 |
+
" </tr>\n",
|
| 1016 |
+
" <tr>\n",
|
| 1017 |
+
" <th>30</th>\n",
|
| 1018 |
+
" <td>2025-12-22</td>\n",
|
| 1019 |
+
" <td>21.804700</td>\n",
|
| 1020 |
+
" </tr>\n",
|
| 1021 |
+
" <tr>\n",
|
| 1022 |
+
" <th>31</th>\n",
|
| 1023 |
+
" <td>2025-12-23</td>\n",
|
| 1024 |
+
" <td>21.961360</td>\n",
|
| 1025 |
+
" </tr>\n",
|
| 1026 |
+
" <tr>\n",
|
| 1027 |
+
" <th>32</th>\n",
|
| 1028 |
+
" <td>2025-12-24</td>\n",
|
| 1029 |
+
" <td>22.252882</td>\n",
|
| 1030 |
+
" </tr>\n",
|
| 1031 |
+
" <tr>\n",
|
| 1032 |
+
" <th>33</th>\n",
|
| 1033 |
+
" <td>2025-12-25</td>\n",
|
| 1034 |
+
" <td>21.916508</td>\n",
|
| 1035 |
+
" </tr>\n",
|
| 1036 |
+
" <tr>\n",
|
| 1037 |
+
" <th>34</th>\n",
|
| 1038 |
+
" <td>2025-12-26</td>\n",
|
| 1039 |
+
" <td>22.494252</td>\n",
|
| 1040 |
+
" </tr>\n",
|
| 1041 |
+
" <tr>\n",
|
| 1042 |
+
" <th>35</th>\n",
|
| 1043 |
+
" <td>2025-12-27</td>\n",
|
| 1044 |
+
" <td>NaN</td>\n",
|
| 1045 |
+
" </tr>\n",
|
| 1046 |
+
" <tr>\n",
|
| 1047 |
+
" <th>36</th>\n",
|
| 1048 |
+
" <td>2025-12-28</td>\n",
|
| 1049 |
+
" <td>NaN</td>\n",
|
| 1050 |
+
" </tr>\n",
|
| 1051 |
+
" <tr>\n",
|
| 1052 |
+
" <th>37</th>\n",
|
| 1053 |
+
" <td>2025-12-29</td>\n",
|
| 1054 |
+
" <td>21.873543</td>\n",
|
| 1055 |
+
" </tr>\n",
|
| 1056 |
+
" <tr>\n",
|
| 1057 |
+
" <th>38</th>\n",
|
| 1058 |
+
" <td>2025-12-30</td>\n",
|
| 1059 |
+
" <td>21.890226</td>\n",
|
| 1060 |
+
" </tr>\n",
|
| 1061 |
+
" <tr>\n",
|
| 1062 |
+
" <th>39</th>\n",
|
| 1063 |
+
" <td>2025-12-31</td>\n",
|
| 1064 |
+
" <td>22.529185</td>\n",
|
| 1065 |
+
" </tr>\n",
|
| 1066 |
+
" <tr>\n",
|
| 1067 |
+
" <th>40</th>\n",
|
| 1068 |
+
" <td>2026-01-01</td>\n",
|
| 1069 |
+
" <td>NaN</td>\n",
|
| 1070 |
+
" </tr>\n",
|
| 1071 |
+
" <tr>\n",
|
| 1072 |
+
" <th>41</th>\n",
|
| 1073 |
+
" <td>2026-01-02</td>\n",
|
| 1074 |
+
" <td>22.761742</td>\n",
|
| 1075 |
+
" </tr>\n",
|
| 1076 |
+
" <tr>\n",
|
| 1077 |
+
" <th>42</th>\n",
|
| 1078 |
+
" <td>2026-01-03</td>\n",
|
| 1079 |
+
" <td>NaN</td>\n",
|
| 1080 |
+
" </tr>\n",
|
| 1081 |
+
" <tr>\n",
|
| 1082 |
+
" <th>43</th>\n",
|
| 1083 |
+
" <td>2026-01-04</td>\n",
|
| 1084 |
+
" <td>NaN</td>\n",
|
| 1085 |
+
" </tr>\n",
|
| 1086 |
+
" <tr>\n",
|
| 1087 |
+
" <th>44</th>\n",
|
| 1088 |
+
" <td>2026-01-05</td>\n",
|
| 1089 |
+
" <td>22.315963</td>\n",
|
| 1090 |
+
" </tr>\n",
|
| 1091 |
+
" <tr>\n",
|
| 1092 |
+
" <th>45</th>\n",
|
| 1093 |
+
" <td>2026-01-06</td>\n",
|
| 1094 |
+
" <td>22.110849</td>\n",
|
| 1095 |
+
" </tr>\n",
|
| 1096 |
+
" <tr>\n",
|
| 1097 |
+
" <th>46</th>\n",
|
| 1098 |
+
" <td>2026-01-07</td>\n",
|
| 1099 |
+
" <td>21.613014</td>\n",
|
| 1100 |
+
" </tr>\n",
|
| 1101 |
+
" <tr>\n",
|
| 1102 |
+
" <th>47</th>\n",
|
| 1103 |
+
" <td>2026-01-08</td>\n",
|
| 1104 |
+
" <td>21.953064</td>\n",
|
| 1105 |
+
" </tr>\n",
|
| 1106 |
+
" <tr>\n",
|
| 1107 |
+
" <th>48</th>\n",
|
| 1108 |
+
" <td>2026-01-09</td>\n",
|
| 1109 |
+
" <td>21.585424</td>\n",
|
| 1110 |
+
" </tr>\n",
|
| 1111 |
+
" <tr>\n",
|
| 1112 |
+
" <th>49</th>\n",
|
| 1113 |
+
" <td>2026-01-10</td>\n",
|
| 1114 |
+
" <td>NaN</td>\n",
|
| 1115 |
+
" </tr>\n",
|
| 1116 |
+
" <tr>\n",
|
| 1117 |
+
" <th>50</th>\n",
|
| 1118 |
+
" <td>2026-01-11</td>\n",
|
| 1119 |
+
" <td>NaN</td>\n",
|
| 1120 |
+
" </tr>\n",
|
| 1121 |
+
" <tr>\n",
|
| 1122 |
+
" <th>51</th>\n",
|
| 1123 |
+
" <td>2026-01-12</td>\n",
|
| 1124 |
+
" <td>22.092380</td>\n",
|
| 1125 |
+
" </tr>\n",
|
| 1126 |
+
" <tr>\n",
|
| 1127 |
+
" <th>52</th>\n",
|
| 1128 |
+
" <td>2026-01-13</td>\n",
|
| 1129 |
+
" <td>22.664499</td>\n",
|
| 1130 |
+
" </tr>\n",
|
| 1131 |
+
" <tr>\n",
|
| 1132 |
+
" <th>53</th>\n",
|
| 1133 |
+
" <td>2026-01-14</td>\n",
|
| 1134 |
+
" <td>22.124919</td>\n",
|
| 1135 |
+
" </tr>\n",
|
| 1136 |
+
" <tr>\n",
|
| 1137 |
+
" <th>54</th>\n",
|
| 1138 |
+
" <td>2026-01-15</td>\n",
|
| 1139 |
+
" <td>22.252390</td>\n",
|
| 1140 |
+
" </tr>\n",
|
| 1141 |
+
" <tr>\n",
|
| 1142 |
+
" <th>55</th>\n",
|
| 1143 |
+
" <td>2026-01-16</td>\n",
|
| 1144 |
+
" <td>22.551813</td>\n",
|
| 1145 |
+
" </tr>\n",
|
| 1146 |
+
" </tbody>\n",
|
| 1147 |
+
"</table>\n",
|
| 1148 |
+
"</div>"
|
| 1149 |
+
],
|
| 1150 |
+
"text/plain": [
|
| 1151 |
+
" t s01pid49_absthrottleposd\n",
|
| 1152 |
+
"0 2025-11-22 15.709290\n",
|
| 1153 |
+
"1 2025-11-23 NaN\n",
|
| 1154 |
+
"2 2025-11-24 21.347658\n",
|
| 1155 |
+
"3 2025-11-25 22.176305\n",
|
| 1156 |
+
"4 2025-11-26 22.074130\n",
|
| 1157 |
+
"5 2025-11-27 22.379063\n",
|
| 1158 |
+
"6 2025-11-28 22.611687\n",
|
| 1159 |
+
"7 2025-11-29 NaN\n",
|
| 1160 |
+
"8 2025-11-30 NaN\n",
|
| 1161 |
+
"9 2025-12-01 NaN\n",
|
| 1162 |
+
"10 2025-12-02 NaN\n",
|
| 1163 |
+
"11 2025-12-03 22.212069\n",
|
| 1164 |
+
"12 2025-12-04 21.593356\n",
|
| 1165 |
+
"13 2025-12-05 22.048014\n",
|
| 1166 |
+
"14 2025-12-06 NaN\n",
|
| 1167 |
+
"15 2025-12-07 NaN\n",
|
| 1168 |
+
"16 2025-12-08 21.288014\n",
|
| 1169 |
+
"17 2025-12-09 22.105263\n",
|
| 1170 |
+
"18 2025-12-10 22.144666\n",
|
| 1171 |
+
"19 2025-12-11 21.774071\n",
|
| 1172 |
+
"20 2025-12-12 21.957367\n",
|
| 1173 |
+
"21 2025-12-13 NaN\n",
|
| 1174 |
+
"22 2025-12-14 NaN\n",
|
| 1175 |
+
"23 2025-12-15 20.411036\n",
|
| 1176 |
+
"24 2025-12-16 21.394285\n",
|
| 1177 |
+
"25 2025-12-17 21.644342\n",
|
| 1178 |
+
"26 2025-12-18 22.130631\n",
|
| 1179 |
+
"27 2025-12-19 21.194253\n",
|
| 1180 |
+
"28 2025-12-20 NaN\n",
|
| 1181 |
+
"29 2025-12-21 NaN\n",
|
| 1182 |
+
"30 2025-12-22 21.804700\n",
|
| 1183 |
+
"31 2025-12-23 21.961360\n",
|
| 1184 |
+
"32 2025-12-24 22.252882\n",
|
| 1185 |
+
"33 2025-12-25 21.916508\n",
|
| 1186 |
+
"34 2025-12-26 22.494252\n",
|
| 1187 |
+
"35 2025-12-27 NaN\n",
|
| 1188 |
+
"36 2025-12-28 NaN\n",
|
| 1189 |
+
"37 2025-12-29 21.873543\n",
|
| 1190 |
+
"38 2025-12-30 21.890226\n",
|
| 1191 |
+
"39 2025-12-31 22.529185\n",
|
| 1192 |
+
"40 2026-01-01 NaN\n",
|
| 1193 |
+
"41 2026-01-02 22.761742\n",
|
| 1194 |
+
"42 2026-01-03 NaN\n",
|
| 1195 |
+
"43 2026-01-04 NaN\n",
|
| 1196 |
+
"44 2026-01-05 22.315963\n",
|
| 1197 |
+
"45 2026-01-06 22.110849\n",
|
| 1198 |
+
"46 2026-01-07 21.613014\n",
|
| 1199 |
+
"47 2026-01-08 21.953064\n",
|
| 1200 |
+
"48 2026-01-09 21.585424\n",
|
| 1201 |
+
"49 2026-01-10 NaN\n",
|
| 1202 |
+
"50 2026-01-11 NaN\n",
|
| 1203 |
+
"51 2026-01-12 22.092380\n",
|
| 1204 |
+
"52 2026-01-13 22.664499\n",
|
| 1205 |
+
"53 2026-01-14 22.124919\n",
|
| 1206 |
+
"54 2026-01-15 22.252390\n",
|
| 1207 |
+
"55 2026-01-16 22.551813"
|
| 1208 |
+
]
|
| 1209 |
+
},
|
| 1210 |
+
"execution_count": 25,
|
| 1211 |
+
"metadata": {},
|
| 1212 |
+
"output_type": "execute_result"
|
| 1213 |
+
}
|
| 1214 |
+
],
|
| 1215 |
+
"source": [
|
| 1216 |
+
"df.drop(columns=['date_created']).groupby(pd.Grouper(key='t', freq='D')).mean().reset_index()"
|
| 1217 |
+
]
|
| 1218 |
+
},
|
| 1219 |
+
{
|
| 1220 |
+
"cell_type": "code",
|
| 1221 |
+
"execution_count": 22,
|
| 1222 |
+
"id": "64376fb5",
|
| 1223 |
+
"metadata": {},
|
| 1224 |
+
"outputs": [
|
| 1225 |
+
{
|
| 1226 |
+
"data": {
|
| 1227 |
+
"text/plain": [
|
| 1228 |
+
"0 15.686275\n",
|
| 1229 |
+
"1 15.686275\n",
|
| 1230 |
+
"2 15.686275\n",
|
| 1231 |
+
"3 30.980392\n",
|
| 1232 |
+
"4 15.686275\n",
|
| 1233 |
+
" ... \n",
|
| 1234 |
+
"651996 15.686275\n",
|
| 1235 |
+
"651997 15.686275\n",
|
| 1236 |
+
"651998 15.686275\n",
|
| 1237 |
+
"651999 15.686275\n",
|
| 1238 |
+
"652000 15.686275\n",
|
| 1239 |
+
"Name: s01pid49_absthrottleposd, Length: 652001, dtype: float64"
|
| 1240 |
+
]
|
| 1241 |
+
},
|
| 1242 |
+
"execution_count": 22,
|
| 1243 |
+
"metadata": {},
|
| 1244 |
+
"output_type": "execute_result"
|
| 1245 |
+
}
|
| 1246 |
+
],
|
| 1247 |
+
"source": [
|
| 1248 |
+
"df['s01pid49_absthrottleposd']"
|
| 1249 |
+
]
|
| 1250 |
+
},
|
| 1251 |
+
{
|
| 1252 |
+
"cell_type": "code",
|
| 1253 |
+
"execution_count": 15,
|
| 1254 |
+
"id": "ec3d240b",
|
| 1255 |
+
"metadata": {},
|
| 1256 |
+
"outputs": [
|
| 1257 |
+
{
|
| 1258 |
+
"data": {
|
| 1259 |
+
"text/html": [
|
| 1260 |
+
"<div>\n",
|
| 1261 |
+
"<style scoped>\n",
|
| 1262 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 1263 |
+
" vertical-align: middle;\n",
|
| 1264 |
+
" }\n",
|
| 1265 |
+
"\n",
|
| 1266 |
+
" .dataframe tbody tr th {\n",
|
| 1267 |
+
" vertical-align: top;\n",
|
| 1268 |
+
" }\n",
|
| 1269 |
+
"\n",
|
| 1270 |
+
" .dataframe thead th {\n",
|
| 1271 |
+
" text-align: right;\n",
|
| 1272 |
+
" }\n",
|
| 1273 |
+
"</style>\n",
|
| 1274 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 1275 |
+
" <thead>\n",
|
| 1276 |
+
" <tr style=\"text-align: right;\">\n",
|
| 1277 |
+
" <th></th>\n",
|
| 1278 |
+
" <th>t</th>\n",
|
| 1279 |
+
" <th>altitudevalid</th>\n",
|
| 1280 |
+
" <th>altitude</th>\n",
|
| 1281 |
+
" <th>altitudeaccuracy</th>\n",
|
| 1282 |
+
" <th>date_created</th>\n",
|
| 1283 |
+
" </tr>\n",
|
| 1284 |
+
" </thead>\n",
|
| 1285 |
+
" <tbody>\n",
|
| 1286 |
+
" <tr>\n",
|
| 1287 |
+
" <th>6923253</th>\n",
|
| 1288 |
+
" <td>2025-10-21 15:09:49.454</td>\n",
|
| 1289 |
+
" <td>1.0</td>\n",
|
| 1290 |
+
" <td>-45.4</td>\n",
|
| 1291 |
+
" <td>43.0</td>\n",
|
| 1292 |
+
" <td>2025/10/21</td>\n",
|
| 1293 |
+
" </tr>\n",
|
| 1294 |
+
" <tr>\n",
|
| 1295 |
+
" <th>6923254</th>\n",
|
| 1296 |
+
" <td>2025-10-21 15:09:49.654</td>\n",
|
| 1297 |
+
" <td>1.0</td>\n",
|
| 1298 |
+
" <td>-44.4</td>\n",
|
| 1299 |
+
" <td>37.0</td>\n",
|
| 1300 |
+
" <td>2025/10/21</td>\n",
|
| 1301 |
+
" </tr>\n",
|
| 1302 |
+
" <tr>\n",
|
| 1303 |
+
" <th>6923255</th>\n",
|
| 1304 |
+
" <td>2025-10-21 15:09:49.854</td>\n",
|
| 1305 |
+
" <td>1.0</td>\n",
|
| 1306 |
+
" <td>-43.8</td>\n",
|
| 1307 |
+
" <td>32.0</td>\n",
|
| 1308 |
+
" <td>2025/10/21</td>\n",
|
| 1309 |
+
" </tr>\n",
|
| 1310 |
+
" <tr>\n",
|
| 1311 |
+
" <th>6923256</th>\n",
|
| 1312 |
+
" <td>2025-10-21 15:09:50.263</td>\n",
|
| 1313 |
+
" <td>1.0</td>\n",
|
| 1314 |
+
" <td>-43.3</td>\n",
|
| 1315 |
+
" <td>29.0</td>\n",
|
| 1316 |
+
" <td>2025/10/21</td>\n",
|
| 1317 |
+
" </tr>\n",
|
| 1318 |
+
" <tr>\n",
|
| 1319 |
+
" <th>6923257</th>\n",
|
| 1320 |
+
" <td>2025-10-21 15:09:50.463</td>\n",
|
| 1321 |
+
" <td>1.0</td>\n",
|
| 1322 |
+
" <td>-41.9</td>\n",
|
| 1323 |
+
" <td>24.0</td>\n",
|
| 1324 |
+
" <td>2025/10/21</td>\n",
|
| 1325 |
+
" </tr>\n",
|
| 1326 |
+
" <tr>\n",
|
| 1327 |
+
" <th>...</th>\n",
|
| 1328 |
+
" <td>...</td>\n",
|
| 1329 |
+
" <td>...</td>\n",
|
| 1330 |
+
" <td>...</td>\n",
|
| 1331 |
+
" <td>...</td>\n",
|
| 1332 |
+
" <td>...</td>\n",
|
| 1333 |
+
" </tr>\n",
|
| 1334 |
+
" <tr>\n",
|
| 1335 |
+
" <th>12876766</th>\n",
|
| 1336 |
+
" <td>2026-01-17 12:59:59.093</td>\n",
|
| 1337 |
+
" <td>1.0</td>\n",
|
| 1338 |
+
" <td>-12.4</td>\n",
|
| 1339 |
+
" <td>1.0</td>\n",
|
| 1340 |
+
" <td>2026/01/17</td>\n",
|
| 1341 |
+
" </tr>\n",
|
| 1342 |
+
" <tr>\n",
|
| 1343 |
+
" <th>12876767</th>\n",
|
| 1344 |
+
" <td>2026-01-17 12:59:59.293</td>\n",
|
| 1345 |
+
" <td>1.0</td>\n",
|
| 1346 |
+
" <td>-12.4</td>\n",
|
| 1347 |
+
" <td>1.0</td>\n",
|
| 1348 |
+
" <td>2026/01/17</td>\n",
|
| 1349 |
+
" </tr>\n",
|
| 1350 |
+
" <tr>\n",
|
| 1351 |
+
" <th>12876768</th>\n",
|
| 1352 |
+
" <td>2026-01-17 12:59:59.493</td>\n",
|
| 1353 |
+
" <td>1.0</td>\n",
|
| 1354 |
+
" <td>-12.4</td>\n",
|
| 1355 |
+
" <td>1.0</td>\n",
|
| 1356 |
+
" <td>2026/01/17</td>\n",
|
| 1357 |
+
" </tr>\n",
|
| 1358 |
+
" <tr>\n",
|
| 1359 |
+
" <th>12876769</th>\n",
|
| 1360 |
+
" <td>2026-01-17 12:59:59.693</td>\n",
|
| 1361 |
+
" <td>1.0</td>\n",
|
| 1362 |
+
" <td>-12.4</td>\n",
|
| 1363 |
+
" <td>1.0</td>\n",
|
| 1364 |
+
" <td>2026/01/17</td>\n",
|
| 1365 |
+
" </tr>\n",
|
| 1366 |
+
" <tr>\n",
|
| 1367 |
+
" <th>12876770</th>\n",
|
| 1368 |
+
" <td>2026-01-17 12:59:59.893</td>\n",
|
| 1369 |
+
" <td>1.0</td>\n",
|
| 1370 |
+
" <td>-12.4</td>\n",
|
| 1371 |
+
" <td>1.0</td>\n",
|
| 1372 |
+
" <td>2026/01/17</td>\n",
|
| 1373 |
+
" </tr>\n",
|
| 1374 |
+
" </tbody>\n",
|
| 1375 |
+
"</table>\n",
|
| 1376 |
+
"<p>12981228 rows × 5 columns</p>\n",
|
| 1377 |
+
"</div>"
|
| 1378 |
+
],
|
| 1379 |
+
"text/plain": [
|
| 1380 |
+
" t altitudevalid altitude altitudeaccuracy \\\n",
|
| 1381 |
+
"6923253 2025-10-21 15:09:49.454 1.0 -45.4 43.0 \n",
|
| 1382 |
+
"6923254 2025-10-21 15:09:49.654 1.0 -44.4 37.0 \n",
|
| 1383 |
+
"6923255 2025-10-21 15:09:49.854 1.0 -43.8 32.0 \n",
|
| 1384 |
+
"6923256 2025-10-21 15:09:50.263 1.0 -43.3 29.0 \n",
|
| 1385 |
+
"6923257 2025-10-21 15:09:50.463 1.0 -41.9 24.0 \n",
|
| 1386 |
+
"... ... ... ... ... \n",
|
| 1387 |
+
"12876766 2026-01-17 12:59:59.093 1.0 -12.4 1.0 \n",
|
| 1388 |
+
"12876767 2026-01-17 12:59:59.293 1.0 -12.4 1.0 \n",
|
| 1389 |
+
"12876768 2026-01-17 12:59:59.493 1.0 -12.4 1.0 \n",
|
| 1390 |
+
"12876769 2026-01-17 12:59:59.693 1.0 -12.4 1.0 \n",
|
| 1391 |
+
"12876770 2026-01-17 12:59:59.893 1.0 -12.4 1.0 \n",
|
| 1392 |
+
"\n",
|
| 1393 |
+
" date_created \n",
|
| 1394 |
+
"6923253 2025/10/21 \n",
|
| 1395 |
+
"6923254 2025/10/21 \n",
|
| 1396 |
+
"6923255 2025/10/21 \n",
|
| 1397 |
+
"6923256 2025/10/21 \n",
|
| 1398 |
+
"6923257 2025/10/21 \n",
|
| 1399 |
+
"... ... \n",
|
| 1400 |
+
"12876766 2026/01/17 \n",
|
| 1401 |
+
"12876767 2026/01/17 \n",
|
| 1402 |
+
"12876768 2026/01/17 \n",
|
| 1403 |
+
"12876769 2026/01/17 \n",
|
| 1404 |
+
"12876770 2026/01/17 \n",
|
| 1405 |
+
"\n",
|
| 1406 |
+
"[12981228 rows x 5 columns]"
|
| 1407 |
+
]
|
| 1408 |
+
},
|
| 1409 |
+
"execution_count": 15,
|
| 1410 |
+
"metadata": {},
|
| 1411 |
+
"output_type": "execute_result"
|
| 1412 |
+
}
|
| 1413 |
+
],
|
| 1414 |
+
"source": [
|
| 1415 |
+
"df.sort_values(by='t')"
|
| 1416 |
+
]
|
| 1417 |
+
},
|
| 1418 |
+
{
|
| 1419 |
+
"cell_type": "code",
|
| 1420 |
+
"execution_count": null,
|
| 1421 |
+
"id": "ed959149",
|
| 1422 |
+
"metadata": {},
|
| 1423 |
+
"outputs": [],
|
| 1424 |
+
"source": []
|
| 1425 |
+
}
|
| 1426 |
+
],
|
| 1427 |
+
"metadata": {
|
| 1428 |
+
"kernelspec": {
|
| 1429 |
+
"display_name": "venv",
|
| 1430 |
+
"language": "python",
|
| 1431 |
+
"name": "python3"
|
| 1432 |
+
},
|
| 1433 |
+
"language_info": {
|
| 1434 |
+
"codemirror_mode": {
|
| 1435 |
+
"name": "ipython",
|
| 1436 |
+
"version": 3
|
| 1437 |
+
},
|
| 1438 |
+
"file_extension": ".py",
|
| 1439 |
+
"mimetype": "text/x-python",
|
| 1440 |
+
"name": "python",
|
| 1441 |
+
"nbconvert_exporter": "python",
|
| 1442 |
+
"pygments_lexer": "ipython3",
|
| 1443 |
+
"version": "3.10.18"
|
| 1444 |
+
}
|
| 1445 |
+
},
|
| 1446 |
+
"nbformat": 4,
|
| 1447 |
+
"nbformat_minor": 5
|
| 1448 |
+
}
|