Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| import os | |
| import json | |
| import time | |
| import numpy as np | |
| import pandas as pd | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.utils.data import TensorDataset, DataLoader | |
| from tqdm import tqdm | |
| from flask import Flask, render_template, request, jsonify, send_file | |
| from flask_socketio import SocketIO, emit | |
| import tempfile | |
| import threading | |
| from pathlib import Path | |
| from werkzeug.utils import secure_filename | |
app = Flask(__name__)
# SECURITY: read the secret key from the environment when available; the
# literal fallback preserves the old behavior but must not be used in
# production deployments.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key-here')
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['MAX_CONTENT_LENGTH'] = 500 * 1024 * 1024  # 500MB max file size
socketio = SocketIO(app, cors_allowed_origins="*")

# Ensure upload directory exists with proper permissions
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
# Fix permissions for Hugging Face Spaces.
# Only OSError is expected here (previously a bare `except:` that would also
# swallow KeyboardInterrupt/SystemExit).
try:
    os.chmod(app.config['UPLOAD_FOLDER'], 0o755)
except OSError:
    pass  # In case we don't have permission to change permissions

# Try to create results directory at startup; makedirs/chmod can only raise
# OSError, so catch exactly that instead of everything.
try:
    os.makedirs('results/inference_atlantic', exist_ok=True)
    os.chmod('results/inference_atlantic', 0o755)
except OSError:
    print("β οΈ WARNING: Could not create results directory - will use temp directory for results")

# Global variables for progress tracking (mutated by update_progress)
current_progress = {'step': 'idle', 'progress': 0, 'details': ''}
| ######################################## | |
| # MODEL DEFINITION # | |
| ######################################## | |
class LSTMWithAttentionWithResid(nn.Module):
    """LSTM encoder with a residual embedding connection and additive attention.

    Maps an input batch of shape [batch, seq_len, in_dim] to velocity
    forecasts of shape [batch, forecast_horizon, 2] (latitude / longitude
    components).
    """

    def __init__(self, in_dim, hidden_dim, forecast_horizon, n_layers=10, dropout=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.forecast_horizon = forecast_horizon
        # Project raw features up to the LSTM width so the residual add in
        # forward() is dimensionally valid.
        self.embedding = nn.Linear(in_dim, hidden_dim)
        self.lstm = nn.LSTM(
            hidden_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True
        )
        # Normalizes the residual sum of embedding and LSTM output.
        self.layer_norm = nn.LayerNorm(hidden_dim)
        # Additive attention: score_t = v^T tanh(W h_t).
        self.attention = nn.Linear(hidden_dim, hidden_dim)
        self.context_vector = nn.Linear(hidden_dim, 1, bias=False)
        # Output head: one (lat, lon) pair per forecast step.
        self.fc = nn.Linear(hidden_dim, forecast_horizon * 2)

    def forward(self, x):
        """Forecast future velocities for a batch of sequences.

        Args:
            x: tensor of shape [batch, seq_len, in_dim].
        Returns:
            tensor of shape [batch, forecast_horizon, 2].
        """
        emb = self.embedding(x)                              # [B, T, H]
        seq_out, _ = self.lstm(emb)                          # [B, T, H]
        # Residual connection followed by layer norm.
        seq_out = self.layer_norm(seq_out + emb)             # [B, T, H]
        # Score each timestep, then softmax over the time axis.
        scores = self.context_vector(torch.tanh(self.attention(seq_out))).squeeze(-1)  # [B, T]
        weights = F.softmax(scores, dim=1)                   # [B, T]
        # Attention-weighted sum of the timestep representations.
        context = torch.bmm(weights.unsqueeze(1), seq_out).squeeze(1)  # [B, H]
        flat = self.fc(context)                              # [B, horizon * 2]
        return flat.view(-1, self.forecast_horizon, 2)
| ######################################## | |
| # UTILITY FUNCTIONS # | |
| ######################################## | |
def update_progress(step, progress, details=""):
    """Record the current pipeline step and broadcast it to connected clients.

    Overwrites the module-level ``current_progress`` dict and emits a
    ``progress_update`` SocketIO event carrying the same payload.
    """
    global current_progress
    current_progress = dict(step=step, progress=progress, details=details)
    socketio.emit('progress_update', current_progress)
def create_sequences_grouped_by_segment_lat_long_veloc(df_scaled, seq_len=12, forecast_horizon=1, features_to_scale=None):
    """Build overlapping model input sequences, grouped per vessel segment.

    For every segment long enough to host at least one window, emits all
    windows of ``seq_len`` rows, each augmented with a constant column holding
    the target timestep's ``time_scalar``.

    Returns:
        tuple of numpy arrays:
            - inputs: float32 [n, seq_len, len(features_to_scale) + 1]
            - targets: float32 [n, 2] future (lat, lon) velocities
            - seg_ids: segment id per window
            - anchor_positions: float32 [n, 2] last known (lat, lon) per window
    """
    update_progress('Creating sequences', 10, f'Processing {len(df_scaled)} data points...')
    inputs, targets, seg_ids, anchor_positions = [], [], [], []
    if features_to_scale is None:
        # CRITICAL: Match the exact inference feature order (segment first,
        # removed before the tensor reaches the model).
        features_to_scale = [
            "segment",                # Index 0 - will be removed before model
            "latitude_velocity_km",   # Index 1 -> 0 after segment removal
            "longitude_velocity_km",  # Index 2 -> 1 after segment removal
            "latitude_degrees",       # Index 3 -> 2 after segment removal
            "longitude_degrees",      # Index 4 -> 3 after segment removal
            "time_difference_hours",  # Index 5 -> 4 after segment removal
            "time_scalar",            # Index 6 -> 5 after segment removal
        ]
    # Fail fast if the dataframe is missing any required feature column.
    absent = [f for f in features_to_scale if f not in df_scaled.columns]
    if absent:
        raise ValueError(f"Missing required features: {absent}")
    grouped = df_scaled.groupby('segment')
    total_segments = len(grouped)
    report_every = max(1, total_segments // 20)  # ~20 progress updates total
    for idx, (seg_id, track) in enumerate(grouped):
        track = track.reset_index(drop=True)
        n_points = len(track)
        if idx % report_every == 0:
            # Sequence building owns the 10-40% band of the progress bar.
            update_progress('Creating sequences', 10 + (idx / total_segments) * 30,
                            f'Processing segment {idx+1}/{total_segments}')
        if n_points < seq_len + forecast_horizon:
            continue  # segment too short to host a single window
        for start in range(n_points - seq_len - forecast_horizon + 1):
            target_row = start + seq_len + forecast_horizon - 1
            # Window of raw features, augmented with the forecast timestamp.
            window = track.iloc[start:(start + seq_len)][features_to_scale].to_numpy()
            future_time = track['time_scalar'].iloc[target_row]
            inputs.append(np.hstack((window, np.full((seq_len, 1), future_time))))
            # Target: the velocity pair at the forecast horizon.
            targets.append(track[['latitude_velocity_km', 'longitude_velocity_km']].iloc[target_row].to_numpy())
            seg_ids.append(seg_id)
            # Last observed position inside the window.
            anchor_positions.append(track[['latitude_degrees', 'longitude_degrees']].iloc[start + seq_len - 1].to_numpy())
    return (np.array(inputs, dtype=np.float32),
            np.array(targets, dtype=np.float32),
            np.array(seg_ids),
            np.array(anchor_positions, dtype=np.float32))
def load_normalization_params(json_path):
    """Read per-feature min/max scaling bounds from a JSON file.

    Returns:
        (feature_mins, feature_maxs) -- two dicts keyed by feature name.
    """
    with open(json_path, "r") as fh:
        params = json.load(fh)
    return params["feature_mins"], params["feature_maxs"]
def minmax_denormalize(scaled_series, feature_min, feature_max):
    """Invert min-max scaling: map [0, 1] values back to the original range."""
    span = feature_max - feature_min
    return scaled_series * span + feature_min
| ######################################## | |
| # INFERENCE PIPELINE # | |
| ######################################## | |
def run_inference_pipeline(csv_file_path, model_path, normalization_path):
    """Complete inference pipeline following Final_inference_maginet.py logic.

    Loads a vessel-track CSV, applies the same quality filters and min-max
    normalization used at training time, builds sliding-window sequences,
    runs the LSTM-attention model, denormalizes the predictions, and writes a
    results CSV plus summary statistics.

    Parameters:
        csv_file_path: path to the input CSV of vessel positions/velocities.
        model_path: path to the trained model state_dict (.pth).
        normalization_path: path to the JSON file with feature min/max bounds.

    Returns:
        dict with key 'success'; on success also 'results_file', 'stats',
        'histogram' and 'message'; on failure an 'error' string. The function
        never raises -- every exception is caught and reported via the return
        value and the progress channel.
    """
    try:
        # Step 1: Load and validate data
        update_progress('Loading data', 5, 'Reading CSV file...')
        # Enhanced CSV parsing with error handling
        try:
            # Determine separator by reading first few lines
            with open(csv_file_path, 'r') as f:
                first_line = f.readline()
            separator = ';' if ';' in first_line else ','
            # Try reading with detected separator
            df = pd.read_csv(csv_file_path, sep=separator, on_bad_lines='skip')
            update_progress('Loading data', 8, f'Loaded {len(df)} rows with separator "{separator}"')
            # Debug: Print actual column names
            print(f"π CSV COLUMNS FOUND: {list(df.columns)}")
            update_progress('Loading data', 8.5, f'Columns: {list(df.columns)}')
        except Exception as e:
            print(f"β CSV PARSING ERROR: {e}")
            # Try alternative parsing methods
            try:
                df = pd.read_csv(csv_file_path, sep=',', on_bad_lines='skip')
                update_progress('Loading data', 8, f'Loaded {len(df)} rows with comma separator (fallback)')
                print(f"π CSV COLUMNS FOUND (fallback): {list(df.columns)}")
            except Exception as e2:
                try:
                    df = pd.read_csv(csv_file_path, sep=';', on_bad_lines='skip')
                    update_progress('Loading data', 8, f'Loaded {len(df)} rows with semicolon separator (fallback)')
                    print(f"π CSV COLUMNS FOUND (fallback): {list(df.columns)}")
                except Exception as e3:
                    raise ValueError(f"Could not parse CSV file. Tried multiple separators. Errors: {e}, {e2}, {e3}")
        # CRITICAL: Create time_scalar (was missing from inference dataset!)
        if 'time_scalar' not in df.columns:
            if 'datetime' in df.columns:
                # Convert datetime to time_scalar (preferred method)
                df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
                reference_date = pd.Timestamp('2023-01-01')
                # time_scalar = fractional days since the reference date
                df['time_scalar'] = ((df['datetime'] - reference_date) / pd.Timedelta(days=1)).round(8)
                update_progress('Loading data', 9, 'Created time_scalar from datetime column')
            elif 'time_decimal' in df.columns:
                # Use time_decimal directly as time_scalar (alternative method)
                df['time_scalar'] = df['time_decimal'].copy()
                update_progress('Loading data', 9, 'Created time_scalar from time_decimal column')
            elif all(col in df.columns for col in ['day', 'month', 'time_decimal']):
                # Create datetime from components and then time_scalar
                df['year'] = df.get('year', 2024)  # Default year if not present
                df['datetime'] = pd.to_datetime(df[['year', 'month', 'day']], errors='coerce')
                df['datetime'] += pd.to_timedelta(df['time_decimal'], unit='h')
                reference_date = pd.Timestamp('2023-01-01')
                df['time_scalar'] = ((df['datetime'] - reference_date) / pd.Timedelta(days=1)).round(8)
                update_progress('Loading data', 9, 'Created time_scalar from day/month/time_decimal')
            else:
                # Create a simple sequential time_scalar based on row order
                df['time_scalar'] = df.index / len(df)
                update_progress('Loading data', 9, 'Created sequential time_scalar')
        # Validate required columns with detailed error reporting
        required_columns = [
            'segment', 'latitude_velocity_km', 'longitude_velocity_km',
            'latitude_degrees', 'longitude_degrees', 'time_difference_hours', 'time_scalar'
        ]
        print(f"π REQUIRED COLUMNS: {required_columns}")
        print(f"π ACTUAL COLUMNS: {list(df.columns)}")
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            available_cols = list(df.columns)
            error_msg = f"""
β COLUMN VALIDATION ERROR:
Missing required columns: {missing_columns}
Available columns: {available_cols}
Column mapping suggestions:
- Check for extra spaces or different naming
- Verify CSV file format and encoding
- Ensure time_scalar column exists or can be created
"""
            print(error_msg)
            raise ValueError(f"Missing required columns: {missing_columns}. Available: {available_cols}")
        # CRITICAL: Apply the SAME data filtering as training/notebook
        update_progress('Filtering data', 10, 'Applying quality filters...')
        original_count = len(df)
        # 1. Calculate speed column if missing (CRITICAL!)
        if 'speed_km_h' not in df.columns:
            # Speed magnitude from the two velocity components
            df['speed_km_h'] = np.sqrt(df['latitude_velocity_km']**2 + df['longitude_velocity_km']**2)
            update_progress('Filtering data', 10.5, 'Calculated speed_km_h column')
        # 2. Speed filtering - EXACTLY like training
        df = df[(df['speed_km_h'] >= 2) & (df['speed_km_h'] <= 60)].copy()
        update_progress('Filtering data', 11, f'Speed filter: {original_count} -> {len(df)} rows')
        # 3. Velocity filtering - CRITICAL for performance!
        velocity_mask = (
            (np.abs(df['latitude_velocity_km']) <= 100) &
            (np.abs(df['longitude_velocity_km']) <= 100) &
            (df['time_difference_hours'] > 0) &
            (df['time_difference_hours'] <= 24)  # Max 24 hours between points
        )
        df = df[velocity_mask].copy()
        update_progress('Filtering data', 12, f'Velocity filter: -> {len(df)} rows')
        # 4. Segment length filtering - Remove segments with < 20 points
        segment_counts = df['segment'].value_counts()
        segments_to_remove = segment_counts[segment_counts < 20].index
        before_segment_filter = len(df)
        df = df[~df['segment'].isin(segments_to_remove)].copy()
        update_progress('Filtering data', 13, f'Segment filter: {before_segment_filter} -> {len(df)} rows')
        # 5. Remove NaN and infinite values
        df = df.dropna().copy()
        numeric_cols = ['latitude_velocity_km', 'longitude_velocity_km', 'time_difference_hours']
        for col in numeric_cols:
            if col in df.columns:
                df = df[~np.isinf(df[col])].copy()
        # DEBUGGING: Add detailed filtering statistics
        filtered_count = len(df)
        filter_percent = ((original_count - filtered_count) / original_count) * 100
        update_progress('Filtering data', 14, f'Final filtered data: {filtered_count} rows ({original_count - filtered_count} removed = {filter_percent:.1f}%)')
        # Debug info for analysis
        print(f"π FILTERING SUMMARY:")
        print(f"   Original: {original_count:,} rows")
        print(f"   Final: {filtered_count:,} rows")
        print(f"   Removed: {original_count - filtered_count:,} ({filter_percent:.1f}%)")
        if len(df) == 0:
            raise ValueError("No data remaining after quality filtering. Check your input data quality.")
        # Step 2: Load normalization parameters
        # NOTE(review): progress value drops from 14 back to 12 here -- cosmetic only
        update_progress('Loading normalization', 12, 'Loading normalization parameters...')
        feature_mins, feature_maxs = load_normalization_params(normalization_path)
        # Step 2.5: CRITICAL - Normalize the test data (missing step causing 3373km error!)
        update_progress('Normalizing data', 15, 'Applying normalization to test data...')
        features_to_normalize = ['latitude_velocity_km', 'longitude_velocity_km',
                                 'latitude_degrees', 'longitude_degrees',
                                 'time_difference_hours', 'time_scalar']
        for feature in features_to_normalize:
            if feature in df.columns and feature in feature_mins:
                min_val = feature_mins[feature]
                max_val = feature_maxs[feature]
                # Guard against zero-range features (avoid division by zero)
                rng = max_val - min_val if max_val != min_val else 1
                df[feature] = (df[feature] - min_val) / rng
                update_progress('Normalizing data', 18, f'Normalized {feature}')
        # Step 3: Create sequences
        SEQ_LENGTH = 12
        FORECAST_HORIZON = 1
        X_test, y_test, test_segments, last_known_positions_scaled = create_sequences_grouped_by_segment_lat_long_veloc(
            df, seq_len=SEQ_LENGTH, forecast_horizon=FORECAST_HORIZON
        )
        update_progress('Preparing model', 45, f'Created {len(X_test)} sequences')
        if len(X_test) == 0:
            raise ValueError("No valid sequences could be created. Check your data and sequence length requirements.")
        # Step 4: Prepare data for model
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        X_test_tensor = torch.from_numpy(X_test).float().to(device)
        y_test_tensor = torch.from_numpy(y_test).float().to(device)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
        # Step 5: Load model
        update_progress('Loading model', 50, 'Loading trained model...')
        # CRITICAL: Model expects 6 features (segment removed) + 1 future_time = 7 total
        in_dim = X_test.shape[2] - 1  # Remove segment column dimension
        # CRITICAL: Match the exact model architecture from Atlantic model weights
        hidden_dim = 250  # From best_model.pth
        n_layers = 7  # From best_model.pth (CRITICAL: not 10!)
        dropout = 0.2
        model = LSTMWithAttentionWithResid(
            in_dim, hidden_dim, FORECAST_HORIZON,
            n_layers=n_layers, dropout=dropout
        ).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        # Step 6: Run inference
        update_progress('Running inference', 60, 'Making predictions...')
        # CRITICAL: Extract features batch-by-batch like your notebook
        all_preds = []
        segments_extracted = []
        time_scalars_extracted = []
        time_diff_hours_extracted = []
        with torch.no_grad():
            for i, batch in enumerate(test_loader):
                x_batch, _ = batch
                # CRITICAL: Extract features exactly like your notebook
                segment_batch = x_batch[:, 0, 0].cpu().numpy()  # Take segment from first time step
                time_scalar_batch = x_batch[:, -1, 6].cpu().numpy()  # LAST timestep, index 6 = time_scalar
                time_diff_hours_batch = x_batch[:, 0, 5].cpu().numpy()  # First timestep, index 5
                segments_extracted.extend(segment_batch)
                time_scalars_extracted.extend(time_scalar_batch)
                time_diff_hours_extracted.extend(time_diff_hours_batch)
                # Remove segment column before model input
                x_batch_no_segment = x_batch[:, :, 1:]  # Remove segment (index 0) but keep all other features
                preds = model(x_batch_no_segment)
                all_preds.append(preds.cpu().numpy())
                # Progress update
                progress = 60 + (i / len(test_loader)) * 20  # 60-80% range
                update_progress('Running inference', progress,
                                f'Processing batch {i+1}/{len(test_loader)}')
        all_preds = np.concatenate(all_preds, axis=0)
        # Step 7: Process results
        update_progress('Processing results', 80, 'Processing predictions...')
        # CRITICAL: Reshape predictions exactly like your notebook
        yhat = torch.from_numpy(all_preds)
        yhat = yhat.view(-1, 2)  # Reshape to [batch_size, 2] - EXACTLY like your notebook
        # Extract predictions exactly like your notebook
        predicted_lat_vel = yhat[:, 0].numpy()  # Predicted lat velocity
        predicted_lon_vel = yhat[:, 1].numpy()  # Predicted lon velocity
        # Extract actual values exactly like your notebook
        y_real = y_test_tensor.cpu()
        actual_lat_vel = y_real[:, 0].numpy()  # Actual lat velocity
        actual_lon_vel = y_real[:, 1].numpy()  # Actual lon velocity
        # CRITICAL: Use extracted features from batches (matching your notebook exactly)
        # Ensure all arrays have consistent length
        num_samples = len(predicted_lat_vel)
        segments_extracted = segments_extracted[:num_samples]
        time_scalars_extracted = time_scalars_extracted[:num_samples]
        time_diff_hours_extracted = time_diff_hours_extracted[:num_samples]
        last_known_positions_scaled = last_known_positions_scaled[:num_samples]
        # Create results dataframe exactly like your notebook
        results_df = pd.DataFrame({
            'segment': segments_extracted,  # From batch extraction
            'time_difference_hours': time_diff_hours_extracted,  # From batch extraction (first timestep)
            'Time Scalar': time_scalars_extracted,  # From batch extraction (LAST timestep)
            'Last Known Latitude': [pos[0] for pos in last_known_positions_scaled],
            'Last Known Longitude': [pos[1] for pos in last_known_positions_scaled],
            'predicted_lat_km': predicted_lat_vel,
            'predicted_lon_km': predicted_lon_vel,
            'actual_lat_km': actual_lat_vel,
            'actual_lon_km': actual_lon_vel
        })
        # Step 8: Denormalize results
        update_progress('Denormalizing results', 85, 'Converting to real units...')
        # Column to feature mapping (COMPLETE mapping for all denormalizable columns)
        column_to_feature = {
            "predicted_lat_km": "latitude_velocity_km",
            "predicted_lon_km": "longitude_velocity_km",
            "actual_lat_km": "latitude_velocity_km",
            "actual_lon_km": "longitude_velocity_km",
            "Last Known Latitude": "latitude_degrees",
            "Last Known Longitude": "longitude_degrees",
            "time_difference_hours": "time_difference_hours",
            "Time Scalar": "time_scalar"
        }
        # Denormalize relevant columns
        for col, feat in column_to_feature.items():
            if col in results_df.columns and feat in feature_mins:
                fmin = feature_mins[feat]
                fmax = feature_maxs[feat]
                results_df[col + "_unscaled"] = minmax_denormalize(results_df[col], fmin, fmax)
                update_progress('Denormalizing results', 85, f'Denormalized {col}')
        # Ensure all required _unscaled columns exist
        required_unscaled_cols = [
            'predicted_lat_km_unscaled', 'predicted_lon_km_unscaled',
            'actual_lat_km_unscaled', 'actual_lon_km_unscaled',
            'Last Known Latitude_unscaled', 'Last Known Longitude_unscaled',
            'time_difference_hours_unscaled'
        ]
        for col in required_unscaled_cols:
            if col not in results_df.columns:
                base_col = col.replace('_unscaled', '')
                if base_col in results_df.columns:
                    # If base column exists but wasn't denormalized, copy it
                    results_df[col] = results_df[base_col]
                    update_progress('Denormalizing results', 87, f'Created missing {col}')
                else:
                    results_df[col] = 0.0
                    update_progress('Denormalizing results', 87, f'Defaulted missing {col} to 0')
        # ---------------------------
        # NEW: Clip predicted velocities to realistic physical bounds to avoid huge errors
        # ---------------------------
        VELOCITY_RANGE_KM_H = (-100, 100)  # Same limits used during input filtering
        results_df["predicted_lat_km_unscaled"] = results_df["predicted_lat_km_unscaled"].clip(*VELOCITY_RANGE_KM_H)
        results_df["predicted_lon_km_unscaled"] = results_df["predicted_lon_km_unscaled"].clip(*VELOCITY_RANGE_KM_H)
        update_progress('Denormalizing results', 88, 'Clipped predicted velocities to realistic range')
        # Step 9: Calculate final positions and errors (EXACT column structure matching your notebook)
        update_progress('Calculating errors', 90, 'Computing prediction errors...')
        # Compute displacement components (in km): velocity * elapsed hours
        results_df["pred_final_lat_km_component"] = (
            results_df["predicted_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        )
        results_df["pred_final_lon_km_component"] = (
            results_df["predicted_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        )
        results_df["actual_final_lat_km_component"] = (
            results_df["actual_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        )
        results_df["actual_final_lon_km_component"] = (
            results_df["actual_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        )
        # Calculate total displacement magnitudes (MISSING COLUMNS!)
        results_df["pred_final_km"] = np.sqrt(
            results_df["pred_final_lat_km_component"]**2 + results_df["pred_final_lon_km_component"]**2
        )
        results_df["actual_final_km"] = np.sqrt(
            results_df["actual_final_lat_km_component"]**2 + results_df["actual_final_lon_km_component"]**2
        )
        # Calculate Euclidean distance error (in km)
        results_df["error_km"] = np.sqrt(
            (results_df["pred_final_lat_km_component"] - results_df["actual_final_lat_km_component"])**2 +
            (results_df["pred_final_lon_km_component"] - results_df["actual_final_lon_km_component"])**2
        )
        # Compute final positions in degrees
        km_per_deg_lat = 111  # approximate conversion for latitude
        results_df["pred_final_lat_deg"] = results_df["Last Known Latitude_unscaled"] + (
            results_df["predicted_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        ) / km_per_deg_lat
        results_df["actual_final_lat_deg"] = results_df["Last Known Latitude_unscaled"] + (
            results_df["actual_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        ) / km_per_deg_lat
        # Account for longitude scaling by latitude (km per degree shrinks with cos(lat))
        results_df["Last_Known_Lat_rad"] = np.deg2rad(results_df["Last Known Latitude_unscaled"])
        results_df["pred_final_lon_deg"] = results_df["Last Known Longitude_unscaled"] + (
            results_df["predicted_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        ) / (km_per_deg_lat * np.cos(results_df["Last_Known_Lat_rad"]))
        results_df["actual_final_lon_deg"] = results_df["Last Known Longitude_unscaled"] + (
            results_df["actual_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
        ) / (km_per_deg_lat * np.cos(results_df["Last_Known_Lat_rad"]))
        # Step 10: Reorder columns to match your EXACT specification
        update_progress('Finalizing results', 93, 'Reordering columns to match notebook format...')
        # EXACT column order as specified by user
        column_order = [
            'segment', 'time_difference_hours', 'Time Scalar', 'Last Known Latitude', 'Last Known Longitude',
            'predicted_lat_km', 'predicted_lon_km', 'actual_lat_km', 'actual_lon_km',
            'predicted_lat_km_unscaled', 'predicted_lon_km_unscaled', 'actual_lat_km_unscaled', 'actual_lon_km_unscaled',
            'Last Known Latitude_unscaled', 'Last Known Longitude_unscaled', 'time_difference_hours_unscaled',
            'pred_final_km', 'actual_final_km',
            'pred_final_lat_km_component', 'pred_final_lon_km_component',
            'actual_final_lat_km_component', 'actual_final_lon_km_component',
            'error_km', 'pred_final_lat_deg', 'actual_final_lat_deg', 'Last_Known_Lat_rad',
            'pred_final_lon_deg', 'actual_final_lon_deg'
        ]
        # Validate all required columns exist - add missing ones with defaults if needed
        missing_columns = [col for col in column_order if col not in results_df.columns]
        if missing_columns:
            update_progress('Finalizing results', 94, f'Adding missing columns: {missing_columns}')
            for col in missing_columns:
                # Add default values for any missing columns
                if '_unscaled' in col:
                    # For unscaled columns, try to find the original scaled column
                    base_col = col.replace('_unscaled', '')
                    if base_col in results_df.columns and base_col in column_to_feature:
                        # Use the same denormalization process
                        feat = column_to_feature[base_col]
                        if feat in feature_mins:
                            fmin = feature_mins[feat]
                            fmax = feature_maxs[feat]
                            results_df[col] = minmax_denormalize(results_df[base_col], fmin, fmax)
                        else:
                            results_df[col] = results_df[base_col]  # No denormalization available
                    else:
                        results_df[col] = 0.0  # Default to 0
                else:
                    results_df[col] = 0.0  # Default to 0 for any other missing columns
        # Reorder columns to match exact specification
        results_df = results_df[column_order]
        # Step 11: Save results
        update_progress('Saving results', 95, 'Saving inference results...')
        # Create results directory with permission handling
        try:
            results_dir = Path('results/inference_atlantic')
            results_dir.mkdir(parents=True, exist_ok=True)
            # Save to results directory
            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
            results_file = results_dir / f'inference_results_{timestamp}.csv'
            results_df.to_csv(results_file, index=False)
            print(f"β Results saved to: {results_file}")
        except PermissionError:
            # Fallback to temp directory if results directory has permission issues
            # (tempfile is already imported at module level; re-import is harmless)
            import tempfile
            temp_dir = Path(tempfile.gettempdir())
            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
            results_file = temp_dir / f'inference_results_{timestamp}.csv'
            results_df.to_csv(results_file, index=False)
            print(f"β οΈ WARNING: Results saved to temp directory due to permissions: {results_file}")
        except Exception as e:
            print(f"β ERROR: Could not save results file: {str(e)}")
            # Continue anyway - we still have the temp file below
        # Also save to temporary file for compatibility
        output_file = tempfile.NamedTemporaryFile(
            mode='w', suffix='_inference_results.csv', delete=False
        )
        results_df.to_csv(output_file.name, index=False)
        # CRITICAL: Calculate SAME regression metrics as your notebook
        # Convert predictions and actuals to tensors for metric calculation
        yhat_tensor = torch.from_numpy(np.column_stack([predicted_lat_vel, predicted_lon_vel])).float()
        y_real_tensor = torch.from_numpy(np.column_stack([actual_lat_vel, actual_lon_vel])).float()
        # Calculate regression metrics exactly like your notebook
        def calc_metrics_like_notebook(preds, labels):
            """Calculate metrics exactly like your notebook's calc_metrics function"""
            EPS = 1e-8  # avoids division by zero for near-zero labels
            mse = torch.mean((preds - labels) ** 2)
            mae = torch.mean(torch.abs(preds - labels))
            rmse = torch.sqrt(mse)
            mape = torch.mean(torch.abs((preds - labels) / (labels + EPS))) * 100  # Convert to percentage
            rse = torch.sum((preds - labels) ** 2) / torch.sum((labels + EPS) ** 2)
            return rse.item(), mae.item(), mse.item(), mape.item(), rmse.item()
        # Calculate regression metrics on velocity predictions
        rse, mae, mse, mape, rmse = calc_metrics_like_notebook(yhat_tensor, y_real_tensor)
        # Calculate summary statistics
        error_stats = {
            # Distance-based metrics (web app specific)
            'mean_error_km': float(results_df["error_km"].mean()),
            'median_error_km': float(results_df["error_km"].median()),
            'std_error_km': float(results_df["error_km"].std()),
            'min_error_km': float(results_df["error_km"].min()),
            'max_error_km': float(results_df["error_km"].max()),
            # Regression metrics (matching your notebook)
            'rse': rse,
            'mae': mae,
            'mse': mse,
            'mape': mape,
            'rmse': rmse,
            # General stats
            'total_predictions': len(results_df),
            'total_segments': len(results_df['segment'].unique()),
            'columns_generated': list(results_df.columns),
            'total_columns': len(results_df.columns)
        }
        # NEW: Create histogram of error distribution (30 bins by default)
        hist_counts, bin_edges = np.histogram(results_df["error_km"], bins=30)
        histogram_data = {
            'bins': bin_edges.tolist(),
            'counts': hist_counts.tolist()
        }
        update_progress('Complete', 100,
                        f'β Inference complete! Distance: {error_stats["mean_error_km"]:.2f} km | MAE: {error_stats["mae"]:.2f} | MAPE: {error_stats["mape"]:.2f}%')
        # Emit inference_complete with full statistics and histogram for the frontend chart
        try:
            socketio.emit('inference_complete', {
                'success': True,
                'stats': error_stats,
                'histogram': histogram_data
            })
        except Exception:
            pass  # In case we are in CLI context without SocketIO
        return {
            'success': True,
            'results_file': output_file.name,
            'stats': error_stats,
            'histogram': histogram_data,
            'message': f'Successfully processed {len(results_df)} predictions'
        }
    except Exception as e:
        # Top-level guard: report the failure through the progress channel and
        # return a structured error instead of raising to the web layer.
        error_msg = f"Error during inference: {str(e)}"
        update_progress('Error', 0, error_msg)
        return {
            'success': False,
            'error': error_msg
        }
| ######################################## | |
| # WEB ROUTES # | |
| ######################################## | |
def index():
    """Serve the main inference page.

    Falls back to a minimal inline upload form when the template file is
    absent from the deployment, so the app stays usable instead of 500ing.
    """
    try:
        return render_template('vessel_inference.html')
    except Exception as e:
        # Missing templates/ directory is common on fresh HF Space uploads;
        # serve a bare-bones page that still lets the user start inference.
        fallback_page = f"""
        <!DOCTYPE html>
        <html>
        <head><title>Vessel Inference - Template Missing</title></head>
        <body>
            <h1>π’ Vessel Trajectory Inference</h1>
            <p><strong>Error:</strong> Template file missing: {str(e)}</p>
            <p>Please upload the templates/vessel_inference.html file to your HF Space.</p>
            <form action="/upload" method="post" enctype="multipart/form-data">
                <p>Upload CSV for inference:</p>
                <input type="file" name="csv_file" accept=".csv" required>
                <br><br>
                <input type="submit" value="Start Inference">
            </form>
        </body>
        </html>
        """
        return fallback_page
def _save_upload_with_fallback(file_obj, label):
    """Persist an uploaded file, falling back to the OS temp directory.

    Tries the configured upload folder first; on PermissionError (common on
    read-only hosting such as Hugging Face Spaces) retries in the system
    temp directory.

    Args:
        file_obj: werkzeug FileStorage taken from request.files.
        label: short name used in messages ('model' or 'normalization').

    Returns:
        (path, None) on success, or (None, error_message) on failure.
    """
    filename = secure_filename(file_obj.filename)
    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    try:
        file_obj.save(path)
        return path, None
    except PermissionError:
        # Upload folder not writable; the OS temp directory always is.
        path = os.path.join(tempfile.gettempdir(), filename)
        try:
            file_obj.save(path)
        except Exception as e:
            return None, f'Failed to save {label} file: {str(e)}'
        print(f"β οΈ WARNING: Saved {label} to temp directory: {path}")
        return path, None
    except Exception as e:
        return None, f'Failed to save {label} file: {str(e)}'


def upload_file():
    """Handle the upload form and start inference in the background.

    Expected multipart form fields:
      * csv_file (required)           - test dataset CSV
      * model_file (optional)         - overrides the default best_model.pth
      * normalization_file (optional) - overrides the default JSON params

    Returns a JSON dict with 'success' and either 'message' or 'error'.
    Progress of the background run is reported via the global
    `current_progress` state and SocketIO events.
    """
    try:
        if 'csv_file' not in request.files:
            return jsonify({'success': False, 'error': 'No CSV file uploaded'})
        csv_file = request.files['csv_file']
        if csv_file.filename == '':
            return jsonify({'success': False, 'error': 'No CSV file selected'})

        # Defaults shipped with the Space; overridden by optional uploads below.
        model_path = 'best_model.pth'
        normalization_path = 'normalization_params_1_atlanttic_regular_intervals_with_lat_lon_velocity_and_time_difference_filter_outlier_segment_min_20_points.json'
        custom_model_uploaded = False
        custom_norm_uploaded = False

        if 'model_file' in request.files and request.files['model_file'].filename != '':
            model_path, err = _save_upload_with_fallback(request.files['model_file'], 'model')
            if err:
                return jsonify({'success': False, 'error': err})
            custom_model_uploaded = True

        if 'normalization_file' in request.files and request.files['normalization_file'].filename != '':
            normalization_path, err = _save_upload_with_fallback(request.files['normalization_file'], 'normalization')
            if err:
                return jsonify({'success': False, 'error': err})
            custom_norm_uploaded = True

        # Validate that both model artifacts are actually on disk before
        # launching the (long-running) pipeline.
        if not os.path.exists(model_path):
            if custom_model_uploaded:
                return jsonify({'success': False, 'error': f'Failed to save uploaded model file: {model_path}'})
            return jsonify({'success': False, 'error': f'Default model file not found: {model_path}. Please upload a model file or ensure best_model.pth exists in the root directory.'})
        if not os.path.exists(normalization_path):
            if custom_norm_uploaded:
                return jsonify({'success': False, 'error': f'Failed to save uploaded normalization file: {normalization_path}'})
            return jsonify({'success': False, 'error': f'Default normalization file not found: {normalization_path}. Please upload a normalization file or ensure the JSON file exists in the root directory.'})

        # Save the CSV, degrading gracefully on permission problems.
        csv_filename = secure_filename(csv_file.filename)
        csv_path = os.path.join(app.config['UPLOAD_FOLDER'], csv_filename)
        try:
            csv_file.save(csv_path)
        except PermissionError:
            try:
                # Loosen folder permissions and retry once.
                os.chmod(app.config['UPLOAD_FOLDER'], 0o777)
                csv_file.save(csv_path)
            except Exception:
                # Last resort: fall back to the system temp directory.
                csv_path = os.path.join(tempfile.gettempdir(), csv_filename)
                csv_file.save(csv_path)
                print(f"β οΈ WARNING: Saved CSV to temp directory due to permissions: {csv_path}")
        except Exception as e:
            return jsonify({'success': False, 'error': f'Failed to save CSV file: {str(e)}'})

        print(f"π DEBUG: Using model_path: {model_path}")
        print(f"π DEBUG: Using normalization_path: {normalization_path}")
        print(f"π DEBUG: Model exists: {os.path.exists(model_path)}")
        print(f"π DEBUG: Norm exists: {os.path.exists(normalization_path)}")

        # Run inference off the request thread so the response returns at once.
        threading.Thread(
            target=run_inference_pipeline,
            args=(csv_path, model_path, normalization_path),
        ).start()
        return jsonify({'success': True, 'message': 'Files uploaded successfully. Inference started.'})
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)})
def get_progress():
    """Return the global inference progress snapshot as JSON.

    Response shape mirrors the module-level `current_progress` dict:
    {'step': str, 'progress': int, 'details': str}.
    """
    return jsonify(current_progress)
def download_results():
    """Serve the most recently created inference results CSV.

    Scans the default results directory, the uploads folder, and the system
    temp directory for files named *_inference_results.csv, picks the one
    with the newest creation time, and sends it as an attachment. Responds
    with a 404 JSON error when no results file exists yet.
    """
    candidate_dirs = (
        'results/inference_atlantic',  # Default results directory
        app.config['UPLOAD_FOLDER'],   # Uploads directory
        tempfile.gettempdir(),         # System temp directory
    )
    newest_path = None
    newest_ctime = 0
    for folder in candidate_dirs:
        if not os.path.exists(folder):
            continue
        try:
            for entry in os.listdir(folder):
                if not entry.endswith('_inference_results.csv'):
                    continue
                full_path = os.path.join(folder, entry)
                created = os.path.getctime(full_path)
                if created > newest_ctime:
                    newest_ctime = created
                    newest_path = full_path
        except PermissionError:
            continue  # Skip directories we can't read
    if newest_path and os.path.exists(newest_path):
        return send_file(
            newest_path,
            as_attachment=True,
            download_name='vessel_inference_results.csv'
        )
    return jsonify({'error': 'No results file found. Please run inference first.'}), 404
| ######################################## | |
| # SOCKETIO EVENTS # | |
| ######################################## | |
def handle_connect():
    """SocketIO connect handler.

    Pushes the current progress snapshot to the newly connected client so
    its progress UI is in sync immediately, without waiting for the next
    pipeline update.
    """
    emit('progress_update', current_progress)
def handle_start_inference(data):
    """Handle inference request via WebSocket.

    Expects `data` with 'csv_path' and optional 'model_path' /
    'normalization_path' keys (defaults match the files shipped with the
    Space). Runs the pipeline in a background task and emits an
    'inference_complete' event with the result dict.
    """
    try:
        csv_path = data.get('csv_path')
        model_path = data.get('model_path', 'best_model.pth')
        norm_path = data.get('normalization_path', 'normalization_params_1_atlanttic_regular_intervals_with_lat_lon_velocity_and_time_difference_filter_outlier_segment_min_20_points.json')

        def run_inference_background():
            result = run_inference_pipeline(csv_path, model_path, norm_path)
            # BUGFIX: the request-bound `emit` is unavailable outside the
            # handler's context; background tasks must use the server-level
            # socketio.emit instead (per Flask-SocketIO docs).
            socketio.emit('inference_complete', result)

        # start_background_task picks the threading model matching the
        # server's async mode (threading/eventlet/gevent).
        socketio.start_background_task(run_inference_background)
    except Exception as e:
        emit('inference_complete', {'success': False, 'error': str(e)})
if __name__ == '__main__':
    # Startup banner, printed line by line (output identical to before).
    for banner_line in (
        "π’ Vessel Trajectory Inference Web App",
        "π Using Final_inference_maginet.py logic",
    ):
        print(banner_line)
    # Hugging Face Spaces exposes the service on $PORT (7860 by default).
    port = int(os.environ.get('PORT', 7860))
    print(f"π Starting server at http://0.0.0.0:{port}")
    for checklist_line in (
        "π Make sure you have:",
        " - best_model.pth",
        " - normalization_params_1_atlanttic_regular_intervals_...json",
        " - Your test dataset CSV",
    ):
        print(checklist_line)
    socketio.run(app, host='0.0.0.0', port=port, debug=False, allow_unsafe_werkzeug=True)