Spaces:

OnsAouedi
/

Maginet_inference

Sleeping

App Files Files Community

Maginet_inference / app.py

OnsAouedi

Upload app.py

795f111 verified 9 months ago

raw

history blame contribute delete

44.9 kB

	#!/usr/bin/env python3

	import os
	import json
	import time
	import numpy as np
	import pandas as pd
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.data import TensorDataset, DataLoader
	from tqdm import tqdm
	from flask import Flask, render_template, request, jsonify, send_file
	from flask_socketio import SocketIO, emit
	import tempfile
	import threading
	from pathlib import Path
	from werkzeug.utils import secure_filename

	# Flask application and Socket.IO server shared by every route and by the
	# inference pipeline's progress reporting.
	app = Flask(__name__)
	# NOTE(review): the fallback value is a placeholder. Set SECRET_KEY in the
	# environment for any real deployment; behaviour is unchanged when it is unset.
	app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key-here')
	app.config['UPLOAD_FOLDER'] = 'uploads'
	app.config['MAX_CONTENT_LENGTH'] = 500 * 1024 * 1024  # 500MB max file size
	# NOTE(review): "*" accepts Socket.IO connections from any origin.
	socketio = SocketIO(app, cors_allowed_origins="*")

	# Ensure upload directory exists with proper permissions
	os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
	# Fix permissions for Hugging Face Spaces
	try:
	    os.chmod(app.config['UPLOAD_FOLDER'], 0o755)
	except OSError:
	    # Best effort: we may not be allowed to change permissions. Only
	    # filesystem errors are ignored so that programming errors still surface.
	    pass

	# Try to create results directory at startup
	try:
	    os.makedirs('results/inference_atlantic', exist_ok=True)
	    os.chmod('results/inference_atlantic', 0o755)
	except OSError:
	    print("⚠️ WARNING: Could not create results directory - will use temp directory for results")

	# Global variables for progress tracking (replaced wholesale by update_progress)
	current_progress = {'step': 'idle', 'progress': 0, 'details': ''}

	########################################
	# MODEL DEFINITION #
	########################################

	class LSTMWithAttentionWithResid(nn.Module):
	    """Stacked LSTM with a residual connection and additive attention pooling.

	    The input is linearly embedded, run through the LSTM stack, added back to
	    the embedding, layer-normalised, pooled over time with learned attention
	    weights and finally projected to ``forecast_horizon`` two-component outputs.

	    Args:
	        in_dim: Number of features per time step.
	        hidden_dim: Width of the embedding, the LSTM and the attention layers.
	        forecast_horizon: Number of future steps predicted per sequence.
	        n_layers: Number of stacked LSTM layers.
	        dropout: Dropout passed to ``nn.LSTM`` (applied between its layers).
	    """

	    def __init__(self, in_dim, hidden_dim, forecast_horizon, n_layers=10, dropout=0.2):
	        super().__init__()
	        self.hidden_dim = hidden_dim
	        self.forecast_horizon = forecast_horizon

	        # Submodule names and creation order are part of the checkpoint
	        # contract (state_dict keys), so they must stay as they are.
	        self.embedding = nn.Linear(in_dim, hidden_dim)
	        self.lstm = nn.LSTM(
	            input_size=hidden_dim,
	            hidden_size=hidden_dim,
	            num_layers=n_layers,
	            dropout=dropout,
	            batch_first=True,
	        )
	        # Normalises the sum of the LSTM output and the embedding.
	        self.layer_norm = nn.LayerNorm(hidden_dim)
	        # Attention: a tanh-activated projection followed by a bias-free scorer.
	        self.attention = nn.Linear(hidden_dim, hidden_dim)
	        self.context_vector = nn.Linear(hidden_dim, 1, bias=False)
	        # Maps the pooled context to two values per forecast step.
	        self.fc = nn.Linear(hidden_dim, forecast_horizon * 2)

	    def forward(self, x):
	        """Return predictions of shape [batch, forecast_horizon, 2].

	        Args:
	            x: Tensor of shape [batch, seq_len, in_dim].
	        """
	        embedded = self.embedding(x)                       # [batch, seq_len, hidden]
	        recurrent, _ = self.lstm(embedded)                 # [batch, seq_len, hidden]

	        # Residual connection (out-of-place) followed by layer normalisation.
	        states = self.layer_norm(recurrent + embedded)     # [batch, seq_len, hidden]

	        # One scalar score per time step, normalised over the time axis.
	        scores = self.context_vector(torch.tanh(self.attention(states))).squeeze(-1)
	        weights = torch.softmax(scores, dim=1)             # [batch, seq_len]

	        # Attention-weighted sum of the states.
	        pooled = torch.bmm(weights.unsqueeze(1), states).squeeze(1)  # [batch, hidden]

	        projected = self.fc(pooled)                        # [batch, horizon * 2]
	        return projected.view(-1, self.forecast_horizon, 2)

	########################################
	# UTILITY FUNCTIONS #
	########################################

	def update_progress(step, progress, details=""):
	    """Store the latest pipeline state and broadcast it as 'progress_update'.

	    Args:
	        step: Name of the current pipeline stage.
	        progress: Progress value for that stage (the callers pass 0-100).
	        details: Optional free-text detail message.
	    """
	    global current_progress
	    snapshot = dict(step=step, progress=progress, details=details)
	    current_progress = snapshot
	    socketio.emit('progress_update', snapshot)

	def create_sequences_grouped_by_segment_lat_long_veloc(df_scaled, seq_len=12, forecast_horizon=1, features_to_scale=None):
	    """Build overlapping input windows and targets for every segment.

	    Within each segment (rows grouped by the 'segment' column, in their
	    existing order) a window of ``seq_len`` consecutive rows is paired with
	    the row ``forecast_horizon`` steps after the window's last row. The
	    target row's 'time_scalar' is appended to every time step of the window
	    as one extra feature column.

	    Args:
	        df_scaled: DataFrame holding the feature columns.
	        seq_len: Number of rows per input window.
	        forecast_horizon: How many rows after the window the target lies.
	        features_to_scale: Columns copied into each window, in order.
	            Defaults to the seven-column layout with 'segment' first, which
	            the caller strips before feeding the model.

	    Returns:
	        Tuple ``(Xs, ys, segments, last_positions)``:
	        - Xs: float32 windows, each seq_len x (len(features_to_scale) + 1),
	        - ys: float32 target latitude/longitude velocities,
	        - segments: segment ID of each window,
	        - last_positions: float32 latitude/longitude of each window's last row.
	        Segments shorter than ``seq_len + forecast_horizon`` contribute
	        nothing; if no window is produced the arrays are empty 1-D arrays.

	    Raises:
	        ValueError: if any requested feature column is missing.
	    """
	    update_progress('Creating sequences', 10, f'Processing {len(df_scaled)} data points...')

	    if features_to_scale is None:
	        # CRITICAL: Match the inference logic (segment first, then removed)
	        features_to_scale = [
	            "segment",                # Index 0 - removed before the model
	            "latitude_velocity_km",   # Index 1 -> 0 after segment removal
	            "longitude_velocity_km",  # Index 2 -> 1 after segment removal
	            "latitude_degrees",       # Index 3 -> 2 after segment removal
	            "longitude_degrees",      # Index 4 -> 3 after segment removal
	            "time_difference_hours",  # Index 5 -> 4 after segment removal
	            "time_scalar",            # Index 6 -> 5 after segment removal
	        ]

	    # Verify all required features exist
	    missing_features = [f for f in features_to_scale if f not in df_scaled.columns]
	    if missing_features:
	        raise ValueError(f"Missing required features: {missing_features}")

	    Xs, ys, segments, last_positions = [], [], [], []

	    grouped = df_scaled.groupby('segment')
	    total_segments = len(grouped)
	    report_every = max(1, total_segments // 20)
	    # Offset from a window's first row to its target row.
	    target_offset = seq_len + forecast_horizon - 1

	    for i, (segment_id, group) in enumerate(grouped):
	        # Progress update (roughly 20 updates over the 10-40% range)
	        if i % report_every == 0:
	            progress = 10 + (i / total_segments) * 30
	            update_progress('Creating sequences', progress,
	                            f'Processing segment {i+1}/{total_segments}')

	        n_windows = len(group) - seq_len - forecast_horizon + 1
	        if n_windows <= 0:
	            continue  # segment too short for even one window

	        # Convert to NumPy once per segment; slicing arrays inside the loop
	        # yields the same values as per-window DataFrame slicing, without
	        # the pandas overhead on every window.
	        feature_rows = group[features_to_scale].to_numpy()
	        time_scalars = group['time_scalar'].to_numpy()
	        velocities = group[['latitude_velocity_km', 'longitude_velocity_km']].to_numpy()
	        positions = group[['latitude_degrees', 'longitude_degrees']].to_numpy()

	        for j in range(n_windows):
	            target_idx = j + target_offset

	            # Augment the window with the target row's time scalar.
	            future_time_feature = np.full((seq_len, 1), time_scalars[target_idx])
	            Xs.append(np.hstack((feature_rows[j:j + seq_len], future_time_feature)))

	            # Target: future velocity
	            ys.append(velocities[target_idx])
	            segments.append(segment_id)

	            # Last known position (final row of the window)
	            last_positions.append(positions[j + seq_len - 1])

	    return (np.array(Xs, dtype=np.float32),
	            np.array(ys, dtype=np.float32),
	            np.array(segments),
	            np.array(last_positions, dtype=np.float32))

	def load_normalization_params(json_path):
	    """Read a normalization JSON file and return ``(feature_mins, feature_maxs)``.

	    Raises:
	        KeyError: if either the "feature_mins" or "feature_maxs" key is absent.
	    """
	    with open(json_path) as handle:
	        params = json.load(handle)
	    mins = params["feature_mins"]
	    maxs = params["feature_maxs"]
	    return mins, maxs

	def minmax_denormalize(scaled_series, feature_min, feature_max):
	    """Invert min-max scaling: map values from [0, 1] back to [feature_min, feature_max]."""
	    span = feature_max - feature_min
	    return scaled_series * span + feature_min

	########################################
	# INFERENCE PIPELINE #
	########################################

	def _read_trajectory_csv(csv_file_path):
	    """Load the input CSV into a DataFrame, trying several separators.

	    The separator is guessed from the first line (';' if present, else ',').
	    If that attempt raises, ',' and then ';' are tried explicitly.

	    Raises:
	        ValueError: if every attempt fails; the message lists all three errors.
	    """
	    try:
	        # Determine separator by reading the first line
	        with open(csv_file_path, 'r') as f:
	            first_line = f.readline()
	        separator = ';' if ';' in first_line else ','

	        # Try reading with detected separator
	        df = pd.read_csv(csv_file_path, sep=separator, on_bad_lines='skip')
	        update_progress('Loading data', 8, f'Loaded {len(df)} rows with separator "{separator}"')

	        # Debug: Print actual column names
	        print(f"🔍 CSV COLUMNS FOUND: {list(df.columns)}")
	        update_progress('Loading data', 8.5, f'Columns: {list(df.columns)}')

	    except Exception as e:
	        print(f"❌ CSV PARSING ERROR: {e}")
	        # Try alternative parsing methods
	        try:
	            df = pd.read_csv(csv_file_path, sep=',', on_bad_lines='skip')
	            update_progress('Loading data', 8, f'Loaded {len(df)} rows with comma separator (fallback)')
	            print(f"🔍 CSV COLUMNS FOUND (fallback): {list(df.columns)}")
	        except Exception as e2:
	            try:
	                df = pd.read_csv(csv_file_path, sep=';', on_bad_lines='skip')
	                update_progress('Loading data', 8, f'Loaded {len(df)} rows with semicolon separator (fallback)')
	                print(f"🔍 CSV COLUMNS FOUND (fallback): {list(df.columns)}")
	            except Exception as e3:
	                raise ValueError(f"Could not parse CSV file. Tried multiple separators. Errors: {e}, {e2}, {e3}")

	    return df


	def _ensure_time_scalar(df):
	    """Add a 'time_scalar' column to ``df`` in place when it is missing.

	    Sources are tried in this order: a 'datetime' column (days since
	    2023-01-01), a 'time_decimal' column (copied as-is), and finally the row
	    position divided by the row count.
	    """
	    if 'time_scalar' in df.columns:
	        return

	    if 'datetime' in df.columns:
	        # Convert datetime to time_scalar (preferred method)
	        df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
	        reference_date = pd.Timestamp('2023-01-01')
	        df['time_scalar'] = ((df['datetime'] - reference_date) / pd.Timedelta(days=1)).round(8)
	        update_progress('Loading data', 9, 'Created time_scalar from datetime column')
	    elif 'time_decimal' in df.columns:
	        # Use time_decimal directly as time_scalar (alternative method)
	        df['time_scalar'] = df['time_decimal'].copy()
	        update_progress('Loading data', 9, 'Created time_scalar from time_decimal column')
	    elif all(col in df.columns for col in ['day', 'month', 'time_decimal']):
	        # NOTE(review): unreachable as ordered - any frame with 'time_decimal'
	        # is already handled by the branch above. Kept unchanged because
	        # reordering would alter the time_scalar values fed to the model;
	        # confirm which variant matches training before moving it.
	        df['year'] = df.get('year', 2024)  # Default year if not present
	        df['datetime'] = pd.to_datetime(df[['year', 'month', 'day']], errors='coerce')
	        df['datetime'] += pd.to_timedelta(df['time_decimal'], unit='h')
	        reference_date = pd.Timestamp('2023-01-01')
	        df['time_scalar'] = ((df['datetime'] - reference_date) / pd.Timedelta(days=1)).round(8)
	        update_progress('Loading data', 9, 'Created time_scalar from day/month/time_decimal')
	    else:
	        # Create a simple sequential time_scalar based on row order
	        df['time_scalar'] = df.index / len(df)
	        update_progress('Loading data', 9, 'Created sequential time_scalar')


	def _apply_quality_filters(df):
	    """Return the rows of ``df`` that pass the speed, velocity, segment-length and NaN/inf filters.

	    A 'speed_km_h' column is added to ``df`` when absent.

	    Raises:
	        ValueError: if no rows survive the filters.
	    """
	    update_progress('Filtering data', 10, 'Applying quality filters...')
	    original_count = len(df)

	    # 1. Calculate speed column if missing: magnitude of the velocity vector
	    if 'speed_km_h' not in df.columns:
	        df['speed_km_h'] = np.sqrt(df['latitude_velocity_km'] ** 2 + df['longitude_velocity_km'] ** 2)
	        update_progress('Filtering data', 10.5, 'Calculated speed_km_h column')

	    # 2. Speed filtering
	    df = df[(df['speed_km_h'] >= 2) & (df['speed_km_h'] <= 60)].copy()
	    update_progress('Filtering data', 11, f'Speed filter: {original_count} -> {len(df)} rows')

	    # 3. Velocity filtering
	    velocity_mask = (
	        (np.abs(df['latitude_velocity_km']) <= 100) &
	        (np.abs(df['longitude_velocity_km']) <= 100) &
	        (df['time_difference_hours'] > 0) &
	        (df['time_difference_hours'] <= 24)  # Max 24 hours between points
	    )
	    df = df[velocity_mask].copy()
	    update_progress('Filtering data', 12, f'Velocity filter: -> {len(df)} rows')

	    # 4. Segment length filtering - remove segments with < 20 points
	    segment_counts = df['segment'].value_counts()
	    segments_to_remove = segment_counts[segment_counts < 20].index
	    before_segment_filter = len(df)
	    df = df[~df['segment'].isin(segments_to_remove)].copy()
	    update_progress('Filtering data', 13, f'Segment filter: {before_segment_filter} -> {len(df)} rows')

	    # 5. Remove NaN and infinite values
	    df = df.dropna().copy()
	    numeric_cols = ['latitude_velocity_km', 'longitude_velocity_km', 'time_difference_hours']
	    for col in numeric_cols:
	        if col in df.columns:
	            df = df[~np.isinf(df[col])].copy()

	    # Filtering statistics. Guard the percentage so an empty input reaches
	    # the explicit "No data remaining" error below instead of dividing by zero.
	    filtered_count = len(df)
	    removed_count = original_count - filtered_count
	    filter_percent = (removed_count / original_count) * 100 if original_count else 0.0
	    update_progress('Filtering data', 14, f'Final filtered data: {filtered_count} rows ({removed_count} removed = {filter_percent:.1f}%)')

	    print(f"🔍 FILTERING SUMMARY:")
	    print(f" Original: {original_count:,} rows")
	    print(f" Final: {filtered_count:,} rows")
	    print(f" Removed: {removed_count:,} ({filter_percent:.1f}%)")

	    if len(df) == 0:
	        raise ValueError("No data remaining after quality filtering. Check your input data quality.")

	    return df


	def run_inference_pipeline(csv_file_path, model_path, normalization_path):
	    """Run the complete inference pipeline on one CSV file.

	    Steps: load and validate the CSV, filter and min-max normalise it, build
	    sequences, run the LSTM model, denormalise the predictions, compute
	    displacement errors and regression metrics, and save the results as CSV.
	    Progress is reported through ``update_progress`` throughout.

	    Args:
	        csv_file_path: Path of the input CSV.
	        model_path: Path of the model state dict loaded with ``torch.load``.
	        normalization_path: Path of the JSON file with feature mins/maxs.

	    Returns:
	        On success, a dict with 'success' (True), 'results_file' (path of a
	        temporary CSV), 'stats', 'histogram' and 'message'. On any failure, a
	        dict with 'success' (False) and 'error'; exceptions are not propagated.
	    """

	    try:
	        # Step 1: Load and validate data
	        update_progress('Loading data', 5, 'Reading CSV file...')
	        df = _read_trajectory_csv(csv_file_path)

	        # Create time_scalar when the inference dataset does not carry it
	        _ensure_time_scalar(df)

	        # Validate required columns with detailed error reporting
	        required_columns = [
	            'segment', 'latitude_velocity_km', 'longitude_velocity_km',
	            'latitude_degrees', 'longitude_degrees', 'time_difference_hours', 'time_scalar'
	        ]

	        print(f"🔍 REQUIRED COLUMNS: {required_columns}")
	        print(f"🔍 ACTUAL COLUMNS: {list(df.columns)}")

	        missing_columns = [col for col in required_columns if col not in df.columns]
	        if missing_columns:
	            available_cols = list(df.columns)
	            error_msg = f"""
	❌ COLUMN VALIDATION ERROR:
	Missing required columns: {missing_columns}
	Available columns: {available_cols}

	Column mapping suggestions:
	- Check for extra spaces or different naming
	- Verify CSV file format and encoding
	- Ensure time_scalar column exists or can be created
	"""
	            print(error_msg)
	            raise ValueError(f"Missing required columns: {missing_columns}. Available: {available_cols}")

	        # Apply the same data filtering as training/notebook
	        df = _apply_quality_filters(df)

	        # Step 2: Load normalization parameters
	        update_progress('Loading normalization', 12, 'Loading normalization parameters...')
	        feature_mins, feature_maxs = load_normalization_params(normalization_path)

	        # Step 2.5: Normalize the test data with the stored mins/maxs
	        update_progress('Normalizing data', 15, 'Applying normalization to test data...')
	        features_to_normalize = ['latitude_velocity_km', 'longitude_velocity_km',
	                                 'latitude_degrees', 'longitude_degrees',
	                                 'time_difference_hours', 'time_scalar']

	        for feature in features_to_normalize:
	            if feature in df.columns and feature in feature_mins:
	                min_val = feature_mins[feature]
	                max_val = feature_maxs[feature]
	                # A constant feature would give a zero range; divide by 1 instead.
	                rng = max_val - min_val if max_val != min_val else 1
	                df[feature] = (df[feature] - min_val) / rng
	                update_progress('Normalizing data', 18, f'Normalized {feature}')

	        # Step 3: Create sequences
	        SEQ_LENGTH = 12
	        FORECAST_HORIZON = 1

	        X_test, y_test, test_segments, last_known_positions_scaled = create_sequences_grouped_by_segment_lat_long_veloc(
	            df, seq_len=SEQ_LENGTH, forecast_horizon=FORECAST_HORIZON
	        )

	        update_progress('Preparing model', 45, f'Created {len(X_test)} sequences')

	        if len(X_test) == 0:
	            raise ValueError("No valid sequences could be created. Check your data and sequence length requirements.")

	        # Step 4: Prepare data for model
	        device = 'cuda' if torch.cuda.is_available() else 'cpu'
	        X_test_tensor = torch.from_numpy(X_test).float().to(device)
	        y_test_tensor = torch.from_numpy(y_test).float().to(device)
	        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
	        test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

	        # Step 5: Load model
	        update_progress('Loading model', 50, 'Loading trained model...')

	        # The sequences carry the segment column plus the appended future
	        # time; the model sees everything except the segment column.
	        in_dim = X_test.shape[2] - 1
	        # Architecture must match the stored weights
	        hidden_dim = 250  # From best_model.pth
	        n_layers = 7      # From best_model.pth (not the class default of 10)
	        dropout = 0.2

	        model = LSTMWithAttentionWithResid(
	            in_dim, hidden_dim, FORECAST_HORIZON,
	            n_layers=n_layers, dropout=dropout
	        ).to(device)

	        # NOTE(review): model_path may point at a user-uploaded file and
	        # torch.load unpickles it; consider weights_only=True if the
	        # installed torch version supports it.
	        model.load_state_dict(torch.load(model_path, map_location=device))
	        model.eval()

	        # Step 6: Run inference
	        update_progress('Running inference', 60, 'Making predictions...')

	        all_preds = []
	        segments_extracted = []
	        time_scalars_extracted = []
	        time_diff_hours_extracted = []

	        with torch.no_grad():
	            for i, batch in enumerate(test_loader):
	                x_batch, _ = batch

	                # Side-channel features read straight from the input windows
	                segment_batch = x_batch[:, 0, 0].cpu().numpy()          # segment from first time step
	                time_scalar_batch = x_batch[:, -1, 6].cpu().numpy()     # LAST timestep, index 6 = time_scalar
	                time_diff_hours_batch = x_batch[:, 0, 5].cpu().numpy()  # first timestep, index 5

	                segments_extracted.extend(segment_batch)
	                time_scalars_extracted.extend(time_scalar_batch)
	                time_diff_hours_extracted.extend(time_diff_hours_batch)

	                # Remove segment column (index 0) before model input
	                x_batch_no_segment = x_batch[:, :, 1:]
	                preds = model(x_batch_no_segment)
	                all_preds.append(preds.cpu().numpy())

	                # Progress update over the 60-80% range
	                progress = 60 + (i / len(test_loader)) * 20
	                update_progress('Running inference', progress,
	                                f'Processing batch {i+1}/{len(test_loader)}')

	        all_preds = np.concatenate(all_preds, axis=0)

	        # Step 7: Process results
	        update_progress('Processing results', 80, 'Processing predictions...')

	        # Flatten predictions to rows of (lat velocity, lon velocity)
	        yhat = torch.from_numpy(all_preds)
	        yhat = yhat.view(-1, 2)

	        predicted_lat_vel = yhat[:, 0].numpy()
	        predicted_lon_vel = yhat[:, 1].numpy()

	        y_real = y_test_tensor.cpu()
	        actual_lat_vel = y_real[:, 0].numpy()
	        actual_lon_vel = y_real[:, 1].numpy()

	        # Ensure all arrays have consistent length
	        num_samples = len(predicted_lat_vel)
	        segments_extracted = segments_extracted[:num_samples]
	        time_scalars_extracted = time_scalars_extracted[:num_samples]
	        time_diff_hours_extracted = time_diff_hours_extracted[:num_samples]
	        last_known_positions_scaled = last_known_positions_scaled[:num_samples]

	        results_df = pd.DataFrame({
	            'segment': segments_extracted,                       # From batch extraction
	            'time_difference_hours': time_diff_hours_extracted,  # From batch extraction (first timestep)
	            'Time Scalar': time_scalars_extracted,               # From batch extraction (LAST timestep)
	            'Last Known Latitude': [pos[0] for pos in last_known_positions_scaled],
	            'Last Known Longitude': [pos[1] for pos in last_known_positions_scaled],
	            'predicted_lat_km': predicted_lat_vel,
	            'predicted_lon_km': predicted_lon_vel,
	            'actual_lat_km': actual_lat_vel,
	            'actual_lon_km': actual_lon_vel
	        })

	        # Step 8: Denormalize results
	        update_progress('Denormalizing results', 85, 'Converting to real units...')

	        # Result column -> normalization feature whose min/max applies to it
	        column_to_feature = {
	            "predicted_lat_km": "latitude_velocity_km",
	            "predicted_lon_km": "longitude_velocity_km",
	            "actual_lat_km": "latitude_velocity_km",
	            "actual_lon_km": "longitude_velocity_km",
	            "Last Known Latitude": "latitude_degrees",
	            "Last Known Longitude": "longitude_degrees",
	            "time_difference_hours": "time_difference_hours",
	            "Time Scalar": "time_scalar"
	        }

	        for col, feat in column_to_feature.items():
	            if col in results_df.columns and feat in feature_mins:
	                fmin = feature_mins[feat]
	                fmax = feature_maxs[feat]
	                results_df[col + "_unscaled"] = minmax_denormalize(results_df[col], fmin, fmax)
	                update_progress('Denormalizing results', 85, f'Denormalized {col}')

	        # Ensure all required _unscaled columns exist
	        required_unscaled_cols = [
	            'predicted_lat_km_unscaled', 'predicted_lon_km_unscaled',
	            'actual_lat_km_unscaled', 'actual_lon_km_unscaled',
	            'Last Known Latitude_unscaled', 'Last Known Longitude_unscaled',
	            'time_difference_hours_unscaled'
	        ]

	        for col in required_unscaled_cols:
	            if col not in results_df.columns:
	                base_col = col.replace('_unscaled', '')
	                if base_col in results_df.columns:
	                    # Base column exists but had no normalization entry: copy it
	                    results_df[col] = results_df[base_col]
	                    update_progress('Denormalizing results', 87, f'Created missing {col}')
	                else:
	                    results_df[col] = 0.0
	                    update_progress('Denormalizing results', 87, f'Defaulted missing {col} to 0')

	        # Clip predicted velocities to the same limits used during input filtering
	        VELOCITY_RANGE_KM_H = (-100, 100)
	        results_df["predicted_lat_km_unscaled"] = results_df["predicted_lat_km_unscaled"].clip(*VELOCITY_RANGE_KM_H)
	        results_df["predicted_lon_km_unscaled"] = results_df["predicted_lon_km_unscaled"].clip(*VELOCITY_RANGE_KM_H)
	        update_progress('Denormalizing results', 88, 'Clipped predicted velocities to realistic range')

	        # Step 9: Calculate final positions and errors
	        update_progress('Calculating errors', 90, 'Computing prediction errors...')

	        # Displacement components (km) = velocity * elapsed hours
	        results_df["pred_final_lat_km_component"] = (
	            results_df["predicted_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        )
	        results_df["pred_final_lon_km_component"] = (
	            results_df["predicted_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        )
	        results_df["actual_final_lat_km_component"] = (
	            results_df["actual_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        )
	        results_df["actual_final_lon_km_component"] = (
	            results_df["actual_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        )

	        # Total displacement magnitudes
	        results_df["pred_final_km"] = np.sqrt(
	            results_df["pred_final_lat_km_component"] ** 2 + results_df["pred_final_lon_km_component"] ** 2
	        )
	        results_df["actual_final_km"] = np.sqrt(
	            results_df["actual_final_lat_km_component"] ** 2 + results_df["actual_final_lon_km_component"] ** 2
	        )

	        # Euclidean distance error (in km)
	        results_df["error_km"] = np.sqrt(
	            (results_df["pred_final_lat_km_component"] - results_df["actual_final_lat_km_component"]) ** 2 +
	            (results_df["pred_final_lon_km_component"] - results_df["actual_final_lon_km_component"]) ** 2
	        )

	        # Final positions in degrees
	        km_per_deg_lat = 111  # approximate conversion for latitude
	        results_df["pred_final_lat_deg"] = results_df["Last Known Latitude_unscaled"] + (
	            results_df["predicted_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        ) / km_per_deg_lat
	        results_df["actual_final_lat_deg"] = results_df["Last Known Latitude_unscaled"] + (
	            results_df["actual_lat_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        ) / km_per_deg_lat

	        # Account for longitude scaling by latitude
	        results_df["Last_Known_Lat_rad"] = np.deg2rad(results_df["Last Known Latitude_unscaled"])
	        results_df["pred_final_lon_deg"] = results_df["Last Known Longitude_unscaled"] + (
	            results_df["predicted_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        ) / (km_per_deg_lat * np.cos(results_df["Last_Known_Lat_rad"]))
	        results_df["actual_final_lon_deg"] = results_df["Last Known Longitude_unscaled"] + (
	            results_df["actual_lon_km_unscaled"] * results_df["time_difference_hours_unscaled"]
	        ) / (km_per_deg_lat * np.cos(results_df["Last_Known_Lat_rad"]))

	        # Step 10: Reorder columns to the fixed output layout
	        update_progress('Finalizing results', 93, 'Reordering columns to match notebook format...')

	        column_order = [
	            'segment', 'time_difference_hours', 'Time Scalar', 'Last Known Latitude', 'Last Known Longitude',
	            'predicted_lat_km', 'predicted_lon_km', 'actual_lat_km', 'actual_lon_km',
	            'predicted_lat_km_unscaled', 'predicted_lon_km_unscaled', 'actual_lat_km_unscaled', 'actual_lon_km_unscaled',
	            'Last Known Latitude_unscaled', 'Last Known Longitude_unscaled', 'time_difference_hours_unscaled',
	            'pred_final_km', 'actual_final_km',
	            'pred_final_lat_km_component', 'pred_final_lon_km_component',
	            'actual_final_lat_km_component', 'actual_final_lon_km_component',
	            'error_km', 'pred_final_lat_deg', 'actual_final_lat_deg', 'Last_Known_Lat_rad',
	            'pred_final_lon_deg', 'actual_final_lon_deg'
	        ]

	        # Add any column still missing, with a default, so the reorder cannot fail
	        missing_columns = [col for col in column_order if col not in results_df.columns]
	        if missing_columns:
	            update_progress('Finalizing results', 94, f'Adding missing columns: {missing_columns}')
	            for col in missing_columns:
	                if '_unscaled' in col:
	                    # For unscaled columns, try to derive from the scaled column
	                    base_col = col.replace('_unscaled', '')
	                    if base_col in results_df.columns and base_col in column_to_feature:
	                        feat = column_to_feature[base_col]
	                        if feat in feature_mins:
	                            fmin = feature_mins[feat]
	                            fmax = feature_maxs[feat]
	                            results_df[col] = minmax_denormalize(results_df[base_col], fmin, fmax)
	                        else:
	                            results_df[col] = results_df[base_col]  # No denormalization available
	                    else:
	                        results_df[col] = 0.0  # Default to 0
	                else:
	                    results_df[col] = 0.0  # Default to 0 for any other missing columns

	        results_df = results_df[column_order]

	        # Step 11: Save results
	        update_progress('Saving results', 95, 'Saving inference results...')

	        # Persist a timestamped copy; failures here are non-fatal because the
	        # temporary file written below is what gets returned to the caller.
	        # The module-level ``tempfile`` import is used on purpose: importing
	        # it again inside this function would make the name local and break
	        # the NamedTemporaryFile call below whenever this branch is skipped.
	        try:
	            results_dir = Path('results/inference_atlantic')
	            results_dir.mkdir(parents=True, exist_ok=True)

	            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
	            results_file = results_dir / f'inference_results_{timestamp}.csv'
	            results_df.to_csv(results_file, index=False)
	            print(f"✅ Results saved to: {results_file}")
	        except PermissionError:
	            # Fallback to temp directory if results directory has permission issues
	            temp_dir = Path(tempfile.gettempdir())
	            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
	            results_file = temp_dir / f'inference_results_{timestamp}.csv'
	            results_df.to_csv(results_file, index=False)
	            print(f"⚠️ WARNING: Results saved to temp directory due to permissions: {results_file}")
	        except Exception as e:
	            print(f"❌ ERROR: Could not save results file: {str(e)}")
	            # Continue anyway - we still have the temp file below

	        # Also save to a temporary file whose path is returned to the caller
	        output_file = tempfile.NamedTemporaryFile(
	            mode='w', suffix='_inference_results.csv', delete=False
	        )
	        # Only the reserved path is needed; close the handle so it is not
	        # leaked and so to_csv can reopen the file by name on every platform.
	        output_file.close()
	        results_df.to_csv(output_file.name, index=False)

	        # Regression metrics on the (still normalised) velocity predictions
	        yhat_tensor = torch.from_numpy(np.column_stack([predicted_lat_vel, predicted_lon_vel])).float()
	        y_real_tensor = torch.from_numpy(np.column_stack([actual_lat_vel, actual_lon_vel])).float()

	        def calc_metrics_like_notebook(preds, labels):
	            """Return (rse, mae, mse, mape, rmse) as Python floats."""
	            EPS = 1e-8  # keeps the MAPE/RSE denominators away from zero
	            mse = torch.mean((preds - labels) ** 2)
	            mae = torch.mean(torch.abs(preds - labels))
	            rmse = torch.sqrt(mse)
	            mape = torch.mean(torch.abs((preds - labels) / (labels + EPS))) * 100  # Convert to percentage
	            rse = torch.sum((preds - labels) ** 2) / torch.sum((labels + EPS) ** 2)
	            return rse.item(), mae.item(), mse.item(), mape.item(), rmse.item()

	        rse, mae, mse, mape, rmse = calc_metrics_like_notebook(yhat_tensor, y_real_tensor)

	        # Summary statistics
	        error_stats = {
	            # Distance-based metrics (web app specific)
	            'mean_error_km': float(results_df["error_km"].mean()),
	            'median_error_km': float(results_df["error_km"].median()),
	            'std_error_km': float(results_df["error_km"].std()),
	            'min_error_km': float(results_df["error_km"].min()),
	            'max_error_km': float(results_df["error_km"].max()),

	            # Regression metrics
	            'rse': rse,
	            'mae': mae,
	            'mse': mse,
	            'mape': mape,
	            'rmse': rmse,

	            # General stats
	            'total_predictions': len(results_df),
	            'total_segments': len(results_df['segment'].unique()),
	            'columns_generated': list(results_df.columns),
	            'total_columns': len(results_df.columns)
	        }

	        # Histogram of the error distribution (30 bins)
	        hist_counts, bin_edges = np.histogram(results_df["error_km"], bins=30)
	        histogram_data = {
	            'bins': bin_edges.tolist(),
	            'counts': hist_counts.tolist()
	        }

	        update_progress('Complete', 100,
	                        f'✅ Inference complete! Distance: {error_stats["mean_error_km"]:.2f} km | MAE: {error_stats["mae"]:.2f} | MAPE: {error_stats["mape"]:.2f}%')

	        # Emit inference_complete with full statistics and histogram for the frontend chart
	        try:
	            socketio.emit('inference_complete', {
	                'success': True,
	                'stats': error_stats,
	                'histogram': histogram_data
	            })
	        except Exception:
	            pass  # Best effort: e.g. a CLI context without SocketIO

	        return {
	            'success': True,
	            'results_file': output_file.name,
	            'stats': error_stats,
	            'histogram': histogram_data,
	            'message': f'Successfully processed {len(results_df)} predictions'
	        }

	    except Exception as e:
	        # Top-level boundary: report the failure to the UI and the caller
	        # instead of propagating it.
	        error_msg = f"Error during inference: {str(e)}"
	        update_progress('Error', 0, error_msg)
	        return {
	            'success': False,
	            'error': error_msg
	        }

	########################################
	# WEB ROUTES #
	########################################

	@app.route('/')
	def index():
	    """Serve the inference page, or a minimal upload form if rendering the template fails."""
	    try:
	        page = render_template('vessel_inference.html')
	    except Exception as exc:
	        # If template not found, return a simple HTML page
	        reason = str(exc)
	        return f"""
	<!DOCTYPE html>
	<html>
	<head><title>Vessel Inference - Template Missing</title></head>
	<body>
	<h1>🚢 Vessel Trajectory Inference</h1>
	<p><strong>Error:</strong> Template file missing: {reason}</p>
	<p>Please upload the templates/vessel_inference.html file to your HF Space.</p>
	<form action="/upload" method="post" enctype="multipart/form-data">
	<p>Upload CSV for inference:</p>
	<input type="file" name="csv_file" accept=".csv" required>
	<br><br>
	<input type="submit" value="Start Inference">
	</form>
	</body>
	</html>
	"""
	    return page

	def _save_upload(file_storage, retry_with_chmod=False):
	    """Persist an uploaded file and return ``(path, used_temp_dir)``.

	    The file is first written into ``app.config['UPLOAD_FOLDER']`` under its
	    sanitized name. If that raises ``PermissionError`` the system temp
	    directory is used instead; with ``retry_with_chmod`` the upload folder
	    is first made writable and the save is retried once before falling back.

	    Args:
	        file_storage: Uploaded file object exposing ``filename`` and
	            ``save(path)`` (an entry of ``request.files``).
	        retry_with_chmod: Try ``os.chmod(upload_dir, 0o777)`` plus a second
	            save before resorting to the temp directory.

	    Returns:
	        Tuple ``(saved_path, used_temp_dir)``.

	    Raises:
	        ValueError: The filename is empty after ``secure_filename``.
	        Exception: Whatever ``save`` raises when no location is writable.
	    """
	    filename = secure_filename(file_storage.filename)
	    if not filename:
	        # An empty name would make os.path.join() return the directory itself.
	        raise ValueError('filename is empty after sanitization')

	    upload_dir = app.config['UPLOAD_FOLDER']
	    primary_path = os.path.join(upload_dir, filename)
	    try:
	        file_storage.save(primary_path)
	        return primary_path, False
	    except PermissionError:
	        pass  # fall through to the recovery strategies below

	    if retry_with_chmod:
	        try:
	            os.chmod(upload_dir, 0o777)
	            file_storage.save(primary_path)
	            return primary_path, False
	        except OSError:
	            pass  # still not writable: use the temp directory

	    fallback_path = os.path.join(tempfile.gettempdir(), filename)
	    file_storage.save(fallback_path)
	    return fallback_path, True


	@app.route('/upload', methods=['POST'])
	def upload_file():
	    """Accept an inference upload and start the pipeline in a thread.

	    Expects a multipart form with a required ``csv_file`` and optional
	    ``model_file`` / ``normalization_file`` entries; when the optional
	    entries are absent the default model and normalization paths are used.

	    Returns:
	        A JSON response with ``success`` and either ``message`` or ``error``.
	        Every failure is reported through the JSON body; no exception
	        escapes this handler.
	    """
	    try:
	        # Check if files were uploaded
	        if 'csv_file' not in request.files:
	            return jsonify({'success': False, 'error': 'No CSV file uploaded'})

	        csv_file = request.files['csv_file']
	        if not csv_file.filename:
	            return jsonify({'success': False, 'error': 'No CSV file selected'})

	        # Default model and normalization files
	        model_path = 'best_model.pth'
	        normalization_path = 'normalization_params_1_atlanttic_regular_intervals_with_lat_lon_velocity_and_time_difference_filter_outlier_segment_min_20_points.json'

	        # Handle optional file uploads
	        custom_model_uploaded = False
	        custom_norm_uploaded = False

	        model_file = request.files.get('model_file')
	        if model_file is not None and model_file.filename:
	            try:
	                model_path, used_temp = _save_upload(model_file)
	            except Exception as e:
	                return jsonify({'success': False, 'error': f'Failed to save model file: {str(e)}'})
	            custom_model_uploaded = True
	            if used_temp:
	                print(f"⚠️ WARNING: Saved model to temp directory: {model_path}")

	        norm_file = request.files.get('normalization_file')
	        if norm_file is not None and norm_file.filename:
	            try:
	                normalization_path, used_temp = _save_upload(norm_file)
	            except Exception as e:
	                return jsonify({'success': False, 'error': f'Failed to save normalization file: {str(e)}'})
	            custom_norm_uploaded = True
	            if used_temp:
	                print(f"⚠️ WARNING: Saved normalization to temp directory: {normalization_path}")

	        # Validate model file exists
	        if not os.path.exists(model_path):
	            if custom_model_uploaded:
	                return jsonify({'success': False, 'error': f'Failed to save uploaded model file: {model_path}'})
	            return jsonify({'success': False, 'error': f'Default model file not found: {model_path}. Please upload a model file or ensure best_model.pth exists in the root directory.'})

	        # Validate normalization file exists
	        if not os.path.exists(normalization_path):
	            if custom_norm_uploaded:
	                return jsonify({'success': False, 'error': f'Failed to save uploaded normalization file: {normalization_path}'})
	            return jsonify({'success': False, 'error': f'Default normalization file not found: {normalization_path}. Please upload a normalization file or ensure the JSON file exists in the root directory.'})

	        # Save CSV file; only this upload retries after loosening permissions
	        try:
	            csv_path, used_temp = _save_upload(csv_file, retry_with_chmod=True)
	        except Exception as e:
	            return jsonify({'success': False, 'error': f'Failed to save CSV file: {str(e)}'})
	        if used_temp:
	            print(f"⚠️ WARNING: Saved CSV to temp directory due to permissions: {csv_path}")

	        # Debug logging
	        print(f"🔍 DEBUG: Using model_path: {model_path}")
	        print(f"🔍 DEBUG: Using normalization_path: {normalization_path}")
	        print(f"🔍 DEBUG: Model exists: {os.path.exists(model_path)}")
	        print(f"🔍 DEBUG: Norm exists: {os.path.exists(normalization_path)}")

	        # NOTE(review): model_path may point at a client-uploaded file. If the
	        # pipeline deserializes it with pickle-based loading, that executes
	        # untrusted data - confirm how run_inference_pipeline loads the model.
	        # Start inference in background thread so the request returns at once
	        thread = threading.Thread(
	            target=run_inference_pipeline,
	            args=(csv_path, model_path, normalization_path),
	        )
	        thread.start()

	        return jsonify({'success': True, 'message': 'Files uploaded successfully. Inference started.'})

	    except Exception as e:
	        # Top-level boundary: report any unexpected failure as JSON
	        return jsonify({'success': False, 'error': str(e)})

	@app.route('/progress')
	def get_progress():
	    """Return the module-level ``current_progress`` dict as a JSON response."""
	    progress_state = current_progress
	    return jsonify(progress_state)

	@app.route('/download_results')
	def download_results():
	    """Send the most recently modified ``*_inference_results.csv`` file.

	    Looks in the default results directory, the upload folder and the
	    system temp directory, in that order.

	    Returns:
	        The newest matching file as an attachment named
	        ``vessel_inference_results.csv``, or a JSON error with status 404
	        when no matching file is found.
	    """
	    search_directories = [
	        'results/inference_atlantic',  # Default results directory
	        app.config['UPLOAD_FOLDER'],  # Uploads directory
	        tempfile.gettempdir(),  # System temp directory
	    ]

	    latest_file = None
	    latest_time = 0

	    for directory in search_directories:
	        try:
	            names = os.listdir(directory)
	        except OSError:
	            continue  # Missing, unreadable or not a directory: skip it

	        for name in names:
	            if not name.endswith('_inference_results.csv'):
	                continue
	            file_path = os.path.join(directory, name)
	            try:
	                # Modification time gives a portable "most recent" order;
	                # ctime means different things on Unix and Windows.
	                file_time = os.path.getmtime(file_path)
	            except OSError:
	                continue  # File vanished or is unreadable since listdir()
	            if file_time > latest_time:
	                latest_time = file_time
	                latest_file = file_path

	    if latest_file is not None and os.path.isfile(latest_file):
	        return send_file(
	            latest_file,
	            as_attachment=True,
	            download_name='vessel_inference_results.csv'
	        )

	    return jsonify({'error': 'No results file found. Please run inference first.'}), 404

	########################################
	# SOCKETIO EVENTS #
	########################################

	@socketio.on('connect')
	def handle_connect():
	    """Emit a ``progress_update`` event carrying ``current_progress`` on connect."""
	    event_name = 'progress_update'
	    emit(event_name, current_progress)

	@socketio.on('start_inference')
	def handle_start_inference(data):
	    """Handle inference request via WebSocket.

	    Starts ``run_inference_pipeline`` in a background thread and sends its
	    result to the requesting client as an ``inference_complete`` event.

	    Args:
	        data: Event payload; expected to be a dict with ``csv_path`` and
	            optional ``model_path`` / ``normalization_path`` keys.
	    """
	    try:
	        if not isinstance(data, dict) or not data.get('csv_path'):
	            emit('inference_complete', {'success': False, 'error': 'csv_path is required'})
	            return

	        # NOTE(review): these paths come straight from the client and are
	        # passed to the pipeline unchecked - confirm whether they should be
	        # restricted to the upload/results directories.
	        csv_path = data['csv_path']
	        model_path = data.get('model_path', 'best_model.pth')
	        norm_path = data.get('normalization_path', 'normalization_params_1_atlanttic_regular_intervals_with_lat_lon_velocity_and_time_difference_filter_outlier_segment_min_20_points.json')

	        # Capture the session id while the event context is still available;
	        # the worker thread runs without it.
	        sid = request.sid

	        def run_inference_background():
	            try:
	                result = run_inference_pipeline(csv_path, model_path, norm_path)
	            except Exception as e:
	                # Report pipeline crashes so the client is not left waiting
	                result = {'success': False, 'error': str(e)}
	            # The context-bound emit() cannot be used from this thread, so
	            # address the client explicitly through the server object.
	            socketio.emit('inference_complete', result, to=sid)

	        thread = threading.Thread(target=run_inference_background)
	        thread.start()

	    except Exception as e:
	        emit('inference_complete', {'success': False, 'error': str(e)})

	if __name__ == '__main__':
	    # Startup banner
	    for banner_line in (
	        "🚢 Vessel Trajectory Inference Web App",
	        "📊 Using Final_inference_maginet.py logic",
	    ):
	        print(banner_line)

	    # Port comes from the PORT environment variable, defaulting to 7860
	    # (the port Hugging Face Spaces uses)
	    port = int(os.environ.get('PORT', 7860))
	    print(f"🌐 Starting server at http://0.0.0.0:{port}")

	    # Reminder of the files the app expects to find
	    for checklist_line in (
	        "📝 Make sure you have:",
	        " - best_model.pth",
	        " - normalization_params_1_atlanttic_regular_intervals_...json",
	        " - Your test dataset CSV",
	    ):
	        print(checklist_line)

	    bind_host = '0.0.0.0'
	    socketio.run(
	        app,
	        host=bind_host,
	        port=port,
	        debug=False,
	        allow_unsafe_werkzeug=True,
	    )