Olof Astrand committed on
Commit
c17a15c
·
1 Parent(s): 0eda6ef

Minor updates

Browse files
Files changed (3) hide show
  1. README.md +51 -2
  2. inference_claude.py +319 -0
  3. training_faces.py +349 -0
README.md CHANGED
@@ -13,8 +13,11 @@ Creating a dataset
13
  ==================
14
  collector.py
15
  collector.html
16
- When creating a dataset in the browser you will have to convert ti with
17
  convert.py
 
 
 
18
 
19
 
20
  Training from web based dataset
@@ -23,10 +26,56 @@ training.py
23
 
24
  Training from OpenCV created dataset
25
  ==============
 
 
26
  training_deepseek.py
27
 
28
 
29
  Inference
30
  ==========
31
  inference.py
32
- This does not work in a wsl environment
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  ==================
14
  collector.py
15
  collector.html
16
+ When creating a dataset in the browser you will have to convert it with
17
  convert.py
18
+ When creating dataset with collector you have to preprocess it with
19
+ preprocess.py
20
+
21
 
22
 
23
  Training from web based dataset
 
26
 
27
  Training from OpenCV created dataset
28
  ==============
29
+ training_faces.py
30
+
31
  training_deepseek.py
32
 
33
 
34
  Inference
35
  ==========
36
  inference.py
37
+ This does not work in a wsl environment as we cannot access the camera.
38
+
39
+
40
+ Mobilenet used
41
+ Model architecture:
42
+ Model: "functional"
43
+ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
44
+ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃
45
+ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
46
+ │ input_layer (InputLayer) │ (None, 60, 80, 3) │ 0 │
47
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
48
+ │ conv2d (Conv2D) │ (None, 60, 80, 32) │ 896 │
49
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
50
+ │ max_pooling2d (MaxPooling2D) │ (None, 30, 40, 32) │ 0 │
51
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
52
+ │ dropout (Dropout) │ (None, 30, 40, 32) │ 0 │
53
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
54
+ │ conv2d_1 (Conv2D) │ (None, 30, 40, 64) │ 18,496 │
55
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
56
+ │ max_pooling2d_1 (MaxPooling2D) │ (None, 15, 20, 64) │ 0 │
57
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
58
+ │ dropout_1 (Dropout) │ (None, 15, 20, 64) │ 0 │
59
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
60
+ │ conv2d_2 (Conv2D) │ (None, 15, 20, 128) │ 73,856 │
61
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
62
+ │ max_pooling2d_2 (MaxPooling2D) │ (None, 7, 10, 128) │ 0 │
63
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
64
+ │ dropout_2 (Dropout) │ (None, 7, 10, 128) │ 0 │
65
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
66
+ │ global_average_pooling2d │ (None, 128) │ 0 │
67
+ │ (GlobalAveragePooling2D) │ │ │
68
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
69
+ │ dense (Dense) │ (None, 128) │ 16,512 │
70
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
71
+ │ dropout_3 (Dropout) │ (None, 128) │ 0 │
72
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
73
+ │ dense_1 (Dense) │ (None, 64) │ 8,256 │
74
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
75
+ │ dropout_4 (Dropout) │ (None, 64) │ 0 │
76
+ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
77
+ │ dense_2 (Dense) │ (None, 2) │ 130 │
78
+ └──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
79
+ Total params: 118,146 (461.51 KB)
80
+ Trainable params: 118,146 (461.51 KB)
81
+
inference_claude.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow import keras
5
+ import tkinter as tk
6
+
7
+ class GazeInference:
8
+ def __init__(self, model_path):
9
+ # Load model
10
+ self.model = keras.models.load_model(model_path)
11
+
12
+ # Initialize face cascade classifier
13
+ self.face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
14
+
15
+ # Get actual screen resolution using tkinter
16
+ self._get_screen_resolution()
17
+
18
+ # Gaze smoothing
19
+ self.smoothing = True
20
+ self.smoothing_factor = 0.2
21
+ self.last_gaze = None
22
+
23
+ # Face visualization
24
+ self.show_face = False
25
+ self.face_roi = None
26
+ self.upper_face_roi = None
27
+
28
+ # Face detection parameters
29
+ self.min_face_size = (50, 50)
30
+ self.scale_factor = 1.1
31
+ self.min_neighbors = 5
32
+
33
+ # Adjustable crop parameters (can be modified with keys)
34
+ self.crop_top = 0.05 # Start at 25% from top of face
35
+ self.crop_bottom = 0.80 # End at 55% from top of face
36
+ self.crop_sides = 0.05 # Crop 15% from each side
37
+
38
+ def _get_screen_resolution(self):
39
+ """Get primary screen resolution using tkinter"""
40
+ root = tk.Tk()
41
+ self.screen_width = root.winfo_screenwidth()
42
+ self.screen_height = root.winfo_screenheight()
43
+ root.destroy()
44
+ print(f"Screen resolution: {self.screen_width}x{self.screen_height}")
45
+
46
+ def _extract_upper_face_region(self, frame):
47
+ """Extract upper half of face region from frame"""
48
+ gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
49
+
50
+ # Detect faces
51
+ faces = self.face_cascade.detectMultiScale(
52
+ gray,
53
+ scaleFactor=self.scale_factor,
54
+ minNeighbors=self.min_neighbors,
55
+ minSize=self.min_face_size
56
+ )
57
+
58
+ if len(faces) > 0:
59
+ # Get the largest face (assuming it's the closest/main face)
60
+ face = max(faces, key=lambda x: x[2] * x[3])
61
+ x, y, w, h = face
62
+
63
+ # Store full face for visualization
64
+ face_padding = 10
65
+ x_start = max(0, x - face_padding)
66
+ y_start = max(0, y - face_padding)
67
+ x_end = min(frame.shape[1], x + w + face_padding)
68
+ y_end = min(frame.shape[0], y + h + face_padding)
69
+ self.face_roi = frame[y_start:y_end, x_start:x_end].copy()
70
+
71
+ # Extract tight crop around eyes region to match training data
72
+ # Based on typical face proportions:
73
+ # - Eyes are typically at 40-50% down from top of face
74
+ # - Eye region height is about 20-30% of face height
75
+
76
+ # Use adjustable crop parameters
77
+ eye_region_start = int(h * self.crop_top)
78
+ eye_region_end = int(h * self.crop_bottom)
79
+
80
+ # For width, focus on central portion of face
81
+ width_crop = int(w * self.crop_sides)
82
+
83
+ # Calculate bounds for eye region
84
+ uf_x_start = max(0, x + width_crop)
85
+ uf_y_start = max(0, y + eye_region_start)
86
+ uf_x_end = min(frame.shape[1], x + w - width_crop)
87
+ uf_y_end = min(frame.shape[0], y + eye_region_end)
88
+
89
+ # Extract and resize upper face region
90
+ upper_face = frame[uf_y_start:uf_y_end, uf_x_start:uf_x_end]
91
+
92
+ # Resize to model input size (80x60)
93
+ upper_face_resized = cv2.resize(upper_face, (80, 60))
94
+ self.upper_face_roi = upper_face_resized.copy()
95
+
96
+ return upper_face_resized
97
+
98
+ return None
99
+
100
+ def _predict_gaze(self, upper_face_region):
101
+ """Predict gaze position from upper face region"""
102
+ # Preprocess
103
+ face_input = cv2.cvtColor(upper_face_region, cv2.COLOR_BGR2RGB)
104
+ face_input = face_input.astype('float32') / 255.0
105
+ face_input = np.expand_dims(face_input, axis=0)
106
+
107
+ # Predict
108
+ pred = self.model.predict(face_input, verbose=0)[0]
109
+
110
+ # Convert to screen coordinates
111
+ screen_x = int(pred[0] * self.screen_width)
112
+ screen_y = int(pred[1] * self.screen_height)
113
+
114
+ # Fix left-right inversion: flip X coordinate
115
+ # Since camera is mirrored, we need to invert the X prediction
116
+ screen_x = self.screen_width - screen_x
117
+
118
+ # Clamp to screen bounds
119
+ screen_x = max(0, min(self.screen_width - 1, screen_x))
120
+ screen_y = max(0, min(self.screen_height - 1, screen_y))
121
+
122
+ # Apply smoothing if enabled
123
+ if self.smoothing and self.last_gaze is not None:
124
+ screen_x = int(self.smoothing_factor * screen_x +
125
+ (1 - self.smoothing_factor) * self.last_gaze[0])
126
+ screen_y = int(self.smoothing_factor * screen_y +
127
+ (1 - self.smoothing_factor) * self.last_gaze[1])
128
+
129
+ self.last_gaze = (screen_x, screen_y)
130
+ return screen_x, screen_y
131
+
132
+ def _draw_gaze_cross(self, frame, x, y):
133
+ """Draw crosshair at gaze position"""
134
+ color = (0, 255, 0) # Green
135
+ size = 30
136
+ thickness = 3
137
+
138
+ # Horizontal line
139
+ cv2.line(frame, (x - size, y), (x + size, y), color, thickness)
140
+ # Vertical line
141
+ cv2.line(frame, (x, y - size), (x, y + size), color, thickness)
142
+ # Center circle
143
+ cv2.circle(frame, (x, y), 5, color, -1)
144
+
145
+ def _draw_face_roi(self, frame):
146
+ """Draw face region visualization in bottom left"""
147
+ if self.show_face and self.face_roi is not None and self.upper_face_roi is not None:
148
+ # Calculate display sizes
149
+ max_height = 200
150
+
151
+ # Display full face
152
+ face_h, face_w = self.face_roi.shape[:2]
153
+ display_h = min(max_height, self.screen_height // 4)
154
+ display_w = int(display_h * (face_w / face_h))
155
+
156
+ # Ensure we don't exceed screen dimensions
157
+ display_w = min(display_w, self.screen_width // 3)
158
+ display_h = min(display_h, self.screen_height // 3)
159
+
160
+ # Resize face for display
161
+ face_display = cv2.resize(self.face_roi, (display_w, display_h))
162
+
163
+ # Position for full face (bottom left)
164
+ face_y = self.screen_height - display_h - 10
165
+ face_x = 10
166
+
167
+ # Draw full face
168
+ try:
169
+ frame[face_y:face_y + display_h, face_x:face_x + display_w] = face_display
170
+ cv2.rectangle(frame, (face_x, face_y),
171
+ (face_x + display_w, face_y + display_h), (255, 255, 255), 2)
172
+ cv2.putText(frame, "Full Face", (face_x + 5, face_y - 5),
173
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
174
+ except:
175
+ pass
176
+
177
+ # Display upper face region
178
+ uf_h = int(display_h * 0.75) # 60:80 aspect ratio
179
+ uf_w = display_w
180
+ upper_face_display = cv2.resize(self.upper_face_roi, (uf_w, uf_h))
181
+
182
+ # Position for upper face (next to full face)
183
+ uf_x = face_x + display_w + 20
184
+ uf_y = self.screen_height - uf_h - 10
185
+
186
+ # Draw upper face
187
+ try:
188
+ frame[uf_y:uf_y + uf_h, uf_x:uf_x + uf_w] = upper_face_display
189
+ cv2.rectangle(frame, (uf_x, uf_y),
190
+ (uf_x + uf_w, uf_y + uf_h), (0, 255, 255), 2)
191
+ cv2.putText(frame, "Upper Face (Model Input)", (uf_x + 5, uf_y - 5),
192
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)
193
+ except:
194
+ pass
195
+
196
+ def run(self):
197
+ """Main inference loop"""
198
+ cap = cv2.VideoCapture(0)
199
+ if not cap.isOpened():
200
+ print("Error: Could not open webcam")
201
+ return
202
+
203
+ # Create fullscreen window
204
+ cv2.namedWindow('Gaze Prediction', cv2.WND_PROP_FULLSCREEN)
205
+ cv2.setWindowProperty('Gaze Prediction', cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
206
+
207
+ print("Controls:")
208
+ print("'s': Toggle Smoothing")
209
+ print("'f': Toggle Face View")
210
+ print("'q': Quit")
211
+ print("Crop Adjustment:")
212
+ print("'u'/'j': Move crop top up/down")
213
+ print("'i'/'k': Move crop bottom up/down")
214
+ print("'o'/'l': Decrease/increase side crop")
215
+ print(f"Using screen resolution: {self.screen_width}x{self.screen_height}")
216
+
217
+ while True:
218
+ ret, frame = cap.read()
219
+ if not ret:
220
+ break
221
+
222
+ # Create black canvas matching screen size
223
+ canvas = np.zeros((self.screen_height, self.screen_width, 3), dtype=np.uint8)
224
+
225
+ # Mirror the frame for more natural interaction
226
+ frame = cv2.flip(frame, 1)
227
+
228
+ # Process frame
229
+ upper_face = self._extract_upper_face_region(frame)
230
+
231
+ if upper_face is not None:
232
+ # Predict gaze
233
+ gaze_x, gaze_y = self._predict_gaze(upper_face)
234
+
235
+ # Draw gaze cross on canvas
236
+ self._draw_gaze_cross(canvas, gaze_x, gaze_y)
237
+
238
+ # Show coordinates for debugging
239
+ cv2.putText(canvas, f"Gaze: ({gaze_x}, {gaze_y})",
240
+ (20, self.screen_height - 30), cv2.FONT_HERSHEY_SIMPLEX,
241
+ 0.7, (255, 255, 0), 2)
242
+
243
+ # Draw face regions if enabled
244
+ if self.show_face:
245
+ self._draw_face_roi(canvas)
246
+ else:
247
+ # Show "no face detected" message
248
+ cv2.putText(canvas, "No face detected",
249
+ (20, self.screen_height - 30), cv2.FONT_HERSHEY_SIMPLEX,
250
+ 0.7, (0, 0, 255), 2)
251
+
252
+ # Show instructions
253
+ cv2.putText(canvas, "'s': Smoothing | 'f': Face View | 'u/j': Top | 'i/k': Bottom | 'o/l': Sides | 'q': Quit",
254
+ (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
255
+
256
+ # Show smoothing status
257
+ status = "ON" if self.smoothing else "OFF"
258
+ cv2.putText(canvas, f"Smoothing: {status}",
259
+ (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
260
+ (0, 255, 0) if self.smoothing else (0, 0, 255), 2)
261
+
262
+ # Show face view status
263
+ status = "ON" if self.show_face else "OFF"
264
+ cv2.putText(canvas, f"Face View: {status}",
265
+ (20, 110), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
266
+ (0, 255, 0) if self.show_face else (0, 0, 255), 2)
267
+
268
+ # Show screen resolution info and crop parameters
269
+ cv2.putText(canvas, f"Screen: {self.screen_width}x{self.screen_height}",
270
+ (20, 150), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
271
+ cv2.putText(canvas, f"Crop: Top={self.crop_top:.2f} Bottom={self.crop_bottom:.2f} Sides={self.crop_sides:.2f}",
272
+ (20, 190), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (200, 200, 200), 2)
273
+
274
+ # Display
275
+ cv2.imshow('Gaze Prediction', canvas)
276
+
277
+ # Handle key presses
278
+ key = cv2.waitKey(1) & 0xFF
279
+ if key == ord('q'):
280
+ break
281
+ elif key == ord('s'):
282
+ self.smoothing = not self.smoothing
283
+ print(f"Smoothing: {'ON' if self.smoothing else 'OFF'}")
284
+ elif key == ord('f'):
285
+ self.show_face = not self.show_face
286
+ print(f"Face View: {'ON' if self.show_face else 'OFF'}")
287
+ # Crop adjustment keys
288
+ elif key == ord('u'): # Move top up
289
+ self.crop_top = max(0.0, self.crop_top - 0.05)
290
+ print(f"Crop top: {self.crop_top:.2f}")
291
+ elif key == ord('j'): # Move top down
292
+ self.crop_top = min(self.crop_bottom - 0.1, self.crop_top + 0.05)
293
+ print(f"Crop top: {self.crop_top:.2f}")
294
+ elif key == ord('i'): # Move bottom up
295
+ self.crop_bottom = max(self.crop_top + 0.1, self.crop_bottom - 0.05)
296
+ print(f"Crop bottom: {self.crop_bottom:.2f}")
297
+ elif key == ord('k'): # Move bottom down
298
+ self.crop_bottom = min(1.0, self.crop_bottom + 0.05)
299
+ print(f"Crop bottom: {self.crop_bottom:.2f}")
300
+ elif key == ord('o'): # Decrease side crop
301
+ self.crop_sides = max(0.0, self.crop_sides - 0.05)
302
+ print(f"Crop sides: {self.crop_sides:.2f}")
303
+ elif key == ord('l'): # Increase side crop
304
+ self.crop_sides = min(0.4, self.crop_sides + 0.05)
305
+ print(f"Crop sides: {self.crop_sides:.2f}")
306
+
307
+ cap.release()
308
+ cv2.destroyAllWindows()
309
+
310
if __name__ == "__main__":
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--model', type=str, required=True,
        help='Path to trained gaze estimation model')
    cli_args = arg_parser.parse_args()

    # Build the estimator from the supplied model and start the loop.
    GazeInference(cli_args.model).run()
training_faces.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ from tensorflow import keras
3
+ from tensorflow.keras import layers
4
+ import numpy as np
5
+ import json
6
+ import cv2
7
+ from pathlib import Path
8
+ from sklearn.model_selection import train_test_split
9
+ import matplotlib.pyplot as plt
10
+
11
# Enable on-demand GPU memory allocation so TensorFlow does not reserve
# all VRAM up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Raised when devices were already initialized; report and continue.
    print(e)
19
+
20
class ImprovedGazeModel:
    """Compact CNN that regresses normalized (x, y) gaze coordinates."""

    def __init__(self, input_shape=(60, 80, 3)):
        # (height, width, channels) of the face crops fed to the network
        self.input_shape = input_shape
        self.model = None

    def build_simple_model(self):
        """Build a simpler, more effective model."""
        inputs = keras.Input(shape=self.input_shape)

        net = inputs
        # Three convolutional stages: Conv -> BatchNorm -> ReLU -> 2x2 pool
        for filters, kernel in ((16, (5, 5)), (32, (3, 3)), (64, (3, 3))):
            net = layers.Conv2D(filters, kernel, padding='same')(net)
            net = layers.BatchNormalization()(net)
            net = layers.Activation('relu')(net)
            net = layers.MaxPooling2D((2, 2))(net)

        # Regression head: two dense stages with dropout regularization
        net = layers.Flatten()(net)
        for units in (128, 64):
            net = layers.Dense(units)(net)
            net = layers.BatchNormalization()(net)
            net = layers.Activation('relu')(net)
            net = layers.Dropout(0.3)(net)

        # Linear output (no activation): normalized (x, y) coordinates
        outputs = layers.Dense(2)(net)

        self.model = keras.Model(inputs, outputs, name='gaze_model')
        return self.model

    def compile_model(self, learning_rate=0.0001):
        """Compile with better optimizer settings."""
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
            loss='mse',
            metrics=['mae'],
        )
72
+
73
def load_preprocessed_data(data_dir):
    """Load the preprocessed face dataset.

    Reads ``metadata.json`` from *data_dir*, loads each referenced image
    from the ``images`` subdirectory, converts BGR -> RGB and scales pixels
    to [0, 1], and normalizes gaze targets to [0, 1] by the recorded
    screen size.

    Returns:
        Tuple ``(images, gaze_coords, screen_width, screen_height)`` where
        ``images`` is a float32 array of shape (N, H, W, 3) and
        ``gaze_coords`` is (N, 2).

    Raises:
        RuntimeError: if no image could be loaded (previously this crashed
            later with an opaque IndexError on ``images[0]``).
    """
    data_dir = Path(data_dir)

    # Load metadata describing the capture session
    with open(data_dir / 'metadata.json', 'r') as f:
        metadata = json.load(f)

    screen_width = metadata['screen_width']
    screen_height = metadata['screen_height']
    data_points = metadata['data_points']

    print(f"Loading {len(data_points)} data points...")
    print(f"Screen dimensions: {screen_width}x{screen_height}")

    # Load images and gaze coordinates
    images = []
    gaze_coords = []
    skipped = 0

    print("Loading images...")
    for i, point in enumerate(data_points):
        if i % 500 == 0:
            print(f"Progress: {i}/{len(data_points)}")

        img_path = data_dir / 'images' / point['image']
        img = cv2.imread(str(img_path)) if img_path.exists() else None
        if img is None:
            # Missing or unreadable file: keep going, but count it so a
            # partially broken dataset is visible instead of silent.
            skipped += 1
            continue

        # Convert to RGB and normalize
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        images.append(img.astype(np.float32) / 255.0)

        # Normalize gaze coordinates to [0, 1]
        norm_x = point['screen_x'] / screen_width
        norm_y = point['screen_y'] / screen_height
        gaze_coords.append([norm_x, norm_y])

    if not images:
        raise RuntimeError(
            f"No images could be loaded from {data_dir / 'images'}")
    if skipped:
        print(f"WARNING: skipped {skipped} missing/unreadable images")

    images = np.array(images)
    gaze_coords = np.array(gaze_coords)

    print(f"\nSuccessfully loaded {len(images)} images")
    print(f"Image shape: {images[0].shape}")
    print(f"Gaze X range: [{gaze_coords[:, 0].min():.3f}, {gaze_coords[:, 0].max():.3f}]")
    print(f"Gaze Y range: [{gaze_coords[:, 1].min():.3f}, {gaze_coords[:, 1].max():.3f}]")

    # Flag gaze targets outside the screen and clip them to [0, 1]
    x_outliers = np.sum((gaze_coords[:, 0] < 0) | (gaze_coords[:, 0] > 1))
    y_outliers = np.sum((gaze_coords[:, 1] < 0) | (gaze_coords[:, 1] > 1))
    if x_outliers > 0 or y_outliers > 0:
        print(f"WARNING: Found {x_outliers} X outliers and {y_outliers} Y outliers")
        # Clip to valid range
        gaze_coords = np.clip(gaze_coords, 0, 1)

    return images, gaze_coords, screen_width, screen_height
128
+
129
def create_augmented_generator(X, y, batch_size=32, augment=True):
    """Endless batch generator with optional photometric augmentation.

    Reshuffles the sample order once per pass over the data, then yields
    ``(batch_X, batch_y)`` copies so augmentation never mutates the
    source arrays.
    """
    order = np.arange(len(X))

    while True:
        # Fresh random ordering for every epoch
        np.random.shuffle(order)

        for lo in range(0, len(X), batch_size):
            picked = order[lo:lo + batch_size]
            xb = X[picked].copy()
            yb = y[picked].copy()

            if augment:
                for k in range(len(xb)):
                    sample = xb[k]
                    # Each augmentation fires independently with p = 0.5.
                    if np.random.random() > 0.5:
                        # Brightness scaling
                        sample = np.clip(sample * np.random.uniform(0.7, 1.3), 0, 1)
                    if np.random.random() > 0.5:
                        # Contrast stretch around the sample mean
                        gain = np.random.uniform(0.8, 1.2)
                        mu = sample.mean()
                        sample = np.clip((sample - mu) * gain + mu, 0, 1)
                    if np.random.random() > 0.5:
                        # Small additive Gaussian noise
                        sample = np.clip(
                            sample + np.random.normal(0, 0.01, sample.shape), 0, 1)
                    xb[k] = sample

            yield xb, yb
163
+
164
def visualize_data_distribution(gaze_coords, screen_width, screen_height, save_path='gaze_distribution.png'):
    """Save a two-panel figure of the gaze-point distribution to *save_path*."""
    plt.figure(figsize=(12, 6))

    # Back to pixel space so the plot matches the physical screen
    xs_px = gaze_coords[:, 0] * screen_width
    ys_px = gaze_coords[:, 1] * screen_height

    # Left panel: 2D density heatmap
    plt.subplot(1, 2, 1)
    plt.hist2d(xs_px, ys_px, bins=50, cmap='hot')
    plt.colorbar(label='Count')
    plt.xlabel('X (pixels)')
    plt.ylabel('Y (pixels)')
    plt.title('Gaze Point Distribution')
    # Screen coordinates grow downward, so flip the Y axis
    plt.gca().invert_yaxis()

    # Right panel: marginal 1D histograms
    plt.subplot(1, 2, 2)
    plt.hist(xs_px, bins=50, alpha=0.5, label='X distribution', density=True)
    plt.hist(ys_px, bins=50, alpha=0.5, label='Y distribution', density=True)
    plt.xlabel('Position (pixels)')
    plt.ylabel('Density')
    plt.title('X and Y Distributions')
    plt.legend()

    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
193
+
194
def _split_data(images, gaze_coords):
    """Split into train/val/test ≈ 70/15/15 with a fixed seed."""
    X_temp, X_test, y_temp, y_test = train_test_split(
        images, gaze_coords, test_size=0.15, random_state=42, shuffle=True
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.176, random_state=42, shuffle=True  # 0.176 ≈ 0.15/(1-0.15)
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


def _build_callbacks():
    """Checkpointing, early stopping, LR schedule, and CSV logging."""
    return [
        keras.callbacks.ModelCheckpoint(
            'best_gaze_model_improved.keras',
            monitor='val_loss',
            save_best_only=True,
            verbose=1
        ),
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True,
            verbose=1
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            min_lr=1e-7,
            verbose=1
        ),
        keras.callbacks.CSVLogger('training_log.csv')
    ]


def _evaluate(net, X_test, y_test, batch_size, screen_width, screen_height):
    """Evaluate on the test set, print pixel-space error statistics.

    Returns (test_loss, test_mae, euclidean_errors).
    """
    print("\nEvaluating on test set...")
    test_loss, test_mae = net.evaluate(X_test, y_test, batch_size=batch_size)

    predictions = net.predict(X_test, batch_size=batch_size)

    # Denormalize so errors are interpretable in pixels
    pred_pixels = predictions * np.array([screen_width, screen_height])
    actual_pixels = y_test * np.array([screen_width, screen_height])

    pixel_errors = np.abs(pred_pixels - actual_pixels)
    euclidean_errors = np.sqrt(np.sum((pred_pixels - actual_pixels)**2, axis=1))

    print(f"\nTest Results:")
    print(f"Loss: {test_loss:.6f}")
    print(f"MAE (normalized): {test_mae:.6f}")
    print(f"Mean X error: {pixel_errors[:, 0].mean():.1f} pixels")
    print(f"Mean Y error: {pixel_errors[:, 1].mean():.1f} pixels")
    print(f"Mean Euclidean error: {euclidean_errors.mean():.1f} pixels")
    print(f"Median Euclidean error: {np.median(euclidean_errors):.1f} pixels")
    print(f"95th percentile error: {np.percentile(euclidean_errors, 95):.1f} pixels")

    return test_loss, test_mae, euclidean_errors


def _plot_history(history):
    """Plot loss/MAE curves to improved_training_history.png."""
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Model Loss')
    plt.legend()
    plt.yscale('log')

    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'], label='Training MAE')
    plt.plot(history.history['val_mae'], label='Validation MAE')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.title('Model MAE')
    plt.legend()

    plt.tight_layout()
    plt.savefig('improved_training_history.png')
    plt.close()


def _save_config(model, screen_width, screen_height, test_loss, test_mae, euclidean_errors):
    """Write model/evaluation metadata to model_config_improved.json."""
    config = {
        'model_path': 'best_gaze_model_improved.keras',
        'input_shape': list(model.input_shape),
        'screen_width': int(screen_width),
        'screen_height': int(screen_height),
        'test_loss': float(test_loss),
        'test_mae': float(test_mae),
        'mean_euclidean_error': float(euclidean_errors.mean()),
        # NOTE(review): these crop fractions differ from the runtime
        # defaults in inference_claude.py (0.05/0.80/0.05) — confirm which
        # preprocessing the dataset actually used.
        'preprocessing': {
            'crop_top': 0.25,
            'crop_bottom': 0.55,
            'crop_sides': 0.15
        }
    }

    with open('model_config_improved.json', 'w') as f:
        json.dump(config, f, indent=2)


def main():
    """CLI entry point: load data, train, evaluate, and save artifacts."""
    import argparse

    parser = argparse.ArgumentParser(description='Train improved gaze model')
    parser.add_argument('--data', type=str, default='gaze_data_faces',
                        help='Preprocessed face dataset directory')
    parser.add_argument('--epochs', type=int, default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Batch size for training')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='Learning rate')

    args = parser.parse_args()

    # Load data and visualize its distribution for sanity checking
    images, gaze_coords, screen_width, screen_height = load_preprocessed_data(args.data)
    visualize_data_distribution(gaze_coords, screen_width, screen_height)

    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test = _split_data(images, gaze_coords)

    print(f"\nDataset splits:")
    print(f"Training: {len(X_train)} samples")
    print(f"Validation: {len(X_val)} samples")
    print(f"Test: {len(X_test)} samples")

    # Build and compile model
    model = ImprovedGazeModel(input_shape=X_train.shape[1:])
    model.build_simple_model()
    model.compile_model(learning_rate=args.lr)

    print("\nModel architecture:")
    model.model.summary()

    # Generators: augment only the training stream
    train_gen = create_augmented_generator(X_train, y_train, args.batch_size, augment=True)
    val_gen = create_augmented_generator(X_val, y_val, args.batch_size, augment=False)

    # Guard against datasets smaller than one batch (Keras rejects 0 steps)
    steps_per_epoch = max(1, len(X_train) // args.batch_size)
    validation_steps = max(1, len(X_val) // args.batch_size)

    # Train
    print("\nStarting training...")
    history = model.model.fit(
        train_gen,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_gen,
        validation_steps=validation_steps,
        epochs=args.epochs,
        callbacks=_build_callbacks(),
        verbose=1
    )

    # Evaluate on test set and report pixel errors
    test_loss, test_mae, euclidean_errors = _evaluate(
        model.model, X_test, y_test, args.batch_size, screen_width, screen_height)

    # Plot training history and persist the run configuration
    _plot_history(history)
    _save_config(model, screen_width, screen_height, test_loss, test_mae, euclidean_errors)

    print(f"\nModel saved to: best_gaze_model_improved.keras")
    print(f"Config saved to: model_config_improved.json")


if __name__ == "__main__":
    main()