In [None]:
!pip install --upgrade tensorflow
!pip install --upgrade keras


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import cv2
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers

# Define the model architecture.
class _TransformerEncoderBlock(layers.Layer):
    """One pre-norm transformer encoder block: self-attention, then an MLP.

    Both sub-layers are wrapped in a residual connection, so the output has
    the same shape as the input.

    Args:
        embed_dim: Size of each token vector.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        mlp_dim: Width of the hidden layer of the feed-forward MLP.
        dropout: Dropout rate used in the attention and MLP branches.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout, **kwargs):
        super().__init__(**kwargs)
        self.attention_norm = layers.LayerNormalization(epsilon=1e-6)
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim // num_heads,
            dropout=dropout,
        )
        self.attention_dropout = layers.Dropout(dropout)
        self.mlp_norm = layers.LayerNormalization(epsilon=1e-6)
        self.mlp_hidden = layers.Dense(mlp_dim, activation="gelu")
        self.mlp_output = layers.Dense(embed_dim)
        self.mlp_dropout = layers.Dropout(dropout)

    def call(self, inputs, training=False):
        """Apply self-attention and the MLP, each with a residual connection."""
        normed = self.attention_norm(inputs)
        # Query and value are the same tensor: this is self-attention.
        attended = self.attention(normed, normed, training=training)
        tokens = inputs + self.attention_dropout(attended, training=training)

        hidden = self.mlp_hidden(self.mlp_norm(tokens))
        projected = self.mlp_output(hidden)
        return tokens + self.mlp_dropout(projected, training=training)


class VisionTransformer(keras.Model):
    """Vision Transformer image classifier.

    The image is resized to a fixed square, cut into (overlapping) patches
    that are linearly projected to token vectors, enriched with a learned
    position embedding, passed through a stack of transformer encoder
    blocks, averaged over all patches and classified with a softmax layer.

    Args:
        num_classes: Number of output classes.
        image_size: Side length (pixels) every input image is resized to.
            The default of 32 together with a stride of 2 gives
            16 x 16 = 256 patches.
            NOTE(review): 32 was picked to match the 256-entry embedding
            table this model was originally declared with; confirm against
            the data the checkpoint was trained on.
        patch_size: Side length (pixels) of each square patch.
        patch_stride: Step (pixels) between neighbouring patches.
        embed_dim: Size of each patch token vector.
        num_heads: Number of attention heads per encoder block.
        num_layers: Number of encoder blocks.
        mlp_dim: Hidden width of the encoder MLPs; defaults to
            ``4 * embed_dim``.
        dropout: Dropout rate used throughout the encoder.

    Raises:
        ValueError: If a size argument is not positive or ``embed_dim`` is
            not divisible by ``num_heads``.
    """

    def __init__(
        self,
        num_classes,
        image_size=32,
        patch_size=7,
        patch_stride=2,
        embed_dim=512,
        num_heads=8,
        num_layers=12,
        mlp_dim=None,
        dropout=0.1,
    ):
        super().__init__()

        if min(image_size, patch_size, patch_stride, num_layers) <= 0:
            raise ValueError(
                "image_size, patch_size, patch_stride and num_layers must be "
                "positive."
            )
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})."
            )
        if mlp_dim is None:
            mlp_dim = 4 * embed_dim

        # With "same" padding the number of patches per side is
        # ceil(image_size / patch_stride).
        patches_per_side = -(-image_size // patch_stride)
        self.num_patches = patches_per_side * patches_per_side

        # The position embedding needs a fixed patch count, so every input
        # is brought to the same spatial size first.
        self.resize = layers.Resizing(image_size, image_size)

        # A strided convolution extracts each patch and applies one shared
        # linear projection to it, turning every patch into a token vector.
        self.embedding = layers.Conv2D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_stride,
            padding="same",
        )
        self.flatten_patches = layers.Reshape((self.num_patches, embed_dim))

        # Learned lookup table with one vector per patch position.
        self.position_embedding = layers.Embedding(
            input_dim=self.num_patches,
            output_dim=embed_dim,
        )
        self.embedding_dropout = layers.Dropout(dropout)

        # The transformer encoder consists of a stack of self-attention blocks.
        self.transformer_encoder = [
            _TransformerEncoderBlock(embed_dim, num_heads, mlp_dim, dropout)
            for _ in range(num_layers)
        ]
        self.final_norm = layers.LayerNormalization(epsilon=1e-6)

        # Averaging over patches yields one vector (and thus one prediction)
        # per image instead of one per patch.
        self.pool = layers.GlobalAveragePooling1D()

        # The classification layer outputs the class probabilities per image.
        self.classification = layers.Dense(num_classes, activation="softmax")

    def call(self, inputs, training=False):
        """Classify a batch of images.

        Args:
            inputs: Batch of images with layout (batch, height, width,
                channels). Any numeric dtype is accepted; values are cast to
                the model's compute dtype and are not rescaled.
            training: Whether dropout is active.

        Returns:
            Tensor of shape (batch, num_classes) holding class probabilities.
        """
        # Camera frames arrive as uint8; the layers below need floats.
        images = self.resize(tf.cast(inputs, self.compute_dtype))

        # Convert the image patches into a sequence of token vectors.
        tokens = self.flatten_patches(self.embedding(images))
        positions = tf.range(self.num_patches)
        tokens = tokens + self.position_embedding(positions)
        tokens = self.embedding_dropout(tokens, training=training)

        # Encode the image patches using the transformer encoder.
        for block in self.transformer_encoder:
            tokens = block(tokens, training=training)

        # Classify the image from the average of its encoded patches.
        pooled = self.pool(self.final_norm(tokens))
        return self.classification(pooled)

# Load the model weights from a checkpoint.
model = VisionTransformer(num_classes=100)
model.load_weights("checkpoints/model.ckpt")

# Create a video capture object for the default camera.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open the default camera (index 0).")

# The try/finally guarantees the camera and windows are released even if
# classification or display raises.
try:
    # Capture frames from the camera and classify them until `q` is pressed.
    while True:

        # `ret` is False when no frame could be read (camera unplugged,
        # stream ended); `frame` is not usable in that case.
        ret, frame = cap.read()
        if not ret:
            break

        # The model works on batches, so add a leading batch axis to the
        # single frame.
        # NOTE(review): OpenCV delivers frames in BGR channel order; confirm
        # this matches the channel order the checkpoint was trained with.
        batch = np.expand_dims(frame, axis=0)

        # Classify the frame (verbose=0 avoids a progress bar per frame).
        predictions = model.predict(batch, verbose=0)

        # Show the index of the most probable class instead of dumping the
        # whole probability vector onto the image.
        label = np.argmax(predictions[0], axis=-1)

        # Display the classification results.
        cv2.putText(
            frame,
            "Prediction: {}".format(label),
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),
            2,
        )

        # Display the frame.
        cv2.imshow("Frame", frame)

        # Wait briefly for a key press; `q` ends the loop.
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            break
finally:
    # Release the video capture object.
    cap.release()

    # Close all open windows.
    cv2.destroyAllWindows()

AttributeError: ignored