Spaces:

kamcio1989
/

anycoder-5932b618

Sleeping

App Files Community

kamcio1989 committed on Nov 24, 2025

Commit

c289710

verified ·

1 Parent(s): e5ad597

Upload 8 files

Browse files

Files changed (8) hide show

.gitattributes +35 -35
MobileNetSSD_deploy.caffemodel +0 -0
MobileNetSSD_deploy.prototxt +1912 -0
README.md +14 -14
app.py +234 -246
models.py +3 -3
requirements.txt +7 -7
utils.py +117 -6

.gitattributes CHANGED Viewed

@@ -1,35 +1,35 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

MobileNetSSD_deploy.caffemodel ADDED Viewed

The diff for this file is too large to render. See raw diff

MobileNetSSD_deploy.prototxt ADDED Viewed

	@@ -0,0 +1,1912 @@

+name: "MobileNet-SSD"
+input: "data"
+input_shape {
+  dim: 1
+  dim: 3
+  dim: 300
+  dim: 300
+}
+layer {
+  name: "conv0"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv0"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 32
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv0/relu"
+  type: "ReLU"
+  bottom: "conv0"
+  top: "conv0"
+}
+layer {
+  name: "conv1/dw"
+  type: "Convolution"
+  bottom: "conv0"
+  top: "conv1/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 32
+    pad: 1
+    kernel_size: 3
+    group: 32
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv1/dw/relu"
+  type: "ReLU"
+  bottom: "conv1/dw"
+  top: "conv1/dw"
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "conv1/dw"
+  top: "conv1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv1/relu"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "conv2/dw"
+  type: "Convolution"
+  bottom: "conv1"
+  top: "conv2/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    group: 64
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv2/dw/relu"
+  type: "ReLU"
+  bottom: "conv2/dw"
+  top: "conv2/dw"
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "conv2/dw"
+  top: "conv2"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv2/relu"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "conv3/dw"
+  type: "Convolution"
+  bottom: "conv2"
+  top: "conv3/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+    group: 128
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv3/dw/relu"
+  type: "ReLU"
+  bottom: "conv3/dw"
+  top: "conv3/dw"
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "conv3/dw"
+  top: "conv3"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv3/relu"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4/dw"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    group: 128
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv4/dw/relu"
+  type: "ReLU"
+  bottom: "conv4/dw"
+  top: "conv4/dw"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv4/dw"
+  top: "conv4"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv4/relu"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5/dw"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 256
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv5/dw/relu"
+  type: "ReLU"
+  bottom: "conv5/dw"
+  top: "conv5/dw"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv5/dw"
+  top: "conv5"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv5/relu"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "conv6/dw"
+  type: "Convolution"
+  bottom: "conv5"
+  top: "conv6/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    group: 256
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv6/dw/relu"
+  type: "ReLU"
+  bottom: "conv6/dw"
+  top: "conv6/dw"
+}
+layer {
+  name: "conv6"
+  type: "Convolution"
+  bottom: "conv6/dw"
+  top: "conv6"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv6/relu"
+  type: "ReLU"
+  bottom: "conv6"
+  top: "conv6"
+}
+layer {
+  name: "conv7/dw"
+  type: "Convolution"
+  bottom: "conv6"
+  top: "conv7/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    group: 512
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv7/dw/relu"
+  type: "ReLU"
+  bottom: "conv7/dw"
+  top: "conv7/dw"
+}
+layer {
+  name: "conv7"
+  type: "Convolution"
+  bottom: "conv7/dw"
+  top: "conv7"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv7/relu"
+  type: "ReLU"
+  bottom: "conv7"
+  top: "conv7"
+}
+layer {
+  name: "conv8/dw"
+  type: "Convolution"
+  bottom: "conv7"
+  top: "conv8/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    group: 512
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv8/dw/relu"
+  type: "ReLU"
+  bottom: "conv8/dw"
+  top: "conv8/dw"
+}
+layer {
+  name: "conv8"
+  type: "Convolution"
+  bottom: "conv8/dw"
+  top: "conv8"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv8/relu"
+  type: "ReLU"
+  bottom: "conv8"
+  top: "conv8"
+}
+layer {
+  name: "conv9/dw"
+  type: "Convolution"
+  bottom: "conv8"
+  top: "conv9/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    group: 512
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv9/dw/relu"
+  type: "ReLU"
+  bottom: "conv9/dw"
+  top: "conv9/dw"
+}
+layer {
+  name: "conv9"
+  type: "Convolution"
+  bottom: "conv9/dw"
+  top: "conv9"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv9/relu"
+  type: "ReLU"
+  bottom: "conv9"
+  top: "conv9"
+}
+layer {
+  name: "conv10/dw"
+  type: "Convolution"
+  bottom: "conv9"
+  top: "conv10/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    group: 512
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv10/dw/relu"
+  type: "ReLU"
+  bottom: "conv10/dw"
+  top: "conv10/dw"
+}
+layer {
+  name: "conv10"
+  type: "Convolution"
+  bottom: "conv10/dw"
+  top: "conv10"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv10/relu"
+  type: "ReLU"
+  bottom: "conv10"
+  top: "conv10"
+}
+layer {
+  name: "conv11/dw"
+  type: "Convolution"
+  bottom: "conv10"
+  top: "conv11/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    group: 512
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv11/dw/relu"
+  type: "ReLU"
+  bottom: "conv11/dw"
+  top: "conv11/dw"
+}
+layer {
+  name: "conv11"
+  type: "Convolution"
+  bottom: "conv11/dw"
+  top: "conv11"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv11/relu"
+  type: "ReLU"
+  bottom: "conv11"
+  top: "conv11"
+}
+layer {
+  name: "conv12/dw"
+  type: "Convolution"
+  bottom: "conv11"
+  top: "conv12/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    group: 512
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv12/dw/relu"
+  type: "ReLU"
+  bottom: "conv12/dw"
+  top: "conv12/dw"
+}
+layer {
+  name: "conv12"
+  type: "Convolution"
+  bottom: "conv12/dw"
+  top: "conv12"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 1024
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv12/relu"
+  type: "ReLU"
+  bottom: "conv12"
+  top: "conv12"
+}
+layer {
+  name: "conv13/dw"
+  type: "Convolution"
+  bottom: "conv12"
+  top: "conv13/dw"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 1024
+    pad: 1
+    kernel_size: 3
+    group: 1024
+    engine: CAFFE
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv13/dw/relu"
+  type: "ReLU"
+  bottom: "conv13/dw"
+  top: "conv13/dw"
+}
+layer {
+  name: "conv13"
+  type: "Convolution"
+  bottom: "conv13/dw"
+  top: "conv13"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 1024
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv13/relu"
+  type: "ReLU"
+  bottom: "conv13"
+  top: "conv13"
+}
+layer {
+  name: "conv14_1"
+  type: "Convolution"
+  bottom: "conv13"
+  top: "conv14_1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv14_1/relu"
+  type: "ReLU"
+  bottom: "conv14_1"
+  top: "conv14_1"
+}
+layer {
+  name: "conv14_2"
+  type: "Convolution"
+  bottom: "conv14_1"
+  top: "conv14_2"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 512
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv14_2/relu"
+  type: "ReLU"
+  bottom: "conv14_2"
+  top: "conv14_2"
+}
+layer {
+  name: "conv15_1"
+  type: "Convolution"
+  bottom: "conv14_2"
+  top: "conv15_1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv15_1/relu"
+  type: "ReLU"
+  bottom: "conv15_1"
+  top: "conv15_1"
+}
+layer {
+  name: "conv15_2"
+  type: "Convolution"
+  bottom: "conv15_1"
+  top: "conv15_2"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv15_2/relu"
+  type: "ReLU"
+  bottom: "conv15_2"
+  top: "conv15_2"
+}
+layer {
+  name: "conv16_1"
+  type: "Convolution"
+  bottom: "conv15_2"
+  top: "conv16_1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv16_1/relu"
+  type: "ReLU"
+  bottom: "conv16_1"
+  top: "conv16_1"
+}
+layer {
+  name: "conv16_2"
+  type: "Convolution"
+  bottom: "conv16_1"
+  top: "conv16_2"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv16_2/relu"
+  type: "ReLU"
+  bottom: "conv16_2"
+  top: "conv16_2"
+}
+layer {
+  name: "conv17_1"
+  type: "Convolution"
+  bottom: "conv16_2"
+  top: "conv17_1"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv17_1/relu"
+  type: "ReLU"
+  bottom: "conv17_1"
+  top: "conv17_1"
+}
+layer {
+  name: "conv17_2"
+  type: "Convolution"
+  bottom: "conv17_1"
+  top: "conv17_2"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv17_2/relu"
+  type: "ReLU"
+  bottom: "conv17_2"
+  top: "conv17_2"
+}
+layer {
+  name: "conv11_mbox_loc"
+  type: "Convolution"
+  bottom: "conv11"
+  top: "conv11_mbox_loc"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 12
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv11_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv11_mbox_loc"
+  top: "conv11_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv11_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv11_mbox_loc_perm"
+  top: "conv11_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv11_mbox_conf"
+  type: "Convolution"
+  bottom: "conv11"
+  top: "conv11_mbox_conf"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 63
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv11_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv11_mbox_conf"
+  top: "conv11_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv11_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv11_mbox_conf_perm"
+  top: "conv11_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv11_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv11"
+  bottom: "data"
+  top: "conv11_mbox_priorbox"
+  prior_box_param {
+    min_size: 60.0
+    aspect_ratio: 2.0
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv13_mbox_loc"
+  type: "Convolution"
+  bottom: "conv13"
+  top: "conv13_mbox_loc"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 24
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv13_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv13_mbox_loc"
+  top: "conv13_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv13_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv13_mbox_loc_perm"
+  top: "conv13_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv13_mbox_conf"
+  type: "Convolution"
+  bottom: "conv13"
+  top: "conv13_mbox_conf"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 126
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv13_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv13_mbox_conf"
+  top: "conv13_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv13_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv13_mbox_conf_perm"
+  top: "conv13_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv13_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv13"
+  bottom: "data"
+  top: "conv13_mbox_priorbox"
+  prior_box_param {
+    min_size: 105.0
+    max_size: 150.0
+    aspect_ratio: 2.0
+    aspect_ratio: 3.0
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv14_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv14_2"
+  top: "conv14_2_mbox_loc"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 24
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv14_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv14_2_mbox_loc"
+  top: "conv14_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv14_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv14_2_mbox_loc_perm"
+  top: "conv14_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv14_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv14_2"
+  top: "conv14_2_mbox_conf"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 126
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv14_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv14_2_mbox_conf"
+  top: "conv14_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv14_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv14_2_mbox_conf_perm"
+  top: "conv14_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv14_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv14_2"
+  bottom: "data"
+  top: "conv14_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 150.0
+    max_size: 195.0
+    aspect_ratio: 2.0
+    aspect_ratio: 3.0
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv15_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv15_2"
+  top: "conv15_2_mbox_loc"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 24
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv15_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv15_2_mbox_loc"
+  top: "conv15_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv15_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv15_2_mbox_loc_perm"
+  top: "conv15_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv15_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv15_2"
+  top: "conv15_2_mbox_conf"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 126
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv15_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv15_2_mbox_conf"
+  top: "conv15_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv15_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv15_2_mbox_conf_perm"
+  top: "conv15_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv15_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv15_2"
+  bottom: "data"
+  top: "conv15_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 195.0
+    max_size: 240.0
+    aspect_ratio: 2.0
+    aspect_ratio: 3.0
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv16_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv16_2"
+  top: "conv16_2_mbox_loc"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 24
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv16_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv16_2_mbox_loc"
+  top: "conv16_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv16_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv16_2_mbox_loc_perm"
+  top: "conv16_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv16_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv16_2"
+  top: "conv16_2_mbox_conf"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 126
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv16_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv16_2_mbox_conf"
+  top: "conv16_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv16_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv16_2_mbox_conf_perm"
+  top: "conv16_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv16_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv16_2"
+  bottom: "data"
+  top: "conv16_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 240.0
+    max_size: 285.0
+    aspect_ratio: 2.0
+    aspect_ratio: 3.0
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    offset: 0.5
+  }
+}
+layer {
+  name: "conv17_2_mbox_loc"
+  type: "Convolution"
+  bottom: "conv17_2"
+  top: "conv17_2_mbox_loc"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 24
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv17_2_mbox_loc_perm"
+  type: "Permute"
+  bottom: "conv17_2_mbox_loc"
+  top: "conv17_2_mbox_loc_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv17_2_mbox_loc_flat"
+  type: "Flatten"
+  bottom: "conv17_2_mbox_loc_perm"
+  top: "conv17_2_mbox_loc_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv17_2_mbox_conf"
+  type: "Convolution"
+  bottom: "conv17_2"
+  top: "conv17_2_mbox_conf"
+  param {
+    lr_mult: 1.0
+    decay_mult: 1.0
+  }
+  param {
+    lr_mult: 2.0
+    decay_mult: 0.0
+  }
+  convolution_param {
+    num_output: 126
+    kernel_size: 1
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.0
+    }
+  }
+}
+layer {
+  name: "conv17_2_mbox_conf_perm"
+  type: "Permute"
+  bottom: "conv17_2_mbox_conf"
+  top: "conv17_2_mbox_conf_perm"
+  permute_param {
+    order: 0
+    order: 2
+    order: 3
+    order: 1
+  }
+}
+layer {
+  name: "conv17_2_mbox_conf_flat"
+  type: "Flatten"
+  bottom: "conv17_2_mbox_conf_perm"
+  top: "conv17_2_mbox_conf_flat"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "conv17_2_mbox_priorbox"
+  type: "PriorBox"
+  bottom: "conv17_2"
+  bottom: "data"
+  top: "conv17_2_mbox_priorbox"
+  prior_box_param {
+    min_size: 285.0
+    max_size: 300.0
+    aspect_ratio: 2.0
+    aspect_ratio: 3.0
+    flip: true
+    clip: false
+    variance: 0.1
+    variance: 0.1
+    variance: 0.2
+    variance: 0.2
+    offset: 0.5
+  }
+}
+layer {
+  name: "mbox_loc"
+  type: "Concat"
+  bottom: "conv11_mbox_loc_flat"
+  bottom: "conv13_mbox_loc_flat"
+  bottom: "conv14_2_mbox_loc_flat"
+  bottom: "conv15_2_mbox_loc_flat"
+  bottom: "conv16_2_mbox_loc_flat"
+  bottom: "conv17_2_mbox_loc_flat"
+  top: "mbox_loc"
+  concat_param {
+    axis: 1
+  }
+}
+layer {
+  name: "mbox_conf"
+  type: "Concat"
+  bottom: "conv11_mbox_conf_flat"
+  bottom: "conv13_mbox_conf_flat"
+  bottom: "conv14_2_mbox_conf_flat"
+  bottom: "conv15_2_mbox_conf_flat"
+  bottom: "conv16_2_mbox_conf_flat"
+  bottom: "conv17_2_mbox_conf_flat"
+  top: "mbox_conf"
+  concat_param {
+    axis: 1
+  }
+}
+layer {
+  name: "mbox_priorbox"
+  type: "Concat"
+  bottom: "conv11_mbox_priorbox"
+  bottom: "conv13_mbox_priorbox"
+  bottom: "conv14_2_mbox_priorbox"
+  bottom: "conv15_2_mbox_priorbox"
+  bottom: "conv16_2_mbox_priorbox"
+  bottom: "conv17_2_mbox_priorbox"
+  top: "mbox_priorbox"
+  concat_param {
+    axis: 2
+  }
+}
+layer {
+  name: "mbox_conf_reshape"
+  type: "Reshape"
+  bottom: "mbox_conf"
+  top: "mbox_conf_reshape"
+  reshape_param {
+    shape {
+      dim: 0
+      dim: -1
+      dim: 21
+    }
+  }
+}
+layer {
+  name: "mbox_conf_softmax"
+  type: "Softmax"
+  bottom: "mbox_conf_reshape"
+  top: "mbox_conf_softmax"
+  softmax_param {
+    axis: 2
+  }
+}
+layer {
+  name: "mbox_conf_flatten"
+  type: "Flatten"
+  bottom: "mbox_conf_softmax"
+  top: "mbox_conf_flatten"
+  flatten_param {
+    axis: 1
+  }
+}
+layer {
+  name: "detection_out"
+  type: "DetectionOutput"
+  bottom: "mbox_loc"
+  bottom: "mbox_conf_flatten"
+  bottom: "mbox_priorbox"
+  top: "detection_out"
+  include {
+    phase: TEST
+  }
+  detection_output_param {
+    num_classes: 21
+    share_location: true
+    background_label_id: 0
+    nms_param {
+      nms_threshold: 0.45
+      top_k: 100
+    }
+    code_type: CENTER_SIZE
+    keep_top_k: 100
+    confidence_threshold: 0.25
+  }
+}

README.md CHANGED Viewed

@@ -1,14 +1,14 @@
----
-title: Anycoder 5932b618
-emoji: 💻
-colorFrom: green
-colorTo: blue
-sdk: gradio
-sdk_version: 6.0.0
-app_file: app.py
-pinned: false
-tags:
-- anycoder
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Anycoder 5932b618
+emoji: 💻
+colorFrom: green
+colorTo: blue
+sdk: gradio
+sdk_version: 6.0.0
+app_file: app.py
+pinned: false
+tags:
+- anycoder
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -4,237 +4,58 @@ from PIL import Image, ImageDraw
 import json
 from typing import Tuple, List, Dict, Any
 import time
-# Try to import cv2, but make it optional
-try:
-    import cv2
-    CV2_AVAILABLE = True
-except ImportError:
-    CV2_AVAILABLE = False
-    print("Warning: OpenCV (cv2) not available. Using fallback image processing.")
-def load_detection_models():
-    """Load detection models or return mock models if cv2 is not available."""
-    if CV2_AVAILABLE:
-        try:
-            # Load face cascade
-            face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
-            # Load object detection model (MobileNet SSD)
-            model_path = "MobileNetSSD_deploy.prototxt"
-            weights_path = "MobileNetSSD_deploy.caffemodel"
-            # Try to load the model, fall back to mock if not available
-            try:
-                object_net = cv2.dnn.readNetFromCaffe(model_path, weights_path)
-                object_classes = [
-                    "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
-                    "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
-                    "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
-                    "train", "tvmonitor"
-                ]
-            except:
-                object_net = None
-                object_classes = [
-                    "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
-                    "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
-                    "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
-                    "train", "tvmonitor"
-                ]
-            return face_cascade, object_net, object_classes
-        except Exception as e:
-            print(f"Error loading models: {e}")
-            return None, None, []
-    else:
-        # Return mock models for PIL-based processing
-        return None, None, []
-def detect_faces_pil(image: np.ndarray, confidence: float) -> List[Dict[str, Any]]:
-    """Simple face detection simulation using PIL (fallback when cv2 not available)."""
-    try:
-        pil_image = Image.fromarray(image)
-        width, height = pil_image.size
-        # Simulate face detection with random bounding boxes
-        # In a real scenario, you'd use a face detection library that works with PIL
-        faces = []
-        # For demonstration, detect faces based on skin color approximation
-        img_array = np.array(pil_image)
-        # Simple skin color detection (very basic approximation)
-        lower_skin = np.array([0, 48, 80], dtype=np.uint8)
-        upper_skin = np.array([20, 255, 255], dtype=np.uint8)
-        # Convert to HSV for better color detection
-        try:
-            import colorsys
-            # Simple heuristic: detect regions that might be faces
-            # This is a placeholder - real face detection would require a proper model
-            for i in range(0, min(3, np.random.randint(0, 3) + 1)):  # Random 0-3 faces
-                x = np.random.randint(0, max(1, width - 100))
-                y = np.random.randint(0, max(1, height - 100))
-                w = np.random.randint(50, min(150, width - x))
-                h = np.random.randint(50, min(150, height - y))
-                faces.append({
-                    "bbox": [x, y, w, h],
-                    "confidence": round(np.random.uniform(0.5, 0.95), 3),
-                    "label": "face"
-                })
-        except:
-            pass
-        return faces
-    except Exception as e:
-        print(f"Error in face detection: {e}")
-        return []
-def detect_objects_pil(image: np.ndarray, confidence: float) -> List[Dict[str, Any]]:
-    """Simple object detection simulation using PIL (fallback when cv2 not available)."""
-    try:
-        pil_image = Image.fromarray(image)
-        width, height = pil_image.size
-        # Simulate object detection
-        objects = []
-        # For demonstration, detect random objects
-        object_classes = ["person", "car", "dog", "cat", "bottle", "chair", "laptop", "phone"]
-        for i in range(0, min(5, np.random.randint(0, 5) + 1)):  # Random 0-5 objects
-            x = np.random.randint(0, max(1, width - 100))
-            y = np.random.randint(0, max(1, height - 100))
-            w = np.random.randint(50, min(150, width - x))
-            h = np.random.randint(50, min(150, height - y))
-            obj_class = np.random.choice(object_classes)
-            objects.append({
-                "bbox": [x, y, w, h],
-                "confidence": round(np.random.uniform(0.4, 0.9), 3),
-                "label": obj_class
-            })
-        return objects
-    except Exception as e:
-        print(f"Error in object detection: {e}")
-        return []
-def detect_faces_cv2(image: np.ndarray, face_cascade, confidence: float) -> List[Dict[str, Any]]:
-    """Face detection using OpenCV Haar Cascade."""
-    try:
-        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
-        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
-        face_results = []
-        for (x, y, w, h) in faces:
-            face_results.append({
-                "bbox": [int(x), int(y), int(w), int(h)],
-                "confidence": round(np.random.uniform(0.7, 0.95), 3),  # Haar cascade doesn't provide confidence
-                "label": "face"
-            })
-        return face_results
-    except Exception as e:
-        print(f"Error in face detection: {e}")
-        return []
-def detect_objects_cv2(image: np.ndarray, net, classes, confidence: float) -> List[Dict[str, Any]]:
-    """Object detection using OpenCV DNN."""
-    try:
-        if net is None:
-            return []
-        h, w = image.shape[:2]
-        # Create blob from image
-        blob = cv2.dnn.blobFromImage(image, 0.007843, (300, 300), 127.5)
-        net.setInput(blob)
-        detections = net.forward()
-        objects = []
-        for i in range(detections.shape[2]):
-            confidence_score = detections[0, 0, i, 2]
-            if confidence_score > confidence:
-                idx = int(detections[0, 0, i, 1])
-                if idx < len(classes):
-                    x1 = int(detections[0, 0, i, 3] * w)
-                    y1 = int(detections[0, 0, i, 4] * h)
-                    x2 = int(detections[0, 0, i, 5] * w)
-                    y2 = int(detections[0, 0, i, 6] * h)
-                    objects.append({
-                        "bbox": [x1, y1, x2 - x1, y2 - y1],
-                        "confidence": round(float(confidence_score), 3),
-                        "label": classes[idx]
-                    })
-        return objects
-    except Exception as e:
-        print(f"Error in object detection: {e}")
-        return []
-def process_image(image, face_cascade, object_net, object_classes, enable_face, enable_objects, face_conf, object_conf):
-    """Process image and detect faces and objects."""
-    face_results = []
-    object_results = []
-    if enable_face:
-        if CV2_AVAILABLE and face_cascade is not None:
-            face_results = detect_faces_cv2(image, face_cascade, face_conf)
-        else:
-            face_results = detect_faces_pil(image, face_conf)
-    if enable_objects:
-        if CV2_AVAILABLE and object_net is not None:
-            object_results = detect_objects_cv2(image, object_net, object_classes, object_conf)
         else:
-            object_results = detect_objects_pil(image, object_conf)
-    return image.copy(), face_results, object_results
-def draw_detections(image, face_results, object_results, show_labels, box_color):
-    """Draw detection boxes on image."""
-    try:
-        pil_image = Image.fromarray(image)
-        draw = ImageDraw.Draw(pil_image)
-        # Convert color name to RGB
-        color_map = {
-            "red": (255, 0, 0),
-            "green": (0, 255, 0),
-            "blue": (0, 0, 255),
-            "yellow": (255, 255, 0),
-            "purple": (128, 0, 128),
-            "orange": (255, 165, 0)
-        }
-        color = color_map.get(box_color, (255, 0, 0))
-        # Draw face boxes
-        for face in face_results:
-            x, y, w, h = face["bbox"]
-            draw.rectangle([x, y, x + w, y + h], outline=color, width=3)
-            if show_labels:
-                label = f"Face {face.get('confidence', '')}"
-                draw.text((x, y - 20), label, fill=color)
-        # Draw object boxes
-        for obj in object_results:
-            x, y, w, h = obj["bbox"]
-            draw.rectangle([x, y, x + w, y + h], outline=color, width=3)
-            if show_labels:
-                label = f"{obj['label']} {obj.get('confidence', '')}"
-                draw.text((x, y - 20), label, fill=color)
-        return np.array(pil_image)
-    except Exception as e:
-        print(f"Error drawing detections: {e}")
-        return image
-# Load models at startup
-face_cascade, object_net, object_classes = load_detection_models()
 def recognize_face_and_objects(
     image: np.ndarray,
@@ -244,18 +65,34 @@ def recognize_face_and_objects(
     object_confidence: float,
     draw_boxes: bool,
     show_labels: bool,
-    box_color: str
-) -> Tuple[np.ndarray, str, str]:
     """
-    Perform face and object detection on the input image.
     """
     if image is None:
-        return None, "No image provided", "No image provided"
     # Convert PIL to numpy if needed
     if isinstance(image, Image.Image):
         image = np.array(image)
     # Process image
     processed_image, face_results, object_results = process_image(
         image,
@@ -268,6 +105,9 @@ def recognize_face_and_objects(
         object_confidence
     )
     # Draw detections if requested
     if draw_boxes:
         processed_image = draw_detections(
@@ -279,10 +119,10 @@ def recognize_face_and_objects(
         )
     # Convert results to JSON
-    face_json = json.dumps(face_results, indent=2) if face_results else "No faces detected"
-    object_json = json.dumps(object_results, indent=2) if object_results else "No objects detected"
-    return processed_image, face_json, object_json
 def webcam_recognition(
     image: np.ndarray,
@@ -292,13 +132,28 @@ def webcam_recognition(
     object_confidence: float,
     draw_boxes: bool,
     show_labels: bool,
-    box_color: str
 ) -> np.ndarray:
-    """Real-time webcam recognition."""
     if image is None:
         return None
-    processed_image, _, _ = recognize_face_and_objects(
         image,
         enable_face_detection,
         enable_object_detection,
@@ -306,7 +161,13 @@ def webcam_recognition(
         object_confidence,
         draw_boxes,
         show_labels,
-        box_color
     )
     return processed_image
@@ -349,6 +210,28 @@ def get_detection_statistics() -> str:
         }
     return json.dumps(stats, indent=2)
 # Create custom CSS for better styling
 custom_css = """
 .main-container {
@@ -377,17 +260,31 @@ custom_css = """
     padding: 15px;
     margin-bottom: 20px;
 }
 """
-with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as demo:
     gr.Markdown("""
-    # 🔍 Face & Object Recognition Platform
     Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
-    Advanced computer vision platform for real-time face and object detection with customizable settings.
     """)
-    # Show warning if OpenCV is not available
     if not CV2_AVAILABLE:
         with gr.Row():
             gr.Markdown("""
@@ -397,6 +294,17 @@ with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as de
             </div>
             """)
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("### 📤 Input Source")
@@ -417,7 +325,7 @@ with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as de
                         streaming=True,
                         height=400
                     )
-                    gr.Markdown("*Webcam provides real-time detection (may have slight delay)*")
         with gr.Column(scale=1):
             gr.Markdown("### ⚙️ Detection Settings")
@@ -463,6 +371,14 @@ with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as de
                 height=400,
                 elem_classes=["image-container"]
             )
         with gr.Column():
             with gr.Tabs():
@@ -478,6 +394,44 @@ with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as de
                         elem_classes=["result-panel"]
                     )
                 with gr.TabItem("ℹ️ Model Info"):
                     model_info = gr.JSON(
                         label="Detection Models Information",
@@ -486,6 +440,9 @@ with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as de
                     )
     # Event handlers
     analyze_btn.click(
         fn=recognize_face_and_objects,
         inputs=[
@@ -496,9 +453,16 @@ with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as de
             object_conf,
             draw_boxes,
             show_labels,
-            box_color
         ],
-        outputs=[output_image, face_results, object_results]
     )
     # Real-time webcam processing
@@ -512,29 +476,53 @@ with gr.Blocks(css=custom_css, title="Face & Object Recognition Platform") as de
             object_conf,
             draw_boxes,
             show_labels,
-            box_color
         ],
         outputs=[output_image],
         time_limit=30,
         stream_every=0.5
     )
     gr.Markdown("""
     ---
     ### 📚 Usage Instructions
     1. **Upload Image**: Select an image from your device for analysis
-    2. **Webcam**: Use your webcam for real-time detection
     3. **Adjust Settings**: Customize confidence thresholds and display options
-    4. **View Results**: See detections overlayed on the image with detailed JSON data
     ### 🎯 Features
     - **Face Detection**: Identifies faces in images using Haar Cascade classifiers (or simulation mode)
     - **Object Detection**: Recognizes object classes using MobileNet-SSD (or simulation mode)
-    - **Real-time Processing**: Webcam support with live detection
     - **Customizable**: Adjustable confidence thresholds and visual settings
     - **Detailed Output**: JSON formatted results with coordinates and confidence scores
     ### ⚙️ Installation Notes
-    Install OpenCV for full functionality: `pip install opencv-python`
     """)
 if __name__ == "__main__":

 import json
 from typing import Tuple, List, Dict, Any
 import time
+import threading
+import queue
+from models import load_detection_models, CV2_AVAILABLE # CV2_AVAILABLE needs to come from models.py
+from utils import draw_detections, process_image, generate_tone, play_sound, AlarmSystem, AUDIO_AVAILABLE
+# Global alarm system
+alarm_system = AlarmSystem()
+# Load models at startup
+face_cascade, object_net, object_classes = load_detection_models()
+def check_and_trigger_alarm(face_results, object_results, alarm_settings):
+    """Check detection results and trigger alarm if conditions are met."""
+    if not alarm_settings.get("alarm_enabled", False):
+        return False, "Alarm disabled"
+    alarm_triggered = False
+    alarm_reason = ""
+    # Check face detection alarm
+    if alarm_settings.get("face_alarm", False) and face_results:
+        alarm_triggered = True
+        alarm_reason = f"Face detected ({len(face_results)} faces)"
+    # Check object detection alarm
+    elif alarm_settings.get("object_alarm", False) and object_results:
+        # Check for specific object types if specified
+        target_objects = alarm_settings.get("target_objects", [])
+        if target_objects:
+            detected_objects = [obj["label"] for obj in object_results if obj["label"] in target_objects]
+            if detected_objects:
+                alarm_triggered = True
+                alarm_reason = f"Target object detected: {', '.join(set(detected_objects))}"
         else:
+            alarm_triggered = True
+            alarm_reason = f"Object detected ({len(object_results)} objects)"
+    # Trigger alarm if conditions are met
+    if alarm_triggered:
+        sound_type = alarm_settings.get("alarm_sound", "Beep")
+        if sound_type == "Custom":
+            sound_to_play = alarm_settings.get("custom_alarm_sound")
+        else:
+            sound_to_play = sound_type
+        if alarm_system.trigger_alarm(sound_to_play):
+            return True, f"🚨 ALARM TRIGGERED: {alarm_reason}"
+        else:
+            return False, "Alarm cooldown active"
+    return False, "No alarm conditions met"
 def recognize_face_and_objects(
     image: np.ndarray,
     object_confidence: float,
     draw_boxes: bool,
     show_labels: bool,
+    box_color: str,
+    alarm_enabled_val: bool, # New parameter for alarm_enabled
+    face_alarm_val: bool,    # New parameter for face_alarm
+    object_alarm_val: bool,  # New parameter for object_alarm
+    alarm_sound_val: str,    # New parameter for alarm_sound
+    target_objects_val: List[str], # New parameter for target_objects
+    custom_alarm_sound_val: str
+) -> Tuple[np.ndarray, str, str, str]:
     """
+    Perform face and object detection on the input image with alarm support.
     """
     if image is None:
+        return None, "[]", "[]", "No image provided" # Changed this line to return empty JSON arrays for face and object results
     # Convert PIL to numpy if needed
     if isinstance(image, Image.Image):
         image = np.array(image)
+    # Construct alarm_settings dictionary from the passed values
+    alarm_settings = {
+        "alarm_enabled": alarm_enabled_val,
+        "face_alarm": face_alarm_val,
+        "object_alarm": object_alarm_val,
+        "alarm_sound": alarm_sound_val,
+        "target_objects": target_objects_val,
+        "custom_alarm_sound": custom_alarm_sound_val
+    }
     # Process image
     processed_image, face_results, object_results = process_image(
         image,
         object_confidence
     )
+    # Check alarm conditions
+    alarm_status, alarm_message = check_and_trigger_alarm(face_results, object_results, alarm_settings)
     # Draw detections if requested
     if draw_boxes:
         processed_image = draw_detections(
         )
     # Convert results to JSON
+    face_json = json.dumps(face_results, indent=2) if face_results else "[]"
+    object_json = json.dumps(object_results, indent=2) if object_results else "[]"
+    return processed_image, face_json, object_json, alarm_message
 def webcam_recognition(
     image: np.ndarray,
     object_confidence: float,
     draw_boxes: bool,
     show_labels: bool,
+    box_color: str,
+    alarm_enabled_val: bool, # New parameter for alarm_enabled
+    face_alarm_val: bool,    # New parameter for face_alarm
+    object_alarm_val: bool,  # New parameter for object_alarm
+    alarm_sound_val: str,    # New parameter for alarm_sound
+    target_objects_val: List[str], # New parameter for target_objects
+    custom_alarm_sound_val: str
 ) -> np.ndarray:
+    """Real-time webcam recognition with alarm."""
     if image is None:
         return None
+    # Construct alarm_settings dictionary from the passed values
+    alarm_settings = {
+        "alarm_enabled": alarm_enabled_val,
+        "face_alarm": face_alarm_val,
+        "object_alarm": object_alarm_val,
+        "alarm_sound": alarm_sound_val,
+        "target_objects": target_objects_val
+    }
+    processed_image, _, _, _ = recognize_face_and_objects(
         image,
         enable_face_detection,
         enable_object_detection,
         object_confidence,
         draw_boxes,
         show_labels,
+        box_color,
+        alarm_enabled_val, # Pass these directly
+        face_alarm_val,
+        object_alarm_val,
+        alarm_sound_val,
+        target_objects_val,
+        custom_alarm_sound_val
     )
     return processed_image
         }
     return json.dumps(stats, indent=2)
+def test_alarm_sound(sound_type, custom_sound_file):
+    """Test alarm sound."""
+    if not AUDIO_AVAILABLE:
+        return "⚠️ Audio not available. Install pyaudio for sound support."
+    try:
+        if sound_type == "Custom":
+            sound_to_play = custom_sound_file
+            if sound_to_play is None:
+                return "Custom sound selected, but no file uploaded."
+        else:
+            sound_to_play = sound_type
+        play_sound(sound_to_play)
+        # Give a more descriptive message for custom sounds
+        if sound_type == "Custom":
+             return f"✅ Played custom sound"
+        else:
+             return f"✅ Played {sound_type} sound"
+    except Exception as e:
+        return f"❌ Error playing sound: {str(e)}"
 # Create custom CSS for better styling
 custom_css = """
 .main-container {
     padding: 15px;
     margin-bottom: 20px;
 }
+.alarm-box {
+    background-color: #f8d7da;
+    border: 2px solid #f5c6cb;
+    border-radius: 8px;
+    padding: 15px;
+    margin-bottom: 20px;
+    animation: pulse 1s infinite;
+}
+@keyframes pulse {
+    0% { opacity: 1; }
+    50% { opacity: 0.7; }
+    100% { opacity: 1; }
+}
 """
+with gr.Blocks(title="Face & Object Recognition Platform") as demo:
+    gr.HTML(f"<style>{custom_css}</style>")
     gr.Markdown("""
+    # 🔍 Face & Object Recognition Platform with Alarm System
     Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
+    Advanced computer vision platform for real-time face and object detection with customizable settings and alarm notifications.
     """)
+    # Show warnings if dependencies are not available
     if not CV2_AVAILABLE:
         with gr.Row():
             gr.Markdown("""
             </div>
             """)
+    if not AUDIO_AVAILABLE:
+        with gr.Row():
+            gr.Markdown("""
+            <div class="warning-box">
+            ⚠️ **Audio Not Available**: Install audio libraries for alarm sounds: `pip install pyaudio`
+            </div>
+            """)
+    # Alarm state
+    alarm_status = gr.Textbox(label="Alarm Status", visible=False, interactive=False)
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("### 📤 Input Source")
                         streaming=True,
                         height=400
                     )
+                    gr.Markdown("*Webcam provides real-time detection with alarm system*")
         with gr.Column(scale=1):
             gr.Markdown("### ⚙️ Detection Settings")
                 height=400,
                 elem_classes=["image-container"]
             )
+            # Alarm status display
+            alarm_display = gr.Textbox(
+                label="🚨 Alarm Status",
+                value="Ready",
+                interactive=False,
+                elem_classes=["alarm-box" if False else ""]
+            )
         with gr.Column():
             with gr.Tabs():
                         elem_classes=["result-panel"]
                     )
+                with gr.TabItem("🚨 Alarm Settings"):
+                    gr.Markdown("#### Configure Alarm System")
+                    alarm_enabled = gr.Checkbox(label="🔔 Enable Alarm System", value=False)
+                    face_alarm = gr.Checkbox(label="👤 Alarm on Face Detection", value=True)
+                    object_alarm = gr.Checkbox(label="📦 Alarm on Object Detection", value=True)
+                    alarm_sound = gr.Dropdown(
+                        label="🔊 Alarm Sound",
+                        choices=["Beep", "Siren", "Chime", "Alert", "Buzzer", "Ring", "Custom"],
+                        value="Beep",
+                        info="Select alarm sound type"
+                    )
+                    custom_alarm_sound = gr.File(
+                        label="Upload Custom Alarm Sound (.wav)",
+                        file_types=[".wav"],
+                        visible=False
+                    )
+                    def toggle_custom_sound(sound_choice):
+                        """Return an update that is visible only when "Custom" is selected."""
+                        return gr.update(visible=sound_choice == "Custom")
+                    alarm_sound.change(
+                        fn=toggle_custom_sound,
+                        inputs=alarm_sound,
+                        outputs=custom_alarm_sound
+                    )
+                    target_objects = gr.CheckboxGroup(
+                        label="🎯 Specific Objects to Trigger Alarm (optional)",
+                        choices=["person", "car", "dog", "cat", "bottle", "chair", "laptop", "phone"],
+                        info="Leave empty to alarm on any object"
+                    )
+                    test_sound_btn = gr.Button("🔊 Test Sound", variant="secondary")
+                    sound_test_result = gr.Textbox(label="Sound Test Result", interactive=False)
                 with gr.TabItem("ℹ️ Model Info"):
                     model_info = gr.JSON(
                         label="Detection Models Information",
                     )
     # Event handlers
+    # NOTE: The alarm settings are wired in as input components rather than as a
+    # dict built when the UI is created, so Gradio passes each component's current
+    # value to `recognize_face_and_objects` every time the event fires.
     analyze_btn.click(
         fn=recognize_face_and_objects,
         inputs=[
             object_conf,
             draw_boxes,
             show_labels,
+            box_color,
+            # Pass the Gradio components, not their values
+            alarm_enabled,
+            face_alarm,
+            object_alarm,
+            alarm_sound,
+            target_objects,
+            custom_alarm_sound
         ],
+        outputs=[output_image, face_results, object_results, alarm_display]
     )
     # Real-time webcam processing
             object_conf,
             draw_boxes,
             show_labels,
+            box_color,
+            # Pass the Gradio components, not their values
+            alarm_enabled,
+            face_alarm,
+            object_alarm,
+            alarm_sound,
+            target_objects,
+            custom_alarm_sound
         ],
         outputs=[output_image],
         time_limit=30,
         stream_every=0.5
     )
+    # Test sound button
+    test_sound_btn.click(
+        fn=test_alarm_sound,
+        inputs=[alarm_sound, custom_alarm_sound],
+        outputs=[sound_test_result]
+    )
     gr.Markdown("""
     ---
     ### 📚 Usage Instructions
     1. **Upload Image**: Select an image from your device for analysis
+    2. **Webcam**: Use your webcam for real-time detection with alarms
     3. **Adjust Settings**: Customize confidence thresholds and display options
+    4. **Configure Alarm**: Set up alarm conditions and sounds in the Alarm Settings tab
+    5. **View Results**: See detections overlayed on the image with detailed JSON data
+    ### 🚨 Alarm Features
+    - **Face Detection Alarm**: Triggers when faces are detected
+    - **Object Detection Alarm**: Triggers when objects are detected (all or specific types)
+    - **Multiple Sounds**: Choose from 6 different alarm sounds
+    - **Cooldown Period**: Prevents alarm spam (2-second cooldown)
+    - **Real-time Monitoring**: Works with webcam for continuous monitoring
     ### 🎯 Features
     - **Face Detection**: Identifies faces in images using Haar Cascade classifiers (or simulation mode)
     - **Object Detection**: Recognizes object classes using MobileNet-SSD (or simulation mode)
+    - **Real-time Processing**: Webcam support with live detection and alarms
     - **Customizable**: Adjustable confidence thresholds and visual settings
     - **Detailed Output**: JSON formatted results with coordinates and confidence scores
     ### ⚙️ Installation Notes
+    - Install OpenCV for full functionality: `pip install opencv-python`
+    - Install audio support for alarms: `pip install pyaudio`
     """)
 if __name__ == "__main__":

models.py CHANGED Viewed

@@ -14,7 +14,7 @@ def load_detection_models():
     if CV2_AVAILABLE:
         try:
             # Load face cascade
-            face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
             # Load object detection model (MobileNet SSD)
             model_path = "MobileNetSSD_deploy.prototxt"
@@ -29,7 +29,8 @@ def load_detection_models():
                     "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
                     "train", "tvmonitor"
                 ]
-            except:
                 object_net = None
                 object_classes = [
                     "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
@@ -124,7 +125,6 @@ def detect_faces_pil(image, confidence):
         # Simulate face detection with random bounding boxes
         faces = []
-        # For demonstration, detect faces based on basic heuristics
         for i in range(0, min(3, np.random.randint(0, 3) + 1)):  # Random 0-3 faces
             x = np.random.randint(0, max(1, width - 100))
             y = np.random.randint(0, max(1, height - 100))

     if CV2_AVAILABLE:
         try:
             # Load face cascade
+            face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
             # Load object detection model (MobileNet SSD)
             model_path = "MobileNetSSD_deploy.prototxt"
                     "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
                     "train", "tvmonitor"
                 ]
+            except Exception as e:
+                print(f"Error loading object detection model: {e}")
                 object_net = None
                 object_classes = [
                     "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
         # Simulate face detection with random bounding boxes
         faces = []
         for i in range(0, min(3, np.random.randint(0, 3) + 1)):  # Random 0-3 faces
             x = np.random.randint(0, max(1, width - 100))
             y = np.random.randint(0, max(1, height - 100))

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
-opencv-python
-Pillow
-gradio
-numpy
-requests
-matplotlib
-scipy

+opencv-python
+Pillow
+gradio
+numpy
+requests
+matplotlib
+scipy

utils.py CHANGED Viewed

@@ -1,6 +1,122 @@
 import numpy as np
 from PIL import Image, ImageDraw
 import json
 def draw_detections(image, face_results, object_results, show_labels, box_color):
     """Draw detection boxes on image using PIL."""
@@ -53,9 +169,4 @@ def process_image(image, face_cascade, object_net, object_classes, enable_face,
     if enable_objects:
         object_results = detect_objects(image, object_net, object_classes, object_conf)
-    return image.copy(), face_results, object_results
-def load_detection_models():
-    """Load detection models."""
-    from models import load_detection_models as load_models
-    return load_models()

 import numpy as np
 from PIL import Image, ImageDraw
+import wave
+import os
 import json
+import time
+import threading
+import queue
+# Try to import cv2, but make it optional
+try:
+    import cv2
+    CV2_AVAILABLE = True
+except ImportError:
+    CV2_AVAILABLE = False
+# Try to import sound libraries
+try:
+    import pyaudio
+    import numpy as np
+    AUDIO_AVAILABLE = True
+except ImportError:
+    AUDIO_AVAILABLE = False
+def generate_tone(frequency, duration, sample_rate=44100, volume=0.5):
+    """Generate a sine-wave tone as an array of float32 samples.
+
+    Args:
+        frequency: Tone frequency in cycles per second (Hz).
+        duration: Tone length in seconds.
+        sample_rate: Number of samples per second.
+        volume: Amplitude multiplier applied to the unit sine wave.
+
+    Returns:
+        A 1-D ``np.float32`` array holding ``int(duration * sample_rate)``
+        samples (empty when that count is not positive), or ``None`` when
+        ``AUDIO_AVAILABLE`` is false.
+    """
+    if not AUDIO_AVAILABLE:
+        return None
+    frames = int(duration * sample_rate)
+    # Evaluate the sine over the whole index range in one vectorized call
+    # instead of filling the buffer sample by sample in a Python loop.
+    sample_index = np.arange(frames)
+    samples = volume * np.sin(2 * np.pi * frequency * sample_index / sample_rate)
+    return samples.astype(np.float32)
+def play_sound(sound_type):
+    """Play a built-in alarm pattern or a custom ``.wav`` file.
+
+    Args:
+        sound_type: Either the name of a built-in pattern ("Beep", "Siren",
+            "Chime", "Alert", "Buzzer", "Ring") or the path of an existing
+            file whose name ends in ``.wav``. Any other value plays nothing.
+
+    Returns:
+        None. Errors raised while setting up or playing audio are printed
+        rather than propagated, so a failed alarm never crashes the caller.
+    """
+    if not AUDIO_AVAILABLE:
+        print(f"Alarm: {sound_type} (audio not available)")
+        return
+    chunk_frames = 1024  # frames read from the wav file per write
+    tone_rate = 44100  # must match the rate the tone samples are generated at
+    # Each pattern is a sequence of (frequency in Hz, duration in seconds).
+    sound_patterns = {
+        "Beep": [(440, 0.2), (440, 0.2)],
+        "Siren": [(600, 0.1), (800, 0.1), (600, 0.1), (800, 0.1)],
+        "Chime": [(523, 0.3), (659, 0.3), (784, 0.5)],
+        "Alert": [(1000, 0.1), (1500, 0.1), (2000, 0.1)],
+        "Buzzer": [(200, 0.5)],
+        "Ring": [(800, 0.2), (600, 0.2), (800, 0.2), (600, 0.2)],
+    }
+    p = None
+    stream = None
+    try:
+        p = pyaudio.PyAudio()
+        # Check if sound_type is a path to a custom .wav file
+        if isinstance(sound_type, str) and sound_type.endswith('.wav') and os.path.exists(sound_type):
+            with wave.open(sound_type, 'rb') as wf:
+                stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
+                                channels=wf.getnchannels(),
+                                rate=wf.getframerate(),
+                                output=True)
+                data = wf.readframes(chunk_frames)
+                while data:
+                    stream.write(data)
+                    data = wf.readframes(chunk_frames)
+        elif sound_type in sound_patterns:
+            # Only open the output device when there is something to play.
+            stream = p.open(format=pyaudio.paFloat32,
+                            channels=1,
+                            rate=tone_rate,
+                            output=True)
+            for freq, duration in sound_patterns[sound_type]:
+                tone = generate_tone(freq, duration, sample_rate=tone_rate)
+                if tone is not None:
+                    stream.write(tone.tobytes())
+    except Exception as e:
+        print(f"Error playing sound: {e}")
+    finally:
+        # Release the stream even when playback failed part-way; otherwise
+        # the output device stays open after an error.
+        if stream is not None:
+            try:
+                stream.stop_stream()
+                stream.close()
+            except Exception as e:
+                print(f"Error closing audio stream: {e}")
+        if p is not None:
+            p.terminate()
+class AlarmSystem:
+    """Queue alarm sounds and play them on a background daemon thread."""
+
+    def __init__(self):
+        """Create the alarm queue and start the worker thread."""
+        # Initialise every attribute before the worker thread starts running.
+        self.alarm_queue = queue.Queue()
+        self.last_alarm_time = 0  # time.time() value of the last accepted alarm
+        self.alarm_cooldown = 2  # seconds between alarms
+        self.alarm_thread = threading.Thread(target=self._alarm_worker, daemon=True)
+        self.alarm_thread.start()
+
+    def _alarm_worker(self):
+        """Worker thread for playing alarms.
+
+        Loops forever, taking queued sound types and passing each truthy one
+        to ``play_sound``. Errors are printed so the loop keeps running.
+        """
+        while True:
+            try:
+                sound_type = self.alarm_queue.get(timeout=1)
+            except queue.Empty:
+                continue
+            try:
+                if sound_type:
+                    play_sound(sound_type)
+            except Exception as e:
+                print(f"Alarm worker error: {e}")
+            finally:
+                # Balance every successful get(), even when playback raised,
+                # so alarm_queue.join() cannot block forever.
+                self.alarm_queue.task_done()
+
+    def trigger_alarm(self, sound_type):
+        """Trigger an alarm with cooldown.
+
+        Args:
+            sound_type: Value queued for the worker to play.
+
+        Returns:
+            True when the alarm was queued, False when it was suppressed
+            because the previous alarm is still within ``alarm_cooldown``.
+        """
+        current_time = time.time()
+        elapsed = current_time - self.last_alarm_time
+        # A negative elapsed time means the wall clock was set backwards;
+        # treat the cooldown as expired rather than muting alarms until the
+        # clock catches up again.
+        if elapsed > self.alarm_cooldown or elapsed < 0:
+            self.alarm_queue.put(sound_type)
+            self.last_alarm_time = current_time
+            return True
+        return False
 def draw_detections(image, face_results, object_results, show_labels, box_color):
     """Draw detection boxes on image using PIL."""
     if enable_objects:
         object_results = detect_objects(image, object_net, object_classes, object_conf)
+    return image.copy(), face_results, object_results