InspiratioNULL
committed on
Commit
·
f24f82b
unverified
·
0
Parent(s):
Initial Commit
Browse files- .gitattributes +35 -0
- CLIP_ImageEncoder.mlmodelc/analytics/coremldata.bin +3 -0
- CLIP_ImageEncoder.mlmodelc/coremldata.bin +3 -0
- CLIP_ImageEncoder.mlmodelc/metadata.json +81 -0
- CLIP_ImageEncoder.mlmodelc/model.mil +0 -0
- CLIP_ImageEncoder.mlmodelc/weights/weight.bin +3 -0
- CLIP_ImageEncoder.swift +307 -0
- CLIP_TextEncoder.mlmodelc/analytics/coremldata.bin +3 -0
- CLIP_TextEncoder.mlmodelc/coremldata.bin +3 -0
- CLIP_TextEncoder.mlmodelc/metadata.json +81 -0
- CLIP_TextEncoder.mlmodelc/model.mil +0 -0
- CLIP_TextEncoder.mlmodelc/weights/weight.bin +3 -0
- CLIP_TextEncoder.swift +313 -0
- README.md +165 -0
- merges.txt +0 -0
- open_clip_config.json +30 -0
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer_config.json +33 -0
- vocab.json +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodelc filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
CLIP_ImageEncoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d4292a5186e1d7b5fee66d202e904b13f8036e3a19fb7b91444478b0bf997ea
|
| 3 |
+
size 243
|
CLIP_ImageEncoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68a77b743ee02d373e0f0a152d5146ea1eb9860a9c81b1979576a99972a9dcc2
|
| 3 |
+
size 472
|
CLIP_ImageEncoder.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"shortDescription" : "CLIP ViT-B\/32 model trained with DataComp-1B (Image Encoder Model)",
|
| 4 |
+
"metadataOutputVersion" : "3.0",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 512)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 512]",
|
| 13 |
+
"name" : "var_1240",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"version" : "1.0.0",
|
| 18 |
+
"modelParameters" : [
|
| 19 |
+
|
| 20 |
+
],
|
| 21 |
+
"author" : "InspiratioNULL 2026",
|
| 22 |
+
"specificationVersion" : 6,
|
| 23 |
+
"storagePrecision" : "Float16",
|
| 24 |
+
"license" : "MIT",
|
| 25 |
+
"mlProgramOperationTypeHistogram" : {
|
| 26 |
+
"Concat" : 1,
|
| 27 |
+
"Linear" : 49,
|
| 28 |
+
"SliceByIndex" : 37,
|
| 29 |
+
"LayerNorm" : 26,
|
| 30 |
+
"Transpose" : 85,
|
| 31 |
+
"Matmul" : 24,
|
| 32 |
+
"Gelu" : 12,
|
| 33 |
+
"Softmax" : 12,
|
| 34 |
+
"Mul" : 13,
|
| 35 |
+
"Cast" : 2,
|
| 36 |
+
"Reshape" : 109,
|
| 37 |
+
"Add" : 26,
|
| 38 |
+
"ExpandDims" : 12,
|
| 39 |
+
"Squeeze" : 12,
|
| 40 |
+
"Conv" : 1
|
| 41 |
+
},
|
| 42 |
+
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 43 |
+
"stateSchema" : [
|
| 44 |
+
|
| 45 |
+
],
|
| 46 |
+
"isUpdatable" : "0",
|
| 47 |
+
"availability" : {
|
| 48 |
+
"macOS" : "12.0",
|
| 49 |
+
"tvOS" : "15.0",
|
| 50 |
+
"visionOS" : "1.0",
|
| 51 |
+
"watchOS" : "8.0",
|
| 52 |
+
"iOS" : "15.0",
|
| 53 |
+
"macCatalyst" : "15.0"
|
| 54 |
+
},
|
| 55 |
+
"modelType" : {
|
| 56 |
+
"name" : "MLModelType_mlProgram"
|
| 57 |
+
},
|
| 58 |
+
"inputSchema" : [
|
| 59 |
+
{
|
| 60 |
+
"height" : "224",
|
| 61 |
+
"colorspace" : "RGB",
|
| 62 |
+
"isOptional" : "0",
|
| 63 |
+
"width" : "224",
|
| 64 |
+
"isColor" : "1",
|
| 65 |
+
"formattedType" : "Image (Color 224 × 224)",
|
| 66 |
+
"hasSizeFlexibility" : "0",
|
| 67 |
+
"type" : "Image",
|
| 68 |
+
"shortDescription" : "",
|
| 69 |
+
"name" : "image"
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"userDefinedMetadata" : {
|
| 73 |
+
"com.github.apple.coremltools.conversion_date" : "2026-01-15",
|
| 74 |
+
"com.github.apple.coremltools.source" : "torch==2.9.1",
|
| 75 |
+
"com.github.apple.coremltools.version" : "9.0",
|
| 76 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
| 77 |
+
},
|
| 78 |
+
"generatedClassName" : "CLIP_ImageEncoder",
|
| 79 |
+
"method" : "predict"
|
| 80 |
+
}
|
| 81 |
+
]
|
CLIP_ImageEncoder.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
CLIP_ImageEncoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa215c95d527ea6508f368b70329299ffd387d23a193adc891dd971f1d268b56
|
| 3 |
+
size 175709312
|
CLIP_ImageEncoder.swift
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
///MARK: This is the generated class file, useful for proper implementation.
|
| 2 |
+
//Created by InspiratioNULL on 1/20/2026
|
| 3 |
+
// CLIP_ImageEncoder.swift
|
| 4 |
+
//
|
| 5 |
+
// This file was automatically generated and should not be edited.
|
| 6 |
+
//
|
| 7 |
+
|
| 8 |
+
import CoreML
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
/// Model Prediction Input Type
///
/// Feature provider exposing the single `image` input of the CLIP image encoder.
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
public class CLIP_ImageEncoderInput : MLFeatureProvider {

    /// image as color (kCVPixelFormatType_32BGRA) image buffer, 224 pixels wide by 224 pixels high
    ///
    /// Note: the `CGImage`/`URL` helpers below request `kCVPixelFormatType_32ARGB` buffers.
    public var image: CVPixelBuffer

    /// Names of the features this provider can supply; only `"image"`.
    public var featureNames: Set<String> { ["image"] }

    /// Returns the pixel buffer wrapped in an `MLFeatureValue` for `"image"`,
    /// and `nil` for every other feature name.
    public func featureValue(for featureName: String) -> MLFeatureValue? {
        if featureName == "image" {
            return MLFeatureValue(pixelBuffer: image)
        }
        return nil
    }

    /// Creates an input from an existing pixel buffer, stored as-is.
    public init(image: CVPixelBuffer) {
        self.image = image
    }

    /// Creates an input from a `CGImage`, converted to a 224 x 224 pixel buffer.
    ///
    /// - throws: whatever `MLFeatureValue(cgImage:...)` throws.
    public convenience init(imageWith image: CGImage) throws {
        // The buffer value is force-unwrapped: a feature value built from an image is expected to carry one.
        self.init(image: try MLFeatureValue(cgImage: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!)
    }

    /// Creates an input from an image file, converted to a 224 x 224 pixel buffer.
    ///
    /// - throws: whatever `MLFeatureValue(imageAt:...)` throws.
    public convenience init(imageAt image: URL) throws {
        self.init(image: try MLFeatureValue(imageAt: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!)
    }

    /// Replaces `image` with a 224 x 224 pixel buffer created from a `CGImage`.
    func setImage(with image: CGImage) throws {
        self.image = try MLFeatureValue(cgImage: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!
    }

    /// Replaces `image` with a 224 x 224 pixel buffer created from an image file.
    func setImage(with image: URL) throws {
        self.image = try MLFeatureValue(imageAt: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!
    }

}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
/// Model Prediction Output Type
///
/// Thin wrapper around the feature provider returned by a prediction, with typed
/// accessors for the single `var_1240` output.
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
public class CLIP_ImageEncoderOutput : MLFeatureProvider {

    /// Source provided by CoreML
    private let provider : MLFeatureProvider

    /// var_1240 as 1 by 512 matrix of floats
    ///
    /// Force-unwrapped: traps if the wrapped provider has no multi-array feature named `"var_1240"`.
    public var var_1240: MLMultiArray {
        provider.featureValue(for: "var_1240")!.multiArrayValue!
    }

    /// var_1240 as 1 by 512 matrix of floats, converted to an `MLShapedArray<Float>`
    public var var_1240ShapedArray: MLShapedArray<Float> {
        MLShapedArray<Float>(var_1240)
    }

    /// Feature names reported by the wrapped provider.
    public var featureNames: Set<String> {
        provider.featureNames
    }

    /// Forwards the lookup to the wrapped provider.
    public func featureValue(for featureName: String) -> MLFeatureValue? {
        provider.featureValue(for: featureName)
    }

    /// Builds an output around a caller-supplied multi-array, stored under `"var_1240"`.
    public init(var_1240: MLMultiArray) {
        // `try!`: a failure to build the dictionary provider traps instead of throwing.
        self.provider = try! MLDictionaryFeatureProvider(dictionary: ["var_1240" : MLFeatureValue(multiArray: var_1240)])
    }

    /// Wraps the feature provider returned by a model prediction.
    public init(features: MLFeatureProvider) {
        self.provider = features
    }
}
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
/// Class for model loading and prediction
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
public class CLIP_ImageEncoder {
    /// Underlying Core ML model that every prediction call is forwarded to.
    public let model: MLModel

    /// URL of model assuming it was installed in the same bundle as this class
    public class var urlOfModelInThisBundle : URL {
        let bundle = Bundle(for: self)
        // Force-unwrapped: traps if the bundle contains no CLIP_ImageEncoder.mlmodelc resource.
        return bundle.url(forResource: "CLIP_ImageEncoder", withExtension:"mlmodelc")!
    }

    /**
        Construct CLIP_ImageEncoder instance with an existing MLModel object.

        Usually the application does not use this initializer unless it makes a subclass of CLIP_ImageEncoder.
        Such application may want to use `MLModel(contentsOf:configuration:)` and `CLIP_ImageEncoder.urlOfModelInThisBundle` to create a MLModel object to pass-in.

        - parameters:
          - model: MLModel object
    */
    public init(model: MLModel) {
        self.model = model
    }

    /**
        Construct a model with configuration

        - parameters:
           - configuration: the desired model configuration

        - throws: an NSError object that describes the problem
    */
    public convenience init(configuration: MLModelConfiguration = MLModelConfiguration()) throws {
        try self.init(contentsOf: type(of:self).urlOfModelInThisBundle, configuration: configuration)
    }

    /**
        Construct CLIP_ImageEncoder instance with explicit path to mlmodelc file

        - parameters:
           - modelURL: the file url of the model

        - throws: an NSError object that describes the problem
    */
    public convenience init(contentsOf modelURL: URL) throws {
        try self.init(model: MLModel(contentsOf: modelURL))
    }

    /**
        Construct a model with URL of the .mlmodelc directory and configuration

        - parameters:
           - modelURL: the file url of the model
           - configuration: the desired model configuration

        - throws: an NSError object that describes the problem
    */
    public convenience init(contentsOf modelURL: URL, configuration: MLModelConfiguration) throws {
        try self.init(model: MLModel(contentsOf: modelURL, configuration: configuration))
    }

    /**
        Construct CLIP_ImageEncoder instance asynchronously with optional configuration.

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - configuration: the desired model configuration
          - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
    */
    public class func load(configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_ImageEncoder, Error>) -> Void) {
        load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration, completionHandler: handler)
    }

    /**
        Construct CLIP_ImageEncoder instance asynchronously with optional configuration.

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - configuration: the desired model configuration

        - throws: the error reported by the underlying model load

        - returns: a CLIP_ImageEncoder wrapping the loaded model
    */
    public class func load(configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_ImageEncoder {
        try await load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration)
    }

    /**
        Construct CLIP_ImageEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration.

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - modelURL: the URL to the model
          - configuration: the desired model configuration
          - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
    */
    public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_ImageEncoder, Error>) -> Void) {
        MLModel.load(contentsOf: modelURL, configuration: configuration) { result in
            switch result {
            case .failure(let error):
                handler(.failure(error))
            case .success(let model):
                // Wrap the raw MLModel so callers receive the typed interface.
                handler(.success(CLIP_ImageEncoder(model: model)))
            }
        }
    }

    /**
        Construct CLIP_ImageEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration.

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - modelURL: the URL to the model
          - configuration: the desired model configuration

        - throws: the error reported by the underlying model load

        - returns: a CLIP_ImageEncoder wrapping the loaded model
    */
    public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_ImageEncoder {
        let model = try await MLModel.load(contentsOf: modelURL, configuration: configuration)
        return CLIP_ImageEncoder(model: model)
    }

    /**
        Make a prediction using the structured interface

        It uses the default function if the model has multiple functions.

        - parameters:
           - input: the input to the prediction as CLIP_ImageEncoderInput

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_ImageEncoderOutput
    */
    public func prediction(input: CLIP_ImageEncoderInput) throws -> CLIP_ImageEncoderOutput {
        try prediction(input: input, options: MLPredictionOptions())
    }

    /**
        Make a prediction using the structured interface

        It uses the default function if the model has multiple functions.

        - parameters:
           - input: the input to the prediction as CLIP_ImageEncoderInput
           - options: prediction options

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_ImageEncoderOutput
    */
    public func prediction(input: CLIP_ImageEncoderInput, options: MLPredictionOptions) throws -> CLIP_ImageEncoderOutput {
        let outFeatures = try model.prediction(from: input, options: options)
        return CLIP_ImageEncoderOutput(features: outFeatures)
    }

    /**
        Make an asynchronous prediction using the structured interface

        It uses the default function if the model has multiple functions.

        - parameters:
           - input: the input to the prediction as CLIP_ImageEncoderInput
           - options: prediction options

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_ImageEncoderOutput
    */
    @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, visionOS 1.0, *)
    public func prediction(input: CLIP_ImageEncoderInput, options: MLPredictionOptions = MLPredictionOptions()) async throws -> CLIP_ImageEncoderOutput {
        let outFeatures = try await model.prediction(from: input, options: options)
        return CLIP_ImageEncoderOutput(features: outFeatures)
    }

    /**
        Make a prediction using the convenience interface

        It uses the default function if the model has multiple functions.

        - parameters:
            - image: color (kCVPixelFormatType_32BGRA) image buffer, 224 pixels wide by 224 pixels high

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_ImageEncoderOutput
    */
    public func prediction(image: CVPixelBuffer) throws -> CLIP_ImageEncoderOutput {
        let input_ = CLIP_ImageEncoderInput(image: image)
        return try prediction(input: input_)
    }

    /**
        Make a batch prediction using the structured interface

        It uses the default function if the model has multiple functions.

        - parameters:
           - inputs: the inputs to the prediction as [CLIP_ImageEncoderInput]
           - options: prediction options

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as [CLIP_ImageEncoderOutput]
    */
    public func predictions(inputs: [CLIP_ImageEncoderInput], options: MLPredictionOptions = MLPredictionOptions()) throws -> [CLIP_ImageEncoderOutput] {
        let batchIn = MLArrayBatchProvider(array: inputs)
        let batchOut = try model.predictions(from: batchIn, options: options)
        // Wrap each provider of the output batch, in batch order.
        return (0..<batchOut.count).map { index in
            CLIP_ImageEncoderOutput(features: batchOut.features(at: index))
        }
    }
}
|
CLIP_TextEncoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1c58c2cf62ade0826a10b4441c8bb7e4fcd64a0fe5db2ed2d0a77cb8c6bbc0c
|
| 3 |
+
size 243
|
CLIP_TextEncoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ea6f6dfb60c6f0f8d80c2e06500b1bf6a835a114aad66094dc4bf097f238935
|
| 3 |
+
size 478
|
CLIP_TextEncoder.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"shortDescription" : "CLIP ViT-B\/32 model trained with DataComp-1B (Text Encoder Model)",
|
| 4 |
+
"metadataOutputVersion" : "3.0",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 512)",
|
| 11 |
+
"shortDescription" : "--",
|
| 12 |
+
"shape" : "[1, 512]",
|
| 13 |
+
"name" : "var_1317",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"version" : "1.0.0",
|
| 18 |
+
"modelParameters" : [
|
| 19 |
+
|
| 20 |
+
],
|
| 21 |
+
"author" : "InspiratioNULL 2026",
|
| 22 |
+
"specificationVersion" : 6,
|
| 23 |
+
"storagePrecision" : "Float16",
|
| 24 |
+
"license" : "MIT",
|
| 25 |
+
"mlProgramOperationTypeHistogram" : {
|
| 26 |
+
"Linear" : 49,
|
| 27 |
+
"SliceByIndex" : 36,
|
| 28 |
+
"LayerNorm" : 25,
|
| 29 |
+
"Transpose" : 84,
|
| 30 |
+
"Matmul" : 24,
|
| 31 |
+
"Gelu" : 12,
|
| 32 |
+
"Stack" : 1,
|
| 33 |
+
"Softmax" : 12,
|
| 34 |
+
"Squeeze" : 12,
|
| 35 |
+
"Reshape" : 108,
|
| 36 |
+
"Mul" : 12,
|
| 37 |
+
"Add" : 37,
|
| 38 |
+
"ExpandDims" : 12,
|
| 39 |
+
"ReduceArgmax" : 1,
|
| 40 |
+
"Gather" : 1,
|
| 41 |
+
"GatherNd" : 1,
|
| 42 |
+
"Cast" : 1
|
| 43 |
+
},
|
| 44 |
+
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 45 |
+
"stateSchema" : [
|
| 46 |
+
|
| 47 |
+
],
|
| 48 |
+
"isUpdatable" : "0",
|
| 49 |
+
"availability" : {
|
| 50 |
+
"macOS" : "12.0",
|
| 51 |
+
"tvOS" : "15.0",
|
| 52 |
+
"visionOS" : "1.0",
|
| 53 |
+
"watchOS" : "8.0",
|
| 54 |
+
"iOS" : "15.0",
|
| 55 |
+
"macCatalyst" : "15.0"
|
| 56 |
+
},
|
| 57 |
+
"modelType" : {
|
| 58 |
+
"name" : "MLModelType_mlProgram"
|
| 59 |
+
},
|
| 60 |
+
"inputSchema" : [
|
| 61 |
+
{
|
| 62 |
+
"hasShapeFlexibility" : "0",
|
| 63 |
+
"isOptional" : "0",
|
| 64 |
+
"dataType" : "Int32",
|
| 65 |
+
"formattedType" : "MultiArray (Int32 1 × 77)",
|
| 66 |
+
"shortDescription" : "--",
|
| 67 |
+
"shape" : "[1, 77]",
|
| 68 |
+
"name" : "text",
|
| 69 |
+
"type" : "MultiArray"
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"userDefinedMetadata" : {
|
| 73 |
+
"com.github.apple.coremltools.conversion_date" : "2026-01-15",
|
| 74 |
+
"com.github.apple.coremltools.source" : "torch==2.9.1",
|
| 75 |
+
"com.github.apple.coremltools.version" : "9.0",
|
| 76 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript"
|
| 77 |
+
},
|
| 78 |
+
"generatedClassName" : "CLIP_TextEncoder",
|
| 79 |
+
"method" : "predict"
|
| 80 |
+
}
|
| 81 |
+
]
|
CLIP_TextEncoder.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
CLIP_TextEncoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fa0dfd9975b5e10a37e3cd8cc4a563d28be2eeb7fc84e575ba74c0136176a01
|
| 3 |
+
size 126878848
|
CLIP_TextEncoder.swift
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
///MARK: This is the generated class file, useful for proper implementation.
|
| 2 |
+
//Created by InspiratioNULL on 1/20/2026
|
| 3 |
+
// CLIP_TextEncoder.swift
|
| 4 |
+
//
|
| 5 |
+
// This file was automatically generated and should not be edited.
|
| 6 |
+
//
|
| 7 |
+
|
| 8 |
+
import CoreML
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
/// Model Prediction Input Type
///
/// Feature provider exposing the single `text` input of the CLIP text encoder.
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
public class CLIP_TextEncoderInput : MLFeatureProvider {

    /// text as 1 by 77 matrix of 32-bit integers
    public var text: MLMultiArray

    /// Names of the features this provider can supply; only `"text"`.
    public var featureNames: Set<String> { ["text"] }

    /// Returns the multi-array wrapped in an `MLFeatureValue` for `"text"`,
    /// and `nil` for every other feature name.
    public func featureValue(for featureName: String) -> MLFeatureValue? {
        if featureName == "text" {
            return MLFeatureValue(multiArray: text)
        }
        return nil
    }

    /// Creates an input from an existing multi-array, stored as-is.
    public init(text: MLMultiArray) {
        self.text = text
    }

    /// Creates an input from a shaped array by converting it to an `MLMultiArray`.
    public convenience init(text: MLShapedArray<Int32>) {
        self.init(text: MLMultiArray(text))
    }

}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
/// Model Prediction Output Type
///
/// Thin wrapper around the feature provider returned by a prediction, with typed
/// accessors for the single `var_1317` output.
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
public class CLIP_TextEncoderOutput : MLFeatureProvider {

    /// Source provided by CoreML
    private let provider : MLFeatureProvider

    /// var_1317 as 1 by 512 matrix of floats
    ///
    /// Force-unwrapped: traps if the wrapped provider has no multi-array feature named `"var_1317"`.
    public var var_1317: MLMultiArray {
        provider.featureValue(for: "var_1317")!.multiArrayValue!
    }

    /// var_1317 as 1 by 512 matrix of floats, converted to an `MLShapedArray<Float>`
    public var var_1317ShapedArray: MLShapedArray<Float> {
        MLShapedArray<Float>(var_1317)
    }

    /// Feature names reported by the wrapped provider.
    public var featureNames: Set<String> {
        provider.featureNames
    }

    /// Forwards the lookup to the wrapped provider.
    public func featureValue(for featureName: String) -> MLFeatureValue? {
        provider.featureValue(for: featureName)
    }

    /// Builds an output around a caller-supplied multi-array, stored under `"var_1317"`.
    public init(var_1317: MLMultiArray) {
        // `try!`: a failure to build the dictionary provider traps instead of throwing.
        self.provider = try! MLDictionaryFeatureProvider(dictionary: ["var_1317" : MLFeatureValue(multiArray: var_1317)])
    }

    /// Wraps the feature provider returned by a model prediction.
    public init(features: MLFeatureProvider) {
        self.provider = features
    }
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
/// Class for model loading and prediction
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
public class CLIP_TextEncoder {
    /// The underlying Core ML model every prediction call is forwarded to.
    public let model: MLModel

    /// URL of model assuming it was installed in the same bundle as this class
    ///
    /// The lookup is force-unwrapped: this traps at runtime when the bundle
    /// contains no `CLIP_TextEncoder.mlmodelc` resource.
    public class var urlOfModelInThisBundle : URL {
        let bundle = Bundle(for: self)
        return bundle.url(forResource: "CLIP_TextEncoder", withExtension:"mlmodelc")!
    }

    /**
        Construct CLIP_TextEncoder instance with an existing MLModel object.

        Usually the application does not use this initializer unless it makes a subclass of CLIP_TextEncoder.
        Such application may want to use `MLModel(contentsOf:configuration:)` and `CLIP_TextEncoder.urlOfModelInThisBundle` to create a MLModel object to pass-in.

        - parameters:
          - model: MLModel object
    */
    public init(model: MLModel) {
        self.model = model
    }

    /**
        Construct a model with configuration, loading it from `urlOfModelInThisBundle`

        - parameters:
           - configuration: the desired model configuration

        - throws: an NSError object that describes the problem
    */
    public convenience init(configuration: MLModelConfiguration = MLModelConfiguration()) throws {
        try self.init(contentsOf: type(of:self).urlOfModelInThisBundle, configuration: configuration)
    }

    /**
        Construct CLIP_TextEncoder instance with explicit path to mlmodelc file

        - parameters:
           - modelURL: the file url of the model

        - throws: an NSError object that describes the problem
    */
    public convenience init(contentsOf modelURL: URL) throws {
        try self.init(model: MLModel(contentsOf: modelURL))
    }

    /**
        Construct a model with URL of the .mlmodelc directory and configuration

        - parameters:
           - modelURL: the file url of the model
           - configuration: the desired model configuration

        - throws: an NSError object that describes the problem
    */
    public convenience init(contentsOf modelURL: URL, configuration: MLModelConfiguration) throws {
        try self.init(model: MLModel(contentsOf: modelURL, configuration: configuration))
    }

    /**
        Construct CLIP_TextEncoder instance asynchronously with optional configuration, reporting the result to a completion handler.

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - configuration: the desired model configuration
          - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
    */
    public class func load(configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_TextEncoder, Error>) -> Void) {
        load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration, completionHandler: handler)
    }

    /**
        Construct CLIP_TextEncoder instance asynchronously with optional configuration (async/await variant).

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - configuration: the desired model configuration

        - throws: the error produced by model loading

        - returns: the loaded CLIP_TextEncoder
    */
    public class func load(configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_TextEncoder {
        try await load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration)
    }

    /**
        Construct CLIP_TextEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration, reporting the result to a completion handler.

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - modelURL: the URL to the model
          - configuration: the desired model configuration
          - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
    */
    public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_TextEncoder, Error>) -> Void) {
        MLModel.load(contentsOf: modelURL, configuration: configuration) { result in
            switch result {
            case .failure(let error):
                handler(.failure(error))
            case .success(let model):
                handler(.success(CLIP_TextEncoder(model: model)))
            }
        }
    }

    /**
        Construct CLIP_TextEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration (async/await variant).

        Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

        - parameters:
          - modelURL: the URL to the model
          - configuration: the desired model configuration

        - throws: the error produced by model loading

        - returns: the loaded CLIP_TextEncoder
    */
    public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_TextEncoder {
        let model = try await MLModel.load(contentsOf: modelURL, configuration: configuration)
        return CLIP_TextEncoder(model: model)
    }

    /**
        Make a prediction using the structured interface with default prediction options

        It uses the default function if the model has multiple functions.

        - parameters:
           - input: the input to the prediction as CLIP_TextEncoderInput

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_TextEncoderOutput
    */
    public func prediction(input: CLIP_TextEncoderInput) throws -> CLIP_TextEncoderOutput {
        try prediction(input: input, options: MLPredictionOptions())
    }

    /**
        Make a prediction using the structured interface

        It uses the default function if the model has multiple functions.

        - parameters:
           - input: the input to the prediction as CLIP_TextEncoderInput
           - options: prediction options

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_TextEncoderOutput
    */
    public func prediction(input: CLIP_TextEncoderInput, options: MLPredictionOptions) throws -> CLIP_TextEncoderOutput {
        let outFeatures = try model.prediction(from: input, options: options)
        return CLIP_TextEncoderOutput(features: outFeatures)
    }

    /**
        Make an asynchronous prediction using the structured interface

        It uses the default function if the model has multiple functions.

        - parameters:
           - input: the input to the prediction as CLIP_TextEncoderInput
           - options: prediction options

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_TextEncoderOutput
    */
    @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, visionOS 1.0, *)
    public func prediction(input: CLIP_TextEncoderInput, options: MLPredictionOptions = MLPredictionOptions()) async throws -> CLIP_TextEncoderOutput {
        let outFeatures = try await model.prediction(from: input, options: options)
        return CLIP_TextEncoderOutput(features: outFeatures)
    }

    /**
        Make a prediction using the convenience interface, taking the input as an MLMultiArray

        It uses the default function if the model has multiple functions.

        - parameters:
            - text: 1 by 77 matrix of 32-bit integers

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_TextEncoderOutput
    */
    public func prediction(text: MLMultiArray) throws -> CLIP_TextEncoderOutput {
        let input_ = CLIP_TextEncoderInput(text: text)
        return try prediction(input: input_)
    }

    /**
        Make a prediction using the convenience interface, taking the input as an MLShapedArray

        It uses the default function if the model has multiple functions.

        - parameters:
            - text: 1 by 77 matrix of 32-bit integers

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as CLIP_TextEncoderOutput
    */
    public func prediction(text: MLShapedArray<Int32>) throws -> CLIP_TextEncoderOutput {
        let input_ = CLIP_TextEncoderInput(text: text)
        return try prediction(input: input_)
    }

    /**
        Make a batch prediction using the structured interface

        It uses the default function if the model has multiple functions.

        - parameters:
           - inputs: the inputs to the prediction as [CLIP_TextEncoderInput]
           - options: prediction options

        - throws: an NSError object that describes the problem

        - returns: the result of the prediction as [CLIP_TextEncoderOutput]
    */
    public func predictions(inputs: [CLIP_TextEncoderInput], options: MLPredictionOptions = MLPredictionOptions()) throws -> [CLIP_TextEncoderOutput] {
        let batchIn = MLArrayBatchProvider(array: inputs)
        let batchOut = try model.predictions(from: batchIn, options: options)
        // Wrap each output provider of the batch, in batch order.
        return (0..<batchOut.count).map { index in
            CLIP_TextEncoderOutput(features: batchOut.features(at: index))
        }
    }
}
|
README.md
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
widget:
|
| 4 |
+
- src: >-
|
| 5 |
+
https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
|
| 6 |
+
candidate_labels: playing music, playing sports
|
| 7 |
+
example_title: Cat & Dog
|
| 8 |
+
library_name: open_clip
|
| 9 |
+
datasets:
|
| 10 |
+
- mlfoundations/datacomp_pools
|
| 11 |
+
pipeline_tag: zero-shot-image-classification
|
| 12 |
+
---
|
| 13 |
+
# Model card for CLIP ViT-B-32 trained on DataComp-1B
|
| 14 |
+
|
| 15 |
+
# Table of Contents
|
| 16 |
+
|
| 17 |
+
1. [Model Details](#model-details)
|
| 18 |
+
2. [Uses](#uses)
|
| 19 |
+
3. [Training Details](#training-details)
|
| 20 |
+
4. [Evaluation](#evaluation)
|
| 21 |
+
5. [Acknowledgements](#acknowledgements)
|
| 22 |
+
6. [Citation](#citation)
|
| 23 |
+
7. [How To Get Started With the Model](#how-to-get-started-with-the-model)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Model Details
|
| 27 |
+
|
| 28 |
+
This repo contains a converted model for the CoreML model format.
|
| 29 |
+
```
|
| 30 |
+
CLIP_ImageEncoder.mlmodelc, CLIP_TextEncoder.mlmodelc, + swift classes
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Model Description
|
| 34 |
+
|
| 35 |
+
A CLIP ViT-B/32 model trained on the DataComp-1B dataset (https://github.com/mlfoundations/datacomp) using OpenCLIP (https://github.com/mlfoundations/open_clip).
|
| 36 |
+
|
| 37 |
+
Model training done on the [stability.ai](https://stability.ai/) cluster.
|
| 38 |
+
|
| 39 |
+
# Uses
|
| 40 |
+
|
| 41 |
+
As per the original [OpenAI CLIP model card](https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/model-card.md), this model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models.
|
| 42 |
+
|
| 43 |
+
The OpenAI CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. Additionally, the DataComp paper (https://arxiv.org/abs/2304.14108) includes additional discussion as it relates specifically to the training dataset.
|
| 44 |
+
|
| 45 |
+
## Direct Use
|
| 46 |
+
|
| 47 |
+
Zero-shot image classification, image and text retrieval, among others.
|
| 48 |
+
|
| 49 |
+
## Downstream Use
|
| 50 |
+
|
| 51 |
+
Image classification and other image task fine-tuning, linear probe image classification, image generation guiding and conditioning, among others.
|
| 52 |
+
|
| 53 |
+
## Out-of-Scope Use
|
| 54 |
+
|
| 55 |
+
As per the OpenAI models,
|
| 56 |
+
|
| 57 |
+
**Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful.
|
| 58 |
+
|
| 59 |
+
Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use.
|
| 60 |
+
|
| 61 |
+
# Training Details
|
| 62 |
+
|
| 63 |
+
## Training Data
|
| 64 |
+
|
| 65 |
+
This model was trained with the 1.4 Billion samples of the DataComp-1B dataset (https://arxiv.org/abs/2304.14108).
|
| 66 |
+
|
| 67 |
+
**IMPORTANT NOTE:** The motivation behind dataset creation is to democratize research and experimentation around large-scale multi-modal model training and handling of uncurated, large-scale datasets crawled from the publicly available internet. Our recommendation is therefore to use the dataset for research purposes. Be aware that this large-scale dataset is uncurated. Keep in mind that the uncurated nature of the dataset means that collected links may lead to strongly discomforting and disturbing content for a human viewer. Therefore, please use the demo links with caution and at your own risk. It is possible to extract a “safe” subset by filtering out samples based on the safety tags (using a customized trained NSFW classifier that we built). While this strongly reduces the chance of encountering potentially harmful content when viewing, we cannot entirely exclude the possibility of harmful content still being present in safe mode, so the warning holds there as well. We think that providing the dataset openly to broad research and other interested communities will allow for transparent investigation of benefits that come along with training large-scale models as well as pitfalls and dangers that may stay unreported or unnoticed when working with closed large datasets that remain restricted to a small community. Providing our dataset openly, we however do not recommend using it for creating ready-to-go industrial products, as the basic research about general properties and safety of such large-scale models, which we would like to encourage with this release, is still in progress.
|
| 68 |
+
|
| 69 |
+
## Training Procedure
|
| 70 |
+
|
| 71 |
+
Please see https://arxiv.org/abs/2304.14108.
|
| 72 |
+
|
| 73 |
+
# Evaluation
|
| 74 |
+
|
| 75 |
+
Evaluation done on 38 datasets, using the [DataComp repo](https://github.com/mlfoundations/datacomp) and the [LAION CLIP Benchmark](https://github.com/LAION-AI/CLIP_benchmark).
|
| 76 |
+
|
| 77 |
+
## Testing Data, Factors & Metrics
|
| 78 |
+
|
| 79 |
+
### Testing Data
|
| 80 |
+
|
| 81 |
+
The testing is performed on a suite of 38 datasets. See our paper for more details (https://arxiv.org/abs/2304.14108).
|
| 82 |
+
|
| 83 |
+
## Results
|
| 84 |
+
|
| 85 |
+
The model achieves a 72.7% zero-shot top-1 accuracy on ImageNet-1k. See our paper for more details and results (https://arxiv.org/abs/2304.14108).
|
| 86 |
+
|
| 87 |
+
# Acknowledgements
|
| 88 |
+
|
| 89 |
+
Acknowledging [stability.ai](https://stability.ai/) for the compute used to train this model.
|
| 90 |
+
|
| 91 |
+
# Citation
|
| 92 |
+
|
| 93 |
+
**BibTeX:**
|
| 94 |
+
Conversion To CoreML
|
| 95 |
+
```
|
| 96 |
+
@coreml{
|
| 97 |
+
author = {InspiratioNULL},
|
| 98 |
+
month = Jan,
|
| 99 |
+
year = 2026,
|
| 100 |
+
note = {See CLIP_ImageEncoder.swift &
|
| 101 |
+
CLIP_TextEncoder.swift for
|
| 102 |
+
implementation details.
|
| 103 |
+
|
| 104 |
+
Special Thanks to the individuals
|
| 105 |
+
and organizations below for
|
| 106 |
+
creating this model},
|
| 107 |
+
url = {Inspirationull.com},
|
| 108 |
+
github = {https://github.com/InspiratioNULL},
|
| 109 |
+
huggingface = {huggingface.co/InspiratioNULL},
|
| 110 |
+
}
|
| 111 |
+
```
|
| 112 |
+
DataComp
|
| 113 |
+
```bibtex
|
| 114 |
+
@article{datacomp,
|
| 115 |
+
title={DataComp: In search of the next generation of multimodal datasets},
|
| 116 |
+
author={Samir Yitzhak Gadre, Gabriel Ilharco, Alex Fang, Jonathan Hayase, Georgios Smyrnis, Thao Nguyen, Ryan Marten, Mitchell Wortsman, Dhruba Ghosh, Jieyu Zhang, Eyal Orgad, Rahim Entezari, Giannis Daras, Sarah Pratt, Vivek Ramanujan, Yonatan Bitton, Kalyani Marathe, Stephen Mussmann, Richard Vencu, Mehdi Cherti, Ranjay Krishna, Pang Wei Koh, Olga Saukh, Alexander Ratner, Shuran Song, Hannaneh Hajishirzi, Ali Farhadi, Romain Beaumont, Sewoong Oh, Alex Dimakis, Jenia Jitsev, Yair Carmon, Vaishaal Shankar, Ludwig Schmidt},
|
| 117 |
+
journal={arXiv preprint arXiv:2304.14108},
|
| 118 |
+
year={2023}
|
| 119 |
+
}
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
OpenAI CLIP paper
|
| 124 |
+
```
|
| 125 |
+
@inproceedings{Radford2021LearningTV,
|
| 126 |
+
title={Learning Transferable Visual Models From Natural Language Supervision},
|
| 127 |
+
author={Alec Radford and Jong Wook Kim and Chris Hallacy and A. Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
|
| 128 |
+
booktitle={ICML},
|
| 129 |
+
year={2021}
|
| 130 |
+
}
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
OpenCLIP software
|
| 134 |
+
```
|
| 135 |
+
@software{ilharco_gabriel_2021_5143773,
|
| 136 |
+
author = {Ilharco, Gabriel and
|
| 137 |
+
Wortsman, Mitchell and
|
| 138 |
+
Wightman, Ross and
|
| 139 |
+
Gordon, Cade and
|
| 140 |
+
Carlini, Nicholas and
|
| 141 |
+
Taori, Rohan and
|
| 142 |
+
Dave, Achal and
|
| 143 |
+
Shankar, Vaishaal and
|
| 144 |
+
Namkoong, Hongseok and
|
| 145 |
+
Miller, John and
|
| 146 |
+
Hajishirzi, Hannaneh and
|
| 147 |
+
Farhadi, Ali and
|
| 148 |
+
Schmidt, Ludwig},
|
| 149 |
+
title = {OpenCLIP},
|
| 150 |
+
month = jul,
|
| 151 |
+
year = 2021,
|
| 152 |
+
note = {If you use this software, please cite it as below.},
|
| 153 |
+
publisher = {Zenodo},
|
| 154 |
+
version = {0.1},
|
| 155 |
+
doi = {10.5281/zenodo.5143773},
|
| 156 |
+
url = {https://doi.org/10.5281/zenodo.5143773}
|
| 157 |
+
}
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
# How to Get Started with the Model
|
| 161 |
+
|
| 162 |
+
See https://github.com/mlfoundations/open_clip
|
| 163 |
+
# For CoreML
|
| 164 |
+
See https://huggingface.co/apple
|
| 165 |
+
See https://developer.apple.com/documentation/
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
open_clip_config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_cfg": {
|
| 3 |
+
"embed_dim": 512,
|
| 4 |
+
"vision_cfg": {
|
| 5 |
+
"image_size": 224,
|
| 6 |
+
"layers": 12,
|
| 7 |
+
"width": 768,
|
| 8 |
+
"patch_size": 32
|
| 9 |
+
},
|
| 10 |
+
"text_cfg": {
|
| 11 |
+
"context_length": 77,
|
| 12 |
+
"vocab_size": 49408,
|
| 13 |
+
"width": 512,
|
| 14 |
+
"heads": 8,
|
| 15 |
+
"layers": 12
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"preprocess_cfg": {
|
| 19 |
+
"mean": [
|
| 20 |
+
0.48145466,
|
| 21 |
+
0.4578275,
|
| 22 |
+
0.40821073
|
| 23 |
+
],
|
| 24 |
+
"std": [
|
| 25 |
+
0.26862954,
|
| 26 |
+
0.26130258,
|
| 27 |
+
0.27577711
|
| 28 |
+
]
|
| 29 |
+
}
|
| 30 |
+
}
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<|startoftext|>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "<|endoftext|>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<|endoftext|>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<|endoftext|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": true,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"bos_token": {
|
| 4 |
+
"__type": "AddedToken",
|
| 5 |
+
"content": "<|startoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false
|
| 10 |
+
},
|
| 11 |
+
"clean_up_tokenization_spaces": true,
|
| 12 |
+
"do_lower_case": true,
|
| 13 |
+
"eos_token": {
|
| 14 |
+
"__type": "AddedToken",
|
| 15 |
+
"content": "<|endoftext|>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": true,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false
|
| 20 |
+
},
|
| 21 |
+
"errors": "replace",
|
| 22 |
+
"model_max_length": 77,
|
| 23 |
+
"pad_token": "<|endoftext|>",
|
| 24 |
+
"tokenizer_class": "CLIPTokenizer",
|
| 25 |
+
"unk_token": {
|
| 26 |
+
"__type": "AddedToken",
|
| 27 |
+
"content": "<|endoftext|>",
|
| 28 |
+
"lstrip": false,
|
| 29 |
+
"normalized": true,
|
| 30 |
+
"rstrip": false,
|
| 31 |
+
"single_word": false
|
| 32 |
+
}
|
| 33 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|