Commit f24f82b · committed by InspiratioNULL · unverified · 0 Parent(s)

Initial Commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodelc filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
CLIP_ImageEncoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d4292a5186e1d7b5fee66d202e904b13f8036e3a19fb7b91444478b0bf997ea
+ size 243
CLIP_ImageEncoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68a77b743ee02d373e0f0a152d5146ea1eb9860a9c81b1979576a99972a9dcc2
+ size 472
CLIP_ImageEncoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,81 @@
+ [
+   {
+     "shortDescription" : "CLIP ViT-B\/32 model trained with DataComp-1B (Image Encoder Model)",
+     "metadataOutputVersion" : "3.0",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512)",
+         "shortDescription" : "",
+         "shape" : "[1, 512]",
+         "name" : "var_1240",
+         "type" : "MultiArray"
+       }
+     ],
+     "version" : "1.0.0",
+     "modelParameters" : [
+
+     ],
+     "author" : "InspiratioNULL 2026",
+     "specificationVersion" : 6,
+     "storagePrecision" : "Float16",
+     "license" : "MIT",
+     "mlProgramOperationTypeHistogram" : {
+       "Concat" : 1,
+       "Linear" : 49,
+       "SliceByIndex" : 37,
+       "LayerNorm" : 26,
+       "Transpose" : 85,
+       "Matmul" : 24,
+       "Gelu" : 12,
+       "Softmax" : 12,
+       "Mul" : 13,
+       "Cast" : 2,
+       "Reshape" : 109,
+       "Add" : 26,
+       "ExpandDims" : 12,
+       "Squeeze" : 12,
+       "Conv" : 1
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int32)",
+     "stateSchema" : [
+
+     ],
+     "isUpdatable" : "0",
+     "availability" : {
+       "macOS" : "12.0",
+       "tvOS" : "15.0",
+       "visionOS" : "1.0",
+       "watchOS" : "8.0",
+       "iOS" : "15.0",
+       "macCatalyst" : "15.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "height" : "224",
+         "colorspace" : "RGB",
+         "isOptional" : "0",
+         "width" : "224",
+         "isColor" : "1",
+         "formattedType" : "Image (Color 224 × 224)",
+         "hasSizeFlexibility" : "0",
+         "type" : "Image",
+         "shortDescription" : "",
+         "name" : "image"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.conversion_date" : "2026-01-15",
+       "com.github.apple.coremltools.source" : "torch==2.9.1",
+       "com.github.apple.coremltools.version" : "9.0",
+       "com.github.apple.coremltools.source_dialect" : "TorchScript"
+     },
+     "generatedClassName" : "CLIP_ImageEncoder",
+     "method" : "predict"
+   }
+ ]
CLIP_ImageEncoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
CLIP_ImageEncoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa215c95d527ea6508f368b70329299ffd387d23a193adc891dd971f1d268b56
+ size 175709312
CLIP_ImageEncoder.swift ADDED
@@ -0,0 +1,307 @@
+ // MARK: This is the generated class file, useful for proper implementation.
+ // Created by InspiratioNULL on 1/20/2026
+ // CLIP_ImageEncoder.swift
+ //
+ // This file was automatically generated and should not be edited.
+ //
+
+ import CoreML
+
+
+ /// Model Prediction Input Type
+ @available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
+ public class CLIP_ImageEncoderInput : MLFeatureProvider {
+
+     /// image as color (kCVPixelFormatType_32BGRA) image buffer, 224 pixels wide by 224 pixels high
+     public var image: CVPixelBuffer
+
+     public var featureNames: Set<String> { ["image"] }
+
+     public func featureValue(for featureName: String) -> MLFeatureValue? {
+         if featureName == "image" {
+             return MLFeatureValue(pixelBuffer: image)
+         }
+         return nil
+     }
+
+     public init(image: CVPixelBuffer) {
+         self.image = image
+     }
+
+     public convenience init(imageWith image: CGImage) throws {
+         self.init(image: try MLFeatureValue(cgImage: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!)
+     }
+
+     public convenience init(imageAt image: URL) throws {
+         self.init(image: try MLFeatureValue(imageAt: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!)
+     }
+
+     func setImage(with image: CGImage) throws {
+         self.image = try MLFeatureValue(cgImage: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!
+     }
+
+     func setImage(with image: URL) throws {
+         self.image = try MLFeatureValue(imageAt: image, pixelsWide: 224, pixelsHigh: 224, pixelFormatType: kCVPixelFormatType_32ARGB, options: nil).imageBufferValue!
+     }
+
+ }
+
+
+ /// Model Prediction Output Type
+ @available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
+ public class CLIP_ImageEncoderOutput : MLFeatureProvider {
+
+     /// Source provided by CoreML
+     private let provider : MLFeatureProvider
+
+     /// var_1240 as 1 by 512 matrix of floats
+     public var var_1240: MLMultiArray {
+         provider.featureValue(for: "var_1240")!.multiArrayValue!
+     }
+
+     /// var_1240 as 1 by 512 matrix of floats
+     public var var_1240ShapedArray: MLShapedArray<Float> {
+         MLShapedArray<Float>(var_1240)
+     }
+
+     public var featureNames: Set<String> {
+         provider.featureNames
+     }
+
+     public func featureValue(for featureName: String) -> MLFeatureValue? {
+         provider.featureValue(for: featureName)
+     }
+
+     public init(var_1240: MLMultiArray) {
+         self.provider = try! MLDictionaryFeatureProvider(dictionary: ["var_1240" : MLFeatureValue(multiArray: var_1240)])
+     }
+
+     public init(features: MLFeatureProvider) {
+         self.provider = features
+     }
+ }
+
+
+ /// Class for model loading and prediction
+ @available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
+ public class CLIP_ImageEncoder {
+     public let model: MLModel
+
+     /// URL of model assuming it was installed in the same bundle as this class
+     public class var urlOfModelInThisBundle : URL {
+         let bundle = Bundle(for: self)
+         return bundle.url(forResource: "CLIP_ImageEncoder", withExtension: "mlmodelc")!
+     }
+
+     /**
+         Construct CLIP_ImageEncoder instance with an existing MLModel object.
+
+         Usually the application does not use this initializer unless it makes a subclass of CLIP_ImageEncoder.
+         Such an application may want to use `MLModel(contentsOfURL:configuration:)` and `CLIP_ImageEncoder.urlOfModelInThisBundle` to create an MLModel object to pass in.
+
+         - parameters:
+           - model: MLModel object
+     */
+     public init(model: MLModel) {
+         self.model = model
+     }
+
+     /**
+         Construct a model with configuration
+
+         - parameters:
+           - configuration: the desired model configuration
+
+         - throws: an NSError object that describes the problem
+     */
+     public convenience init(configuration: MLModelConfiguration = MLModelConfiguration()) throws {
+         try self.init(contentsOf: type(of: self).urlOfModelInThisBundle, configuration: configuration)
+     }
+
+     /**
+         Construct CLIP_ImageEncoder instance with explicit path to mlmodelc file
+
+         - parameters:
+           - modelURL: the file url of the model
+
+         - throws: an NSError object that describes the problem
+     */
+     public convenience init(contentsOf modelURL: URL) throws {
+         try self.init(model: MLModel(contentsOf: modelURL))
+     }
+
+     /**
+         Construct a model with URL of the .mlmodelc directory and configuration
+
+         - parameters:
+           - modelURL: the file url of the model
+           - configuration: the desired model configuration
+
+         - throws: an NSError object that describes the problem
+     */
+     public convenience init(contentsOf modelURL: URL, configuration: MLModelConfiguration) throws {
+         try self.init(model: MLModel(contentsOf: modelURL, configuration: configuration))
+     }
+
+     /**
+         Construct CLIP_ImageEncoder instance asynchronously with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - configuration: the desired model configuration
+           - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
+     */
+     public class func load(configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_ImageEncoder, Error>) -> Void) {
+         load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration, completionHandler: handler)
+     }
+
+     /**
+         Construct CLIP_ImageEncoder instance asynchronously with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - configuration: the desired model configuration
+     */
+     public class func load(configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_ImageEncoder {
+         try await load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration)
+     }
+
+     /**
+         Construct CLIP_ImageEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - modelURL: the URL to the model
+           - configuration: the desired model configuration
+           - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
+     */
+     public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_ImageEncoder, Error>) -> Void) {
+         MLModel.load(contentsOf: modelURL, configuration: configuration) { result in
+             switch result {
+             case .failure(let error):
+                 handler(.failure(error))
+             case .success(let model):
+                 handler(.success(CLIP_ImageEncoder(model: model)))
+             }
+         }
+     }
+
+     /**
+         Construct CLIP_ImageEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - modelURL: the URL to the model
+           - configuration: the desired model configuration
+     */
+     public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_ImageEncoder {
+         let model = try await MLModel.load(contentsOf: modelURL, configuration: configuration)
+         return CLIP_ImageEncoder(model: model)
+     }
+
+     /**
+         Make a prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - input: the input to the prediction as CLIP_ImageEncoderInput
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_ImageEncoderOutput
+     */
+     public func prediction(input: CLIP_ImageEncoderInput) throws -> CLIP_ImageEncoderOutput {
+         try prediction(input: input, options: MLPredictionOptions())
+     }
+
+     /**
+         Make a prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - input: the input to the prediction as CLIP_ImageEncoderInput
+           - options: prediction options
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_ImageEncoderOutput
+     */
+     public func prediction(input: CLIP_ImageEncoderInput, options: MLPredictionOptions) throws -> CLIP_ImageEncoderOutput {
+         let outFeatures = try model.prediction(from: input, options: options)
+         return CLIP_ImageEncoderOutput(features: outFeatures)
+     }
+
+     /**
+         Make an asynchronous prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - input: the input to the prediction as CLIP_ImageEncoderInput
+           - options: prediction options
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_ImageEncoderOutput
+     */
+     @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, visionOS 1.0, *)
+     public func prediction(input: CLIP_ImageEncoderInput, options: MLPredictionOptions = MLPredictionOptions()) async throws -> CLIP_ImageEncoderOutput {
+         let outFeatures = try await model.prediction(from: input, options: options)
+         return CLIP_ImageEncoderOutput(features: outFeatures)
+     }
+
+     /**
+         Make a prediction using the convenience interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - image: color (kCVPixelFormatType_32BGRA) image buffer, 224 pixels wide by 224 pixels high
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_ImageEncoderOutput
+     */
+     public func prediction(image: CVPixelBuffer) throws -> CLIP_ImageEncoderOutput {
+         let input_ = CLIP_ImageEncoderInput(image: image)
+         return try prediction(input: input_)
+     }
+
+     /**
+         Make a batch prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - inputs: the inputs to the prediction as [CLIP_ImageEncoderInput]
+           - options: prediction options
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as [CLIP_ImageEncoderOutput]
+     */
+     public func predictions(inputs: [CLIP_ImageEncoderInput], options: MLPredictionOptions = MLPredictionOptions()) throws -> [CLIP_ImageEncoderOutput] {
+         let batchIn = MLArrayBatchProvider(array: inputs)
+         let batchOut = try model.predictions(from: batchIn, options: options)
+         var results : [CLIP_ImageEncoderOutput] = []
+         results.reserveCapacity(inputs.count)
+         for i in 0..<batchOut.count {
+             let outProvider = batchOut.features(at: i)
+             let result = CLIP_ImageEncoderOutput(features: outProvider)
+             results.append(result)
+         }
+         return results
+     }
+ }
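
A minimal usage sketch for the generated class above, assuming `CLIP_ImageEncoder.mlmodelc` is bundled with the app and that `imageURL` (a hypothetical local file) points to a readable image; the generated `imageAt` initializer rescales it to 224 × 224:

```swift
import CoreML

/// Embed one image into CLIP's shared 512-dimensional space.
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *)
func embedImage(at imageURL: URL) throws -> [Float] {
    // Loads the compiled model from the bundle via the generated initializer.
    let encoder = try CLIP_ImageEncoder(configuration: MLModelConfiguration())
    let input = try CLIP_ImageEncoderInput(imageAt: imageURL)
    let output = try encoder.prediction(input: input)
    // var_1240 is the [1, 512] image embedding declared in metadata.json.
    return output.var_1240ShapedArray.scalars
}
```

When the caller is on the main thread, the async `CLIP_ImageEncoder.load(configuration:)` factory is the better fit, since first-time model loading can take noticeable time.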
CLIP_TextEncoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1c58c2cf62ade0826a10b4441c8bb7e4fcd64a0fe5db2ed2d0a77cb8c6bbc0c
+ size 243
CLIP_TextEncoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ea6f6dfb60c6f0f8d80c2e06500b1bf6a835a114aad66094dc4bf097f238935
+ size 478
CLIP_TextEncoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,81 @@
+ [
+   {
+     "shortDescription" : "CLIP ViT-B\/32 model trained with DataComp-1B (Text Encoder Model)",
+     "metadataOutputVersion" : "3.0",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512)",
+         "shortDescription" : "--",
+         "shape" : "[1, 512]",
+         "name" : "var_1317",
+         "type" : "MultiArray"
+       }
+     ],
+     "version" : "1.0.0",
+     "modelParameters" : [
+
+     ],
+     "author" : "InspiratioNULL 2026",
+     "specificationVersion" : 6,
+     "storagePrecision" : "Float16",
+     "license" : "MIT",
+     "mlProgramOperationTypeHistogram" : {
+       "Linear" : 49,
+       "SliceByIndex" : 36,
+       "LayerNorm" : 25,
+       "Transpose" : 84,
+       "Matmul" : 24,
+       "Gelu" : 12,
+       "Stack" : 1,
+       "Softmax" : 12,
+       "Squeeze" : 12,
+       "Reshape" : 108,
+       "Mul" : 12,
+       "Add" : 37,
+       "ExpandDims" : 12,
+       "ReduceArgmax" : 1,
+       "Gather" : 1,
+       "GatherNd" : 1,
+       "Cast" : 1
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int32)",
+     "stateSchema" : [
+
+     ],
+     "isUpdatable" : "0",
+     "availability" : {
+       "macOS" : "12.0",
+       "tvOS" : "15.0",
+       "visionOS" : "1.0",
+       "watchOS" : "8.0",
+       "iOS" : "15.0",
+       "macCatalyst" : "15.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 77)",
+         "shortDescription" : "--",
+         "shape" : "[1, 77]",
+         "name" : "text",
+         "type" : "MultiArray"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.conversion_date" : "2026-01-15",
+       "com.github.apple.coremltools.source" : "torch==2.9.1",
+       "com.github.apple.coremltools.version" : "9.0",
+       "com.github.apple.coremltools.source_dialect" : "TorchScript"
+     },
+     "generatedClassName" : "CLIP_TextEncoder",
+     "method" : "predict"
+   }
+ ]
CLIP_TextEncoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
CLIP_TextEncoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fa0dfd9975b5e10a37e3cd8cc4a563d28be2eeb7fc84e575ba74c0136176a01
+ size 126878848
CLIP_TextEncoder.swift ADDED
@@ -0,0 +1,313 @@
+ // MARK: This is the generated class file, useful for proper implementation.
+ // Created by InspiratioNULL on 1/20/2026
+ // CLIP_TextEncoder.swift
+ //
+ // This file was automatically generated and should not be edited.
+ //
+
+ import CoreML
+
+
+ /// Model Prediction Input Type
+ @available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
+ public class CLIP_TextEncoderInput : MLFeatureProvider {
+
+     /// text as 1 by 77 matrix of 32-bit integers
+     public var text: MLMultiArray
+
+     public var featureNames: Set<String> { ["text"] }
+
+     public func featureValue(for featureName: String) -> MLFeatureValue? {
+         if featureName == "text" {
+             return MLFeatureValue(multiArray: text)
+         }
+         return nil
+     }
+
+     public init(text: MLMultiArray) {
+         self.text = text
+     }
+
+     public convenience init(text: MLShapedArray<Int32>) {
+         self.init(text: MLMultiArray(text))
+     }
+
+ }
+
+
+ /// Model Prediction Output Type
+ @available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
+ public class CLIP_TextEncoderOutput : MLFeatureProvider {
+
+     /// Source provided by CoreML
+     private let provider : MLFeatureProvider
+
+     /// var_1317 as 1 by 512 matrix of floats
+     public var var_1317: MLMultiArray {
+         provider.featureValue(for: "var_1317")!.multiArrayValue!
+     }
+
+     /// var_1317 as 1 by 512 matrix of floats
+     public var var_1317ShapedArray: MLShapedArray<Float> {
+         MLShapedArray<Float>(var_1317)
+     }
+
+     public var featureNames: Set<String> {
+         provider.featureNames
+     }
+
+     public func featureValue(for featureName: String) -> MLFeatureValue? {
+         provider.featureValue(for: featureName)
+     }
+
+     public init(var_1317: MLMultiArray) {
+         self.provider = try! MLDictionaryFeatureProvider(dictionary: ["var_1317" : MLFeatureValue(multiArray: var_1317)])
+     }
+
+     public init(features: MLFeatureProvider) {
+         self.provider = features
+     }
+ }
+
+
+ /// Class for model loading and prediction
+ @available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, visionOS 1.0, *)
+ public class CLIP_TextEncoder {
+     public let model: MLModel
+
+     /// URL of model assuming it was installed in the same bundle as this class
+     public class var urlOfModelInThisBundle : URL {
+         let bundle = Bundle(for: self)
+         return bundle.url(forResource: "CLIP_TextEncoder", withExtension: "mlmodelc")!
+     }
+
+     /**
+         Construct CLIP_TextEncoder instance with an existing MLModel object.
+
+         Usually the application does not use this initializer unless it makes a subclass of CLIP_TextEncoder.
+         Such an application may want to use `MLModel(contentsOfURL:configuration:)` and `CLIP_TextEncoder.urlOfModelInThisBundle` to create an MLModel object to pass in.
+
+         - parameters:
+           - model: MLModel object
+     */
+     public init(model: MLModel) {
+         self.model = model
+     }
+
+     /**
+         Construct a model with configuration
+
+         - parameters:
+           - configuration: the desired model configuration
+
+         - throws: an NSError object that describes the problem
+     */
+     public convenience init(configuration: MLModelConfiguration = MLModelConfiguration()) throws {
+         try self.init(contentsOf: type(of: self).urlOfModelInThisBundle, configuration: configuration)
+     }
+
+     /**
+         Construct CLIP_TextEncoder instance with explicit path to mlmodelc file
+
+         - parameters:
+           - modelURL: the file url of the model
+
+         - throws: an NSError object that describes the problem
+     */
+     public convenience init(contentsOf modelURL: URL) throws {
+         try self.init(model: MLModel(contentsOf: modelURL))
+     }
+
+     /**
+         Construct a model with URL of the .mlmodelc directory and configuration
+
+         - parameters:
+           - modelURL: the file url of the model
+           - configuration: the desired model configuration
+
+         - throws: an NSError object that describes the problem
+     */
+     public convenience init(contentsOf modelURL: URL, configuration: MLModelConfiguration) throws {
+         try self.init(model: MLModel(contentsOf: modelURL, configuration: configuration))
+     }
+
+     /**
+         Construct CLIP_TextEncoder instance asynchronously with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - configuration: the desired model configuration
+           - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
+     */
+     public class func load(configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_TextEncoder, Error>) -> Void) {
+         load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration, completionHandler: handler)
+     }
+
+     /**
+         Construct CLIP_TextEncoder instance asynchronously with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - configuration: the desired model configuration
+     */
+     public class func load(configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_TextEncoder {
+         try await load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration)
+     }
+
+     /**
+         Construct CLIP_TextEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - modelURL: the URL to the model
+           - configuration: the desired model configuration
+           - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully
+     */
+     public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<CLIP_TextEncoder, Error>) -> Void) {
+         MLModel.load(contentsOf: modelURL, configuration: configuration) { result in
+             switch result {
+             case .failure(let error):
+                 handler(.failure(error))
+             case .success(let model):
+                 handler(.success(CLIP_TextEncoder(model: model)))
+             }
+         }
+     }
+
+     /**
+         Construct CLIP_TextEncoder instance asynchronously with URL of the .mlmodelc directory with optional configuration.
+
+         Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+         - parameters:
+           - modelURL: the URL to the model
+           - configuration: the desired model configuration
+     */
+     public class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> CLIP_TextEncoder {
+         let model = try await MLModel.load(contentsOf: modelURL, configuration: configuration)
+         return CLIP_TextEncoder(model: model)
+     }
+
+     /**
+         Make a prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - input: the input to the prediction as CLIP_TextEncoderInput
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_TextEncoderOutput
+     */
+     public func prediction(input: CLIP_TextEncoderInput) throws -> CLIP_TextEncoderOutput {
+         try prediction(input: input, options: MLPredictionOptions())
+     }
+
+     /**
+         Make a prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - input: the input to the prediction as CLIP_TextEncoderInput
+           - options: prediction options
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_TextEncoderOutput
+     */
+     public func prediction(input: CLIP_TextEncoderInput, options: MLPredictionOptions) throws -> CLIP_TextEncoderOutput {
+         let outFeatures = try model.prediction(from: input, options: options)
+         return CLIP_TextEncoderOutput(features: outFeatures)
+     }
+
+     /**
+         Make an asynchronous prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - input: the input to the prediction as CLIP_TextEncoderInput
+           - options: prediction options
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_TextEncoderOutput
+     */
+     @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, visionOS 1.0, *)
+     public func prediction(input: CLIP_TextEncoderInput, options: MLPredictionOptions = MLPredictionOptions()) async throws -> CLIP_TextEncoderOutput {
+         let outFeatures = try await model.prediction(from: input, options: options)
+         return CLIP_TextEncoderOutput(features: outFeatures)
+     }
+
+     /**
+         Make a prediction using the convenience interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - text: 1 by 77 matrix of 32-bit integers
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_TextEncoderOutput
+     */
+     public func prediction(text: MLMultiArray) throws -> CLIP_TextEncoderOutput {
+         let input_ = CLIP_TextEncoderInput(text: text)
+         return try prediction(input: input_)
+     }
+
+     /**
+         Make a prediction using the convenience interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - text: 1 by 77 matrix of 32-bit integers
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as CLIP_TextEncoderOutput
+     */
+     public func prediction(text: MLShapedArray<Int32>) throws -> CLIP_TextEncoderOutput {
+         let input_ = CLIP_TextEncoderInput(text: text)
+         return try prediction(input: input_)
+     }
+
+     /**
+         Make a batch prediction using the structured interface
+
+         It uses the default function if the model has multiple functions.
+
+         - parameters:
+           - inputs: the inputs to the prediction as [CLIP_TextEncoderInput]
+           - options: prediction options
+
+         - throws: an NSError object that describes the problem
+
+         - returns: the result of the prediction as [CLIP_TextEncoderOutput]
+     */
+     public func predictions(inputs: [CLIP_TextEncoderInput], options: MLPredictionOptions = MLPredictionOptions()) throws -> [CLIP_TextEncoderOutput] {
+         let batchIn = MLArrayBatchProvider(array: inputs)
+         let batchOut = try model.predictions(from: batchIn, options: options)
+         var results : [CLIP_TextEncoderOutput] = []
+         results.reserveCapacity(inputs.count)
+         for i in 0..<batchOut.count {
+             let outProvider = batchOut.features(at: i)
+             let result = CLIP_TextEncoderOutput(features: outProvider)
+             results.append(result)
+         }
+         return results
+     }
+ }
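
A matching sketch for the text side. Producing BPE token IDs from a prompt (with the `vocab.json` / `merges.txt` shipped in this repo) is out of scope here, so `tokenIDs` is assumed to already be a 77-element sequence packed as described under `tokenizer_config.json` below:

```swift
import CoreML

/// Embed a pre-tokenized prompt into CLIP's shared 512-dimensional space.
@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *)
func embedText(tokenIDs: [Int32]) throws -> [Float] {
    precondition(tokenIDs.count == 77, "CLIP uses a fixed context length of 77")
    // The model's inputSchema expects an Int32 multi-array of shape [1, 77].
    let text = MLShapedArray<Int32>(scalars: tokenIDs, shape: [1, 77])
    let encoder = try CLIP_TextEncoder(configuration: MLModelConfiguration())
    let output = try encoder.prediction(text: text)
    // var_1317 is the [1, 512] text embedding declared in metadata.json.
    return output.var_1317ShapedArray.scalars
}
```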
README.md ADDED
@@ -0,0 +1,165 @@
+ ---
+ license: mit
+ widget:
+ - src: >-
+     https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
+   candidate_labels: playing music, playing sports
+   example_title: Cat & Dog
+ library_name: open_clip
+ datasets:
+ - mlfoundations/datacomp_pools
+ pipeline_tag: zero-shot-image-classification
+ ---
+ # Model card for CLIP ViT-B/32 trained on DataComp-1B
+
+ # Table of Contents
+
+ 1. [Model Details](#model-details)
+ 2. [Uses](#uses)
+ 3. [Training Details](#training-details)
+ 4. [Evaluation](#evaluation)
+ 5. [Acknowledgements](#acknowledgements)
+ 6. [Citation](#citation)
+ 7. [How To Get Started With the Model](#how-to-get-started-with-the-model)
+
+
+ # Model Details
+
+ This repo contains the model converted to the CoreML format:
+ ```
+ CLIP_ImageEncoder.mlmodelc, CLIP_TextEncoder.mlmodelc, + Swift classes
+ ```
+
+ ## Model Description
+
+ A CLIP ViT-B/32 model trained on DataComp-1B (https://github.com/mlfoundations/datacomp) using OpenCLIP (https://github.com/mlfoundations/open_clip).
+
+ Model training was done on the [stability.ai](https://stability.ai/) cluster.
+
+ # Uses
+
+ As per the original [OpenAI CLIP model card](https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/model-card.md), this model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models.
+
+ The OpenAI CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. Additionally, the DataComp paper (https://arxiv.org/abs/2304.14108) includes further discussion as it relates specifically to the training dataset.
+
+ ## Direct Use
+
+ Zero-shot image classification, image and text retrieval, among others.
+
+ ## Downstream Use
+
+ Image classification and other image task fine-tuning, linear probe image classification, image generation guiding and conditioning, among others.
+
+ ## Out-of-Scope Use
+
+ As per the OpenAI models,
+
+ **Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task-specific testing, especially given the variability of CLIP's performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful.
+
+ Certain use cases which would fall under the domain of surveillance and facial recognition are always out of scope regardless of the performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently, given the lack of testing norms and checks to ensure its fair use.
+
+ # Training Details
+
+ ## Training Data
+
+ This model was trained with the 1.4 billion samples of the DataComp-1B dataset (https://arxiv.org/abs/2304.14108).
+
+ **IMPORTANT NOTE:** The motivation behind dataset creation is to democratize research and experimentation around large-scale multi-modal model training and the handling of uncurated, large-scale datasets crawled from the publicly available internet. Our recommendation is therefore to use the dataset for research purposes. Be aware that this large-scale dataset is uncurated. Keep in mind that the uncurated nature of the dataset means that collected links may lead to strongly discomforting and disturbing content for a human viewer. Therefore, please use the demo links with caution and at your own risk. It is possible to extract a "safe" subset by filtering out samples based on the safety tags (using a customized trained NSFW classifier that we built). While this strongly reduces the chance of encountering potentially harmful content when viewing, we cannot entirely exclude the possibility of harmful content still being present in safe mode, so the warning holds there as well. We think that providing the dataset openly to broad research and other interested communities will allow for transparent investigation of the benefits that come along with training large-scale models, as well as of pitfalls and dangers that may stay unreported or unnoticed when working with closed large datasets that remain restricted to a small community. However, while we provide our dataset openly, we do not recommend using it to create ready-to-go industrial products, as the basic research about the general properties and safety of such large-scale models, which we would like to encourage with this release, is still in progress.
+
+ ## Training Procedure
+
+ Please see https://arxiv.org/abs/2304.14108.
+
+ # Evaluation
+
+ Evaluation was done on 38 datasets, using the [DataComp repo](https://github.com/mlfoundations/datacomp) and the [LAION CLIP Benchmark](https://github.com/LAION-AI/CLIP_benchmark).
+
+ ## Testing Data, Factors & Metrics
+
+ ### Testing Data
+
+ The testing is performed on a suite of 38 datasets. See our paper for more details (https://arxiv.org/abs/2304.14108).
+
+ ## Results
+
+ The model achieves a 72.7% zero-shot top-1 accuracy on ImageNet-1k. See our paper for more details and results (https://arxiv.org/abs/2304.14108).
+
+ # Acknowledgements
+
+ Acknowledging [stability.ai](https://stability.ai/) for the compute used to train this model.
+
+ # Citation
+
+ **BibTeX:**
+
+ Conversion to CoreML
+ ```bibtex
+ @misc{clip_coreml_conversion,
+   author = {InspiratioNULL},
+   month = jan,
+   year = 2026,
+   note = {See CLIP_ImageEncoder.swift and
+           CLIP_TextEncoder.swift for
+           implementation details.
+           Special thanks to the individuals
+           and organizations below for
+           creating this model},
+   url = {Inspirationull.com},
+   github = {https://github.com/InspiratioNULL},
+   huggingface = {huggingface.co/InspiratioNULL}
+ }
+ ```
+
+ DataComp
+ ```bibtex
+ @article{datacomp,
+   title={DataComp: In search of the next generation of multimodal datasets},
+   author={Samir Yitzhak Gadre, Gabriel Ilharco, Alex Fang, Jonathan Hayase, Georgios Smyrnis, Thao Nguyen, Ryan Marten, Mitchell Wortsman, Dhruba Ghosh, Jieyu Zhang, Eyal Orgad, Rahim Entezari, Giannis Daras, Sarah Pratt, Vivek Ramanujan, Yonatan Bitton, Kalyani Marathe, Stephen Mussmann, Richard Vencu, Mehdi Cherti, Ranjay Krishna, Pang Wei Koh, Olga Saukh, Alexander Ratner, Shuran Song, Hannaneh Hajishirzi, Ali Farhadi, Romain Beaumont, Sewoong Oh, Alex Dimakis, Jenia Jitsev, Yair Carmon, Vaishaal Shankar, Ludwig Schmidt},
+   journal={arXiv preprint arXiv:2304.14108},
+   year={2023}
+ }
+ ```
+
+ OpenAI CLIP paper
+ ```bibtex
+ @inproceedings{Radford2021LearningTV,
+   title={Learning Transferable Visual Models From Natural Language Supervision},
+   author={Alec Radford and Jong Wook Kim and Chris Hallacy and A. Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
+   booktitle={ICML},
+   year={2021}
+ }
+ ```
+
+ OpenCLIP software
+ ```bibtex
+ @software{ilharco_gabriel_2021_5143773,
+   author = {Ilharco, Gabriel and
+             Wortsman, Mitchell and
+             Wightman, Ross and
+             Gordon, Cade and
+             Carlini, Nicholas and
+             Taori, Rohan and
+             Dave, Achal and
+             Shankar, Vaishaal and
+             Namkoong, Hongseok and
+             Miller, John and
+             Hajishirzi, Hannaneh and
+             Farhadi, Ali and
+             Schmidt, Ludwig},
+   title = {OpenCLIP},
+   month = jul,
+   year = 2021,
+   note = {If you use this software, please cite it as below.},
+   publisher = {Zenodo},
+   version = {0.1},
+   doi = {10.5281/zenodo.5143773},
+   url = {https://doi.org/10.5281/zenodo.5143773}
+ }
+ ```
+
+ # How to Get Started with the Model
+
+ See https://github.com/mlfoundations/open_clip
+
+ ## For CoreML
+
+ See https://huggingface.co/apple and https://developer.apple.com/documentation/
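
As a concrete starting point for the CoreML classes in this repo, here is a zero-shot scoring sketch that combines the two encoders' outputs: both embeddings are L2-normalized, compared by dot product, and the scaled logits pushed through a softmax. The `100.0` factor matches the exponentiated logit scale CLIP models are usually trained with, but it is an assumption for this conversion, as is the helper itself:

```swift
import Foundation

/// Turn one image embedding and several candidate-label text embeddings
/// (e.g. from the CLIP_ImageEncoder / CLIP_TextEncoder classes above)
/// into zero-shot class probabilities.
func zeroShotProbabilities(imageEmbedding: [Float], textEmbeddings: [[Float]]) -> [Float] {
    func normalized(_ v: [Float]) -> [Float] {
        let norm = v.reduce(0) { $0 + $1 * $1 }.squareRoot()
        return norm > 0 ? v.map { $0 / norm } : v
    }
    let image = normalized(imageEmbedding)
    // Cosine similarity scaled by CLIP's usual logit scale (assumed 100).
    let logits = textEmbeddings.map { candidate -> Float in
        let text = normalized(candidate)
        return 100.0 * zip(image, text).reduce(0) { $0 + $1.0 * $1.1 }
    }
    // Numerically stable softmax over the candidate labels.
    let maxLogit = logits.max() ?? 0
    let exps = logits.map { expf($0 - maxLogit) }
    let total = exps.reduce(0, +)
    return exps.map { $0 / total }
}
```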
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
open_clip_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "model_cfg": {
+     "embed_dim": 512,
+     "vision_cfg": {
+       "image_size": 224,
+       "layers": 12,
+       "width": 768,
+       "patch_size": 32
+     },
+     "text_cfg": {
+       "context_length": 77,
+       "vocab_size": 49408,
+       "width": 512,
+       "heads": 8,
+       "layers": 12
+     }
+   },
+   "preprocess_cfg": {
+     "mean": [
+       0.48145466,
+       0.4578275,
+       0.40821073
+     ],
+     "std": [
+       0.26862954,
+       0.26130258,
+       0.27577711
+     ]
+   }
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<|endoftext|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 77,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
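
The config above implies the packing scheme the text encoder expects: `<|startoftext|>` + BPE tokens + `<|endoftext|>`, padded with the pad token (also `<|endoftext|>`) to the `model_max_length` of 77. A sketch of that step, assuming the standard CLIP IDs 49406 / 49407 for these tokens (`vocab_size` is 49408 in `open_clip_config.json`; verify against `vocab.json` before relying on them):

```swift
/// Pack raw BPE token IDs into the fixed-length [1, 77] layout CLIP expects.
func packTokens(_ bpeTokenIDs: [Int32]) -> [Int32] {
    let bos: Int32 = 49406  // <|startoftext|> (assumed standard CLIP ID)
    let eos: Int32 = 49407  // <|endoftext|>, also used as the pad token
    let contextLength = 77  // model_max_length from tokenizer_config.json
    // Truncate so that bos + tokens + eos still fits in the context window.
    let body = bpeTokenIDs.prefix(contextLength - 2)
    var packed: [Int32] = [bos] + body + [eos]
    packed.append(contentsOf: Array(repeating: eos, count: contextLength - packed.count))
    return packed
}
```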
vocab.json ADDED
The diff for this file is too large to render. See raw diff