richtext committed on
Commit
0b7a788
·
verified ·
1 Parent(s): a899750

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ library_name: mlx
6
+ tags:
7
+ - sentence-transformers
8
+ - sentence-similarity
9
+ - feature-extraction
10
+ - food
11
+ - embeddings
12
+ - mlx
13
+ - apple-silicon
14
+ base_model: thenlper/gte-large
15
+ pipeline_tag: sentence-similarity
16
+ ---
17
+
18
+ # FoodMapper GTE-Large (MLX Format)
19
+
20
+ This is [thenlper/gte-large](https://huggingface.co/thenlper/gte-large) converted to MLX-Swift safetensors format for use with the FoodMapper macOS application.
21
+
22
+ ## Model Description
23
+
24
+ GTE-Large is a 335M-parameter text embedding model that maps sentences to 1024-dimensional dense vectors. It excels at semantic similarity tasks, making it ideal for matching food names across different databases and nomenclatures.
25
+
26
+ This conversion is optimized for Apple Silicon GPUs via [MLX-Swift](https://github.com/ml-explore/mlx-swift).
27
+
28
+ ## Intended Use
29
+
30
+ - Semantic food name matching (e.g., matching "granny smith apple" to "Apple, raw, with skin")
31
+ - Food database harmonization between USDA FoodData Central, FooDB, and custom datasets
32
+ - General text similarity on Apple Silicon Macs
33
+
34
+ ## Model Details
35
+
36
+ | Property | Value |
37
+ |----------|-------|
38
+ | Parameters | 335M |
39
+ | Embedding Dimension | 1024 |
40
+ | Max Sequence Length | 512 |
41
+ | Architecture | BERT |
42
+ | Precision | float16 |
43
+ | Format | safetensors |
44
+
45
+ ## Files
46
+
47
+ - `gte-large.safetensors` - Model weights in safetensors format (~670MB)
48
+ - `config.json` - Model architecture configuration
49
+ - `tokenizer.json` - Tokenizer vocabulary and settings
50
+ - `tokenizer_config.json` - Tokenizer configuration
51
+ - `vocab.txt` - WordPiece vocabulary
52
+ - `special_tokens_map.json` - Special token mappings
53
+
54
+ ## Usage with FoodMapper
55
+
56
+ The FoodMapper macOS app downloads this model automatically the first time it is launched. No manual setup is required.
57
+
58
+ ## Usage with MLX-Swift
59
+
60
+ ```swift
61
+ import MLX
62
+ import MLXNN
63
+
64
+ // Load weights
65
+ let weights = try loadArrays(url: modelURL)
66
+ let parameters = ModuleParameters.unflattened(weights)
67
+ try model.update(parameters: parameters, verify: .none)
68
+ ```
69
+
70
+ ## Pooling
71
+
72
+ GTE models use **mean pooling** over token embeddings (not CLS token pooling). The attention mask should be applied before averaging:
73
+
74
+ ```swift
75
+ func meanPooling(_ hiddenState: MLXArray, attentionMask: MLXArray) -> MLXArray {
76
+ let maskExpanded = attentionMask.expandedDimensions(axis: -1)
77
+ .asType(hiddenState.dtype)
78
+ let sumEmbeddings = (hiddenState * maskExpanded).sum(axis: 1)
79
+ let sumMask = MLX.maximum(maskExpanded.sum(axis: 1), MLXArray(1e-9))
80
+ return sumEmbeddings / sumMask
81
+ }
82
+ ```
83
+
84
+ ## Original Model
85
+
86
+ Based on [thenlper/gte-large](https://huggingface.co/thenlper/gte-large) by Alibaba DAMO Academy.
87
+
88
+ ## License
89
+
90
+ Apache 2.0 (same as original GTE-Large)
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4096,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 24,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "float16",
21
+ "transformers_version": "4.28.1",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
gte-large.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f917f334b6e38e966519983a6b567a5a86d90065932c780f6b4ad72e6bf3a90b
3
+ size 670326040
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
test_baseline.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf14068025e2b0831a3e45f202db9bfef9a48e164877a1f73d98ffbc19d82f5c
3
+ size 20608
test_texts.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [
2
+ "red wine, table",
3
+ "chicken breast, roasted, skinless",
4
+ "apple, fresh, raw, with skin",
5
+ "milk, whole, 3.25% fat",
6
+ "bread, white, commercially prepared"
7
+ ]
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "BertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff