File size: 6,692 Bytes
b115882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21e5a6f
b115882
 
 
 
 
 
 
 
 
 
 
21e5a6f
 
b115882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
---
tags:
- onnx
- pgvector
- pg_onnx
- sentence-transformers
license: mit
library_name: onnx
pipeline_tag: sentence-similarity
model-index:
- name: multilingual-e5-small-pg-onnx
  results: []
authors:
- oga5
---


# multilingual-e5-small ONNX for pg_onnx

ONNX-converted version of [`intfloat/multilingual-e5-small`](https://huggingface.co/intfloat/multilingual-e5-small), optimized for [pgvector](https://github.com/pgvector/pgvector) and [pg_onnx](https://github.com/kibae/pg_onnx).

This repository enables **native semantic search and embedding inference inside PostgreSQL**, using ONNXRuntime and pg_onnx.


It includes:

- `encoder.onnx`: The sentence embedding encoder
- `text_to_embedding.onnx`: A wrapper model for direct text-to-embedding inference
- `tokenizer.onnx`: ONNX-converted tokenizer for fast, portable preprocessing

## Files

| File | Purpose | Size |
| --- | --- | --- |
| `encoder.onnx` | Sentence embedding encoder | ~470 MB |
| `text_to_embedding.onnx` | Wrapper for direct text-to-embedding inference | ~475 MB |
| `tokenizer.onnx` | ONNX serialized tokenizer | ~5 MB |

## Conversion Details

This model was converted from the original multilingual-e5-small using:

- Hugging Face Transformers for model and tokenizer loading
- ONNX export via `transformers.onnx` and custom scripts
- Tokenizer serialized into ONNX using [`tokenizers`](https://github.com/huggingface/tokenizers) and custom conversion logic

All components are designed to run natively in PostgreSQL via `pg_onnx`, enabling efficient semantic search and embedding inference directly in the database.

## Download from Hugging Face

```python
from huggingface_hub import hf_hub_download

# Adjust repo_id if you fork or rename
repo_id = "oga5/multilingual-e5-small-pg-onnx"

enc_path = hf_hub_download(repo_id=repo_id, filename="encoder.onnx")
tte_path = hf_hub_download(repo_id=repo_id, filename="text_to_embedding.onnx")
tok_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.onnx")

print(enc_path, tte_path, tok_path)
```

## Setup

Required libraries:
- `onnxruntime`
- `onnxruntime-extensions` (build with `-DOCOS_ENABLE_SPM_TOKENIZER=ON`)
- `pg_onnx` (v1.23.1b or later)

After building pg_onnx, install the extension in your database:

```sql
create extension pg_onnx;
```

## Usage

```sql

-- Register models
select pg_onnx_import_model('e5-tok', 'v1', pg_read_binary_file('/PATH/tokenizer.onnx')::bytea, '{"ortextensions_path": "libortextensions.so"}'::jsonb, 'e5 tokenizer');
select pg_onnx_import_model('e5-embedding', 'v1', pg_read_binary_file('/PATH/text_to_embedding.onnx')::bytea, '{"ortextensions_path": "libortextensions.so"}'::jsonb, 'e5 text to embedding');

-- Create functions
create or replace function e5_tok(input_text text)
returns integer[]
AS $$
    SELECT array_agg(value::int)
    FROM jsonb_array_elements_text(
        pg_onnx_execute_session(
            'e5-tok',
            'v1',
            jsonb_build_object('inputs', jsonb_build_array(input_text))
        ) -> 'tokens'
    );
$$
language sql
immutable;

create or replace function e5_embedding(input_text text)
returns vector(384)
as $$
    select array(
        select jsonb_array_elements_text(
            pg_onnx_execute_session(
                'e5-embedding',
                'v1',
                jsonb_build_object('text', jsonb_build_array(input_text))
            )->'embedding'->0
        )::float
    )::vector(384);
$$
language sql
immutable;

create or replace function e5_embedding_passage(input_text text)
returns vector
AS $$
    select e5_embedding('passage: ' || input_text);
$$
language sql
immutable;

create or replace function e5_embedding_query(input_text text)
returns vector
AS $$
    select e5_embedding('query: ' || input_text);
$$
language sql
immutable;

-- Create sample data
create table llm_test (
    i integer not null primary key,
    txt text,
    v vector(384)
);

create index llm_test_v_idx on llm_test using hnsw (v vector_ip_ops);

insert into llm_test (i,txt) values ('1','Machine learning is a subfield of artificial intelligence');
insert into llm_test (i,txt) values ('2','A database is a system for managing data');
insert into llm_test (i,txt) values ('3','PostgreSQL is a powerful open-source database');
insert into llm_test (i,txt) values ('4','Vector search retrieves results by computing similarity');
insert into llm_test (i,txt) values ('5','ONNX is a standard format for machine learning models');
insert into llm_test (i,txt) values ('6','Natural language processing is a technology for handling text');
insert into llm_test (i,txt) values ('7','Embeddings convert text into vectors');
insert into llm_test (i,txt) values ('8','Cosine similarity measures similarity between vectors');
insert into llm_test (i,txt) values ('9','A tokenizer splits text into tokens');
insert into llm_test (i,txt) values ('10','Transformers are a modern neural network architecture');
insert into llm_test (i,txt) values ('11','SQL is a language for manipulating databases');
insert into llm_test (i,txt) values ('12','Indexes improve query performance');
insert into llm_test (i,txt) values ('13','pgvector is a vector extension for PostgreSQL');
insert into llm_test (i,txt) values ('14','Semantic search retrieves based on meaning');
insert into llm_test (i,txt) values ('15','Neural networks mimic the structure of the brain');
insert into llm_test (i,txt) values ('16','Deep learning uses multi-layer neural networks');
insert into llm_test (i,txt) values ('17','Batch processing handles multiple data at once');
insert into llm_test (i,txt) values ('18','Model inference performs prediction with a trained model');
insert into llm_test (i,txt) values ('19','Fine-tuning adapts an existing model to a specific task');
insert into llm_test (i,txt) values ('20','A cross-encoder evaluates the relevance between two texts');

-- register embeddings
update llm_test set v = e5_embedding_passage(txt);

-- Search
with q as (
    select 'What is machine learning?' as query
),
qv as materialized (
    select e5_embedding_query(q.query) as v from q
)
select i, txt, t.v <#> qv.v as distance
from llm_test t, qv
order by distance;

```

## License

This project is released under the **MIT License**. See the full text in [`LICENSE`](./LICENSE).

This repository redistributes the original model weights from [`intfloat/multilingual-e5-small`](https://huggingface.co/intfloat/multilingual-e5-small) without modification. The original MIT license is retained as required.

## Credits
- Original model: [intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small)
- Conversion to ONNX and packaging: [oga5](https://huggingface.co/oga5)