|
|
--- |
|
|
tags: |
|
|
- onnx |
|
|
- pgvector |
|
|
- pg_onnx |
|
|
- sentence-transformers |
|
|
license: mit |
|
|
library_name: onnx |
|
|
pipeline_tag: sentence-similarity |
|
|
model-index: |
|
|
- name: multilingual-e5-small-pg-onnx |
|
|
results: [] |
|
|
authors: |
|
|
- oga5 |
|
|
--- |
|
|
|
|
|
|
|
|
# multilingual-e5-small ONNX for pg_onnx |
|
|
|
|
|
ONNX-converted version of [`intfloat/multilingual-e5-small`](https://huggingface.co/intfloat/multilingual-e5-small), optimized for [pgvector](https://github.com/pgvector/pgvector) and [pg_onnx](https://github.com/kibae/pg_onnx). |
|
|
|
|
|
This repository enables **native semantic search and embedding inference inside PostgreSQL**, using ONNXRuntime and pg_onnx. |
|
|
|
|
|
|
|
|
It includes: |
|
|
|
|
|
- `encoder.onnx`: The sentence embedding encoder |
|
|
- `text_to_embedding.onnx`: A wrapper model for direct text-to-embedding inference |
|
|
- `tokenizer.onnx`: ONNX-converted tokenizer for fast, portable preprocessing |
|
|
|
|
|
## Files |
|
|
|
|
|
| File | Purpose | Size | |
|
|
| --- | --- | --- | |
|
|
| `encoder.onnx` | Sentence embedding encoder | ~470 MB | |
|
|
| `text_to_embedding.onnx` | Wrapper for direct text-to-embedding inference | ~475 MB | |
|
|
| `tokenizer.onnx` | ONNX serialized tokenizer | ~5 MB | |
|
|
|
|
|
## Conversion Details |
|
|
|
|
|
This model was converted from the original multilingual-e5-small using: |
|
|
|
|
|
- Hugging Face Transformers for model and tokenizer loading |
|
|
- ONNX export via `transformers.onnx` and custom scripts |
|
|
- Tokenizer serialized into ONNX using [`tokenizers`](https://github.com/huggingface/tokenizers) and custom conversion logic |
|
|
|
|
|
All components are designed to run natively in PostgreSQL via `pg_onnx`, enabling efficient semantic search and embedding inference directly in the database. |
|
|
|
|
|
## Download from Hugging Face |
|
|
|
|
|
```python |
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
# Adjust repo_id if you fork or rename |
|
|
repo_id = "oga5/multilingual-e5-small-pg-onnx" |
|
|
|
|
|
enc_path = hf_hub_download(repo_id=repo_id, filename="encoder.onnx") |
|
|
tte_path = hf_hub_download(repo_id=repo_id, filename="text_to_embedding.onnx") |
|
|
tok_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.onnx") |
|
|
|
|
|
print(enc_path, tte_path, tok_path) |
|
|
``` |
|
|
|
|
|
## Setup |
|
|
|
|
|
Required libraries: |
|
|
- `onnxruntime` |
|
|
- `onnxruntime-extensions` (build with `-DOCOS_ENABLE_SPM_TOKENIZER=ON`) |
|
|
- `pg_onnx` (v1.23.1b or later) |
|
|
|
|
|
After building pg_onnx, install the extension in the target database: |
|
|
```sql |
|
|
-- pgvector supplies the vector type and the HNSW index used in the Usage examples. |
|
|
create extension if not exists vector; |
|
|
-- pg_onnx is needed for the pg_onnx_import_model() / pg_onnx_execute_session() calls in the Usage examples. |
|
|
create extension if not exists pg_onnx; |
|
|
``` |
|
|
|
|
|
## Usage |
|
|
|
|
|
```sql |
|
|
|
|
|
-- Register the ONNX models with pg_onnx. |
|
|
-- Replace /PATH with the directory that holds the downloaded files; |
|
|
-- pg_read_binary_file() reads from the database server's filesystem. |
|
|
select pg_onnx_import_model( |
|
|
    'e5-tok', |
|
|
    'v1', |
|
|
    pg_read_binary_file('/PATH/tokenizer.onnx')::bytea, |
|
|
    '{"ortextensions_path": "libortextensions.so"}'::jsonb, |
|
|
    'e5 tokenizer' |
|
|
); |
|
|
|
|
|
select pg_onnx_import_model( |
|
|
    'e5-embedding', |
|
|
    'v1', |
|
|
    pg_read_binary_file('/PATH/text_to_embedding.onnx')::bytea, |
|
|
    '{"ortextensions_path": "libortextensions.so"}'::jsonb, |
|
|
    'e5 text to embedding' |
|
|
); |
|
|
|
|
|
-- Create functions |
|
|
-- Tokenize input_text with the registered 'e5-tok' (v1) ONNX model and return |
|
|
-- the "tokens" output as an integer array. |
|
|
-- The text is sent as a one-element array under the "inputs" key. |
|
|
-- WITH ORDINALITY plus ORDER BY inside array_agg keeps the ids in the order the |
|
|
-- model produced them; array_agg without ORDER BY gives no ordering guarantee. |
|
|
-- NOTE(review): immutable is kept from the original, but the result depends on |
|
|
-- which model is registered as 'e5-tok' v1 -- confirm that is acceptable. |
|
|
create or replace function e5_tok(input_text text) |
|
|
returns integer[] |
|
|
as $$ |
|
|
    select array_agg(tok.token_id::int order by tok.ord) |
|
|
    from jsonb_array_elements_text( |
|
|
        pg_onnx_execute_session( |
|
|
            'e5-tok', |
|
|
            'v1', |
|
|
            jsonb_build_object('inputs', jsonb_build_array(input_text)) |
|
|
        ) -> 'tokens' |
|
|
    ) with ordinality as tok(token_id, ord); |
|
|
$$ |
|
|
language sql |
|
|
immutable; |
|
|
|
|
|
-- Embed input_text with the registered 'e5-embedding' (v1) ONNX model. |
|
|
-- The text is sent unchanged as a one-element array under the "text" key; the |
|
|
-- first element of the "embedding" output is unpacked into a float array and |
|
|
-- cast to vector(384). |
|
|
create or replace function e5_embedding(input_text text) |
|
|
returns vector(384) |
|
|
as $$ |
|
|
    select array( |
|
|
        select elem.dim::float |
|
|
        from jsonb_array_elements_text( |
|
|
            pg_onnx_execute_session( |
|
|
                'e5-embedding', |
|
|
                'v1', |
|
|
                jsonb_build_object('text', jsonb_build_array(input_text)) |
|
|
            ) -> 'embedding' -> 0 |
|
|
        ) as elem(dim) |
|
|
    )::vector(384); |
|
|
$$ |
|
|
language sql |
|
|
immutable; |
|
|
|
|
|
-- Embed a passage (document text): prepends the 'passage: ' prefix and |
|
|
-- delegates to e5_embedding(). A NULL input_text yields a NULL argument, |
|
|
-- because || propagates NULL. |
|
|
create or replace function e5_embedding_passage(input_text text) |
|
|
returns vector |
|
|
language sql |
|
|
immutable |
|
|
as $$ |
|
|
    select e5_embedding('passage: ' || input_text); |
|
|
$$; |
|
|
|
|
|
-- Embed a search query: prepends the 'query: ' prefix and delegates to |
|
|
-- e5_embedding(). |
|
|
-- The colon is part of the prefix: E5 models expect inputs to start with |
|
|
-- 'query: ' or 'passage: ', so 'query ' without the colon would be embedded |
|
|
-- as ordinary text and degrade retrieval quality. |
|
|
create or replace function e5_embedding_query(input_text text) |
|
|
returns vector |
|
|
as $$ |
|
|
    select e5_embedding('query: ' || input_text); |
|
|
$$ |
|
|
language sql |
|
|
immutable; |
|
|
|
|
|
-- Sample data: a small table of sentences, each with a 384-dimension embedding column. |
|
|
create table llm_test ( |
|
|
    i   integer not null primary key, |
|
|
    txt text, |
|
|
    v   vector(384) |
|
|
); |
|
|
|
|
|
-- HNSW index with inner-product operator class, matching the <#> operator used by the search query. |
|
|
create index llm_test_v_idx on llm_test using hnsw (v vector_ip_ops); |
|
|
|
|
|
-- Load the sentences; v stays NULL until the embeddings are computed. |
|
|
insert into llm_test (i, txt) |
|
|
values |
|
|
    (1, 'Machine learning is a subfield of artificial intelligence'), |
|
|
    (2, 'A database is a system for managing data'), |
|
|
    (3, 'PostgreSQL is a powerful open-source database'), |
|
|
    (4, 'Vector search retrieves results by computing similarity'), |
|
|
    (5, 'ONNX is a standard format for machine learning models'), |
|
|
    (6, 'Natural language processing is a technology for handling text'), |
|
|
    (7, 'Embeddings convert text into vectors'), |
|
|
    (8, 'Cosine similarity measures similarity between vectors'), |
|
|
    (9, 'A tokenizer splits text into tokens'), |
|
|
    (10, 'Transformers are a modern neural network architecture'), |
|
|
    (11, 'SQL is a language for manipulating databases'), |
|
|
    (12, 'Indexes improve query performance'), |
|
|
    (13, 'pgvector is a vector extension for PostgreSQL'), |
|
|
    (14, 'Semantic search retrieves based on meaning'), |
|
|
    (15, 'Neural networks mimic the structure of the brain'), |
|
|
    (16, 'Deep learning uses multi-layer neural networks'), |
|
|
    (17, 'Batch processing handles multiple data at once'), |
|
|
    (18, 'Model inference performs prediction with a trained model'), |
|
|
    (19, 'Fine-tuning adapts an existing model to a specific task'), |
|
|
    (20, 'A cross-encoder evaluates the relevance between two texts'); |
|
|
|
|
|
-- Compute and store the passage embedding for every row (intentionally no WHERE clause). |
|
|
update llm_test |
|
|
set v = e5_embedding_passage(txt); |
|
|
|
|
|
-- Search: embed the query text once in a materialized CTE, then rank every row against it. |
|
|
-- pgvector's <#> returns the negative inner product, so ascending order lists the best matches first. |
|
|
with qv as materialized ( |
|
|
    select e5_embedding_query('What is machine learning?') as v |
|
|
) |
|
|
select |
|
|
    t.i, |
|
|
    t.txt, |
|
|
    t.v <#> qv.v as distance |
|
|
from llm_test as t |
|
|
cross join qv |
|
|
order by distance; |
|
|
|
|
|
``` |
|
|
|
|
|
## License |
|
|
|
|
|
This project is released under the **MIT License**. See the full text in [`LICENSE`](./LICENSE). |
|
|
|
|
|
This repository redistributes the model weights from [`intfloat/multilingual-e5-small`](https://huggingface.co/intfloat/multilingual-e5-small), converted to ONNX format but otherwise unmodified. The original MIT license is retained as required. |
|
|
|
|
|
## Credits |
|
|
- Original model: [intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) |
|
|
- Conversion to ONNX and packaging: [oga5](https://huggingface.co/oga5) |
|
|
|
|
|
|