---
tags:
- onnx
- pgvector
- pg_onnx
- sentence-transformers
license: mit
library_name: onnx
pipeline_tag: sentence-similarity
model-index:
- name: multilingual-e5-small-pg-onnx
  results: []
authors:
- oga5
---

# multilingual-e5-small ONNX for pg_onnx

ONNX-converted version of [`intfloat/multilingual-e5-small`](https://huggingface.co/intfloat/multilingual-e5-small), optimized for [pgvector](https://github.com/pgvector/pgvector) and [pg_onnx](https://github.com/kibae/pg_onnx).

This repository enables **native semantic search and embedding inference inside PostgreSQL**, using ONNX Runtime and pg_onnx. It includes:

- `encoder.onnx`: The sentence embedding encoder
- `text_to_embedding.onnx`: A wrapper model for direct text-to-embedding inference
- `tokenizer.onnx`: ONNX-converted tokenizer for fast, portable preprocessing

## Files

| File | Purpose | Size |
| --- | --- | --- |
| `encoder.onnx` | Sentence embedding encoder | ~470 MB |
| `text_to_embedding.onnx` | Wrapper for direct text-to-embedding inference | ~475 MB |
| `tokenizer.onnx` | ONNX serialized tokenizer | ~5 MB |

## Conversion Details

This model was converted from the original multilingual-e5-small using:

- Hugging Face Transformers for model and tokenizer loading
- ONNX export via `transformers.onnx` and custom scripts
- Tokenizer serialized into ONNX using [`tokenizers`](https://github.com/huggingface/tokenizers) and custom conversion logic

All components are designed to run natively in PostgreSQL via `pg_onnx`, enabling efficient semantic search and embedding inference directly in the database.

## Download from Hugging Face

```python
from huggingface_hub import hf_hub_download

# Adjust repo_id if you fork or rename
repo_id = "oga5/multilingual-e5-small-pg-onnx"

enc_path = hf_hub_download(repo_id=repo_id, filename="encoder.onnx")
tte_path = hf_hub_download(repo_id=repo_id, filename="text_to_embedding.onnx")
tok_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.onnx")

print(enc_path, tte_path, tok_path)
```

## Setup

Required libraries:

- `onnxruntime`
- `onnxruntime-extensions` (build with `-DOCOS_ENABLE_SPM_TOKENIZER=ON`)
- `pg_onnx` (v1.23.1b or later)
- `pgvector` (provides the `vector` type, the `hnsw` index and the `<#>` operator used below)

After building pg_onnx, install the pg_onnx extension (and pgvector, if it is not installed yet):

```sql
create extension if not exists pg_onnx;
create extension if not exists vector;
```

## Usage

```sql
-- Register models
select pg_onnx_import_model(
    'e5-tok',
    'v1',
    pg_read_binary_file('/PATH/tokenizer.onnx')::bytea,
    '{"ortextensions_path": "libortextensions.so"}'::jsonb,
    'e5 tokenizer'
);

select pg_onnx_import_model(
    'e5-embedding',
    'v1',
    pg_read_binary_file('/PATH/text_to_embedding.onnx')::bytea,
    '{"ortextensions_path": "libortextensions.so"}'::jsonb,
    'e5 text to embedding'
);

-- Create functions
-- NOTE: the functions below are declared immutable on the assumption that a
-- registered model version ('v1') is never replaced in place; confirm this
-- matches how you manage models before relying on it.

-- Tokenize input_text with the 'e5-tok' model and return the token ids.
create or replace function e5_tok(input_text text)
returns integer[] as $$
    -- "with ordinality" keeps the token ids in the order the model emitted them.
    select array_agg(tok.value::int order by tok.ord)
    from jsonb_array_elements_text(
        pg_onnx_execute_session(
            'e5-tok',
            'v1',
            jsonb_build_object('inputs', jsonb_build_array(input_text))
        ) -> 'tokens'
    ) with ordinality as tok(value, ord);
$$ language sql immutable;

-- Embed input_text (passed through unchanged) with the 'e5-embedding' model.
-- Callers normally use e5_embedding_passage / e5_embedding_query instead,
-- which add the prefix the E5 models expect.
create or replace function e5_embedding(input_text text)
returns vector(384) as $$
    select array(
        select elem.value::float
        from jsonb_array_elements_text(
            pg_onnx_execute_session(
                'e5-embedding',
                'v1',
                jsonb_build_object('text', jsonb_build_array(input_text))
            ) -> 'embedding' -> 0  -- a single text is sent, so take the first result
        ) with ordinality as elem(value, ord)
        order by elem.ord
    )::vector(384);
$$ language sql immutable;

-- Embed a document/passage: E5 models expect the "passage: " prefix.
create or replace function e5_embedding_passage(input_text text)
returns vector(384) as $$
    select e5_embedding('passage: ' || input_text);
$$ language sql immutable;

-- Embed a search query: E5 models expect the "query: " prefix (with the colon).
create or replace function e5_embedding_query(input_text text)
returns vector(384) as $$
    select e5_embedding('query: ' || input_text);
$$ language sql immutable;

-- Create sample data
create table llm_test (
    i integer not null primary key,
    txt text,
    v vector(384)
);

create index llm_test_v_idx on llm_test using hnsw (v vector_ip_ops);

insert into llm_test (i, txt) values
    (1, 'Machine learning is a subfield of artificial intelligence'),
    (2, 'A database is a system for managing data'),
    (3, 'PostgreSQL is a powerful open-source database'),
    (4, 'Vector search retrieves results by computing similarity'),
    (5, 'ONNX is a standard format for machine learning models'),
    (6, 'Natural language processing is a technology for handling text'),
    (7, 'Embeddings convert text into vectors'),
    (8, 'Cosine similarity measures similarity between vectors'),
    (9, 'A tokenizer splits text into tokens'),
    (10, 'Transformers are a modern neural network architecture'),
    (11, 'SQL is a language for manipulating databases'),
    (12, 'Indexes improve query performance'),
    (13, 'pgvector is a vector extension for PostgreSQL'),
    (14, 'Semantic search retrieves based on meaning'),
    (15, 'Neural networks mimic the structure of the brain'),
    (16, 'Deep learning uses multi-layer neural networks'),
    (17, 'Batch processing handles multiple data at once'),
    (18, 'Model inference performs prediction with a trained model'),
    (19, 'Fine-tuning adapts an existing model to a specific task'),
    (20, 'A cross-encoder evaluates the relevance between two texts');

-- register embeddings (intentionally updates every row of the sample table)
update llm_test set v = e5_embedding_passage(txt);

-- Search
-- "<#>" is pgvector's negative inner product, so ascending order puts the
-- most similar rows first.
with q as (
    select 'What is machine learning?' as query
),
qv as materialized (
    -- materialized so the query text is embedded only once
    select e5_embedding_query(q.query) as v
    from q
)
select
    t.i,
    t.txt,
    t.v <#> qv.v as distance
from llm_test as t
cross join qv
order by distance;
```

## License

This project is released under the **MIT License**. See the full text in [`LICENSE`](./LICENSE).

This repository redistributes the original model weights from [`intfloat/multilingual-e5-small`](https://huggingface.co/intfloat/multilingual-e5-small) without modification. The original MIT license is retained as required.

## Credits

- Original model: [intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small)
- Conversion to ONNX and packaging: [oga5](https://huggingface.co/oga5)