---
license: apache-2.0
language:
- en
- zh
- ru
- es
- fr
- de
- ar
- nl
- vi
- hi
- ko
- ja
- it
- id
- pt
- pl
- tr
- da
- th
- sv
- fa
- uk
- cs
- 'no'
- el
- ca
- ro
- fi
- bg
- tl
- gl
- my
- hy
- km
- ne
- hu
- eu
- he
- lo
- sw
- az
- lv
- si
- sk
- tg
- et
- lt
- ms
- hr
- is
- sl
- sr
- ur
- bn
- af
- ta
- ka
- te
- ml
- mn
- nn
- kk
- cy
- mr
- sq
- nb
- mk
- jv
- kn
- eo
- la
- gu
- uz
- am
- oc
- be
- mg
- vo
- pa
- lb
- ht
- br
- ga
- xh
- tt
- bs
- yo
base_model:
- codefuse-ai/F2LLM-v2-0.6B-Preview
pipeline_tag: feature-extraction
library_name: transformers
tags:
- sentence-transformers
datasets:
- codefuse-ai/F2LLM-v2
---

# F2LLM-v2-80M

F2LLM-v2 is a family of general-purpose, multilingual embedding models in 8 distinct sizes ranging from 80M to 14B. Trained on a curated composite of 60 million publicly available, high-quality data samples, F2LLM-v2 supports more than 200 languages, with a particular emphasis on previously underserved mid- and low-resource languages.

## Usage

### With Sentence Transformers

To encode text with the [Sentence Transformers](https://www.sbert.net/) library:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("codefuse-ai/F2LLM-v2-80M", device="cuda:0", model_kwargs={"torch_dtype": "bfloat16"})

# Some sample query and documents
query = "What is F2LLM used for?"
documents = [
    'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
    'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.',
    'F2LLM 是 CodeFuse 开源的系列嵌入模型。',
    'F2LLM — это модель вычисления встраивания текста, которую можно использовать для различных задач НЛП, таких как поиск информации, семантический поиск и классификация текста.'
]

# Encode the query and documents separately. The encode_query method uses the query prompt
query_embedding = model.encode_query(query)
document_embeddings = model.encode_document(documents)
print(query_embedding.shape, document_embeddings.shape)
# (320,) (4, 320)

# Compute cosine similarity between the query and documents
similarity = model.similarity(query_embedding, document_embeddings)
print(similarity)
# tensor([[0.6968, 0.7818, 0.7165, 0.8374]])
```

### With Transformers

Or directly with the [Transformers](https://huggingface.co/docs/transformers/index) library:

```python
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_path = "codefuse-ai/F2LLM-v2-80M"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map={'': 0})

query = "What is F2LLM used for?"
query_prompt = "Instruct: Given a question, retrieve passages that can help answer the question.\nQuery: "
documents = [
    'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
    'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.',
    'F2LLM 是 CodeFuse 开源的系列嵌入模型。',
    'F2LLM — это модель вычисления встраивания текста, которую можно использовать для различных задач НЛП, таких как поиск информации, семантический поиск и классификация текста.'
]

def encode(sentences):
    batch_size = len(sentences)
    # the tokenizer will automatically add eos token
    tokenized_inputs = tokenizer(sentences, padding=True, return_tensors='pt').to(model.device)
    last_hidden_state = model(**tokenized_inputs).last_hidden_state
    eos_positions = tokenized_inputs.attention_mask.sum(dim=1) - 1
    embeddings = last_hidden_state[torch.arange(batch_size, device=model.device), eos_positions]
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings

# Encode the query and documents
query_embedding = encode([query_prompt + query])
document_embeddings = encode(documents)
print(query_embedding.shape, document_embeddings.shape)
# torch.Size([1, 320]) torch.Size([4, 320])

# Compute cosine similarity between the query and documents
similarity = query_embedding @ document_embeddings.T
print(similarity)
# tensor([[0.6914, 0.7812, 0.7148, 0.8359]], device='cuda:0',
#        dtype=torch.bfloat16, grad_fn=<MmBackward0>)
```