sentence-transformers
Somali
English
Italian
semantic-search
lexical-retrieval
somali
multilingual
dictionary
terminology
Instructions to use haajidheere/ErayNet-nirig with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use haajidheere/ErayNet-nirig with sentence-transformers:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("haajidheere/ErayNet-nirig")

sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
- Notebooks
- Google Colab
- Kaggle
| from sentence_transformers import SentenceTransformer | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import normalize | |
| import os | |
# Location of the cleaned dictionary CSV and of the folder that receives
# the generated artifacts.
DATA_PATH = "data/cleaned/abbreviations.csv"
OUTPUT_DIR = "ai_model"

# Load the dictionary first, then make sure the output folder exists
# (an already-existing folder is accepted as-is).
df = pd.read_csv(DATA_PATH)
os.makedirs(OUTPUT_DIR, exist_ok=True)

entry_count = len(df)
print(f"Loaded {entry_count} entries")
| if 'domain' not in df.columns: | |
# Ordered (domain, keywords) pairs. The first domain with a matching keyword
# wins, so Medical outranks Legal, Legal outranks Science, and Science
# outranks Religious. Keywords must be lowercase with no surrounding
# whitespace because they are compared as substrings of lowercased text.
_DOMAIN_KEYWORDS = (
    ('Medical', (
        'medicine', 'medical', 'disease', 'health', 'doctor', 'hospital',
        'clinic', 'treatment', 'patient', 'diagnosis', 'therapy', 'pharma',
        'drug', 'medic', 'caafimaad', 'daktari', 'bukaan',
    )),
    ('Legal', (
        'law', 'legal', 'court', 'judge', 'legislation', 'statute',
        'contract', 'rights', 'crime', 'offense', 'prosecution', 'defense',
        'lawyer', 'sharciga', 'qodob', 'xeer',
    )),
    ('Science', (
        'biology', 'botany', 'physics', 'chemistry', 'science', 'astronomy',
        'zoology', 'meteorology', 'agriculture', 'technology', 'math',
        'computer', 'environment',
    )),
    ('Religious', (
        'religion', 'god', 'islam', 'christian', 'church', 'prayer', 'faith',
        'diin', 'iimaan', 'masjid',
    )),
)


def _as_text(value):
    """Return *value* as a string, mapping None and float NaN to ''."""
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)


def infer_domain(row):
    """Guess the subject domain of one dictionary entry from its text.

    The 'somali', 'english' and 'italian' fields of *row* are joined and
    lowercased, then checked for domain keywords using substring matching
    (so 'medic' also matches 'medical', and 'legal' also matches 'illegal').

    Args:
        row: Any mapping-like object with a ``get(key, default)`` method,
            e.g. a dict or a pandas Series. Missing fields, None and NaN
            are treated as empty text.

    Returns:
        'Medical', 'Legal', 'Science' or 'Religious' for the first domain
        (in that priority order) with a matching keyword, else 'General'.
    """
    fields = (row.get(name, '') for name in ('somali', 'english', 'italian'))
    text = ' '.join(_as_text(value) for value in fields).lower()

    for domain, keywords in _DOMAIN_KEYWORDS:
        if any(kw in text for kw in keywords):
            return domain
    return 'General'
| df['domain'] = df.apply(infer_domain, axis=1) | |
# Replace missing values with empty strings so the string operations below
# never see nulls in the text columns.
text_columns = ('somali', 'english', 'italian', 'domain')
for column_name in text_columns:
    df[column_name] = df[column_name].fillna('')

# Build the text that gets embedded: the lowercased Somali, English, Italian
# and domain fields joined by single spaces, with outer whitespace trimmed.
combined = df["somali"].str.lower()
for column_name in ("english", "italian", "domain"):
    combined = combined + " " + df[column_name].str.lower()
df["search_text"] = combined.str.strip()
# Encode every entry's search text with a multilingual sentence-embedding
# model, then save the vectors and the table they were computed from.
print("Loading multilingual model...")
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

print("Generating embeddings...")
# normalize_embeddings=True already returns unit-length vectors, so no
# second L2-normalisation pass is applied afterwards.
embeddings = model.encode(df["search_text"].tolist(), normalize_embeddings=True)

# Row i of embeddings.npy corresponds to row i of search_data.csv, because
# both are written from df in its current order.
np.save(os.path.join(OUTPUT_DIR, "embeddings.npy"), embeddings)
df.to_csv(os.path.join(OUTPUT_DIR, "search_data.csv"), index=False)

print(f"Embeddings created: {embeddings.shape}")
print(f"Saved to {OUTPUT_DIR}/")