# Batch_RAG/embeddings/text_embedder.py
# Author: Arsen Dolichnyi
# Basic version with classic and multimodal RAG
# Commit: 42b2b3c
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
import torch
import re
import emoji
import unicodedata
import nltk
from nltk.corpus import stopwords
# Load the English stop-word list. The NLTK corpus is downloaded only when it
# is not already installed, so importing this module does not trigger a
# download attempt (and its console output) on every import.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Run the encoder on the GPU when torch reports CUDA as available, else on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Shared sentence-embedding model, loaded once at import time.
text_model = SentenceTransformer("intfloat/e5-base", device=device)
def preprocess_text(text):
    """Normalize raw text into a space-separated string of content words.

    Steps, in order:
      1. Replace emoji with a space.
      2. Turn every whitespace character into a plain space and drop the
         remaining Unicode category-``C`` characters (control, format, ...).
      3. Lowercase.
      4. Keep only runs of ASCII letters ``a-z`` delimited by word boundaries.
      5. Remove English stop words.

    Args:
        text: The input string.

    Returns:
        The surviving words joined by single spaces; an empty string when
        nothing survives.
    """
    # Use a space rather than '' so an emoji between two words does not glue
    # them together ("hi<emoji>there" must not become "hithere").
    text = emoji.replace_emoji(text, replace=' ')
    # Newline, tab and carriage return are category "Cc"; deleting them
    # outright would fuse the last word of one line with the first word of the
    # next. Whitespace therefore becomes a space, and only non-whitespace
    # control/format characters are removed.
    text = ''.join(
        ' ' if c.isspace() else c
        for c in text
        if c.isspace() or unicodedata.category(c)[0] != 'C'
    )
    text = text.lower()
    words = re.findall(r'\b[a-z]+\b', text)
    return " ".join(w for w in words if w not in stop_words)
def get_text_embedding(text):
    """Return the embedding of ``text`` as a plain Python list of floats.

    The text is first passed through ``preprocess_text`` and then encoded
    with the module-level ``text_model``; the encoder is asked for a
    normalized NumPy vector, which is converted to a list before returning.
    """
    # NOTE(review): E5 model cards describe "query: " / "passage: " input
    # prefixes; none is added here -- confirm whether that is intentional.
    vector = text_model.encode(
        preprocess_text(text),
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    return vector.tolist()
class TextEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by the module-level text model."""

    def embed_documents(self, texts):
        """Embed several documents.

        Args:
            texts: Iterable of document strings.

        Returns:
            A list with one embedding (list of floats) per input text, in
            input order; an empty list when ``texts`` is empty.
        """
        cleaned = [preprocess_text(text) for text in texts]
        if not cleaned:
            return []
        # Encode the whole list in one call so the model can batch the inputs
        # instead of running one forward pass per document.
        vectors = text_model.encode(
            cleaned, convert_to_numpy=True, normalize_embeddings=True
        )
        return vectors.tolist()

    def embed_query(self, text):
        """Embed a single query string and return it as a list of floats."""
        return get_text_embedding(text)