| from ._base import BaseEmbedder |
|
|
| |
| from bertopic.backend._sklearn import SklearnEmbedder |
| from sklearn.pipeline import make_pipeline |
| from sklearn.decomposition import TruncatedSVD |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.pipeline import Pipeline as ScikitPipeline |
|
|
|
|
# Lowercase language names that select_backend accepts for its `language`
# argument. Kept as a list (not a set/tuple) because its repr is embedded
# verbatim in the "not supported" error message.
languages = [
    "arabic", "bulgarian", "catalan", "czech",
    "danish", "german", "greek", "english",
    "spanish", "estonian", "persian", "finnish",
    "french", "canadian french", "galician", "gujarati",
    "hebrew", "hindi", "croatian", "hungarian",
    "armenian", "indonesian", "italian", "japanese",
    "georgian", "korean", "kurdish", "lithuanian",
    "latvian", "macedonian", "mongolian", "marathi",
    "malay", "burmese", "norwegian bokmal", "dutch",
    "polish", "portuguese", "brazilian portuguese", "romanian",
    "russian", "slovak", "slovenian", "albanian",
    "serbian", "swedish", "thai", "turkish",
    "ukrainian", "urdu", "vietnamese", "chinese (simplified)",
    "chinese (traditional)",
]
|
|
|
|
def select_backend(embedding_model,
                   language: str = None) -> BaseEmbedder:
    """ Select an embedding backend from a model object, a model name, or a language.

    The first matching rule wins:

    1. A `BaseEmbedder` instance is returned unchanged.
    2. A scikit-learn `Pipeline` is wrapped in `SklearnEmbedder`.
    3. Flair, spaCy, Gensim, TensorFlow saved-model (USE), Sentence-Transformers
       and Hugging Face `transformers` pipeline objects are recognized by the
       name of their type and wrapped in the matching backend. A plain string
       is passed to the Sentence-Transformers backend.
    4. Otherwise, if `language` is given, all-MiniLM-L6-v2 is used for English
       and paraphrase-multilingual-MiniLM-L12-v2 for every entry in `languages`
       (or "multilingual").
    5. With neither a recognized model nor a language, all-MiniLM-L6-v2 is used.

    Arguments:
        embedding_model: A backend instance, a supported model object, a model
                         name as a string, or any other value (e.g. None) to
                         fall through to language-based selection.
        language: Name of the language of the documents, matched
                  case-insensitively. Only consulted when `embedding_model`
                  is not recognized.

    Returns:
        model: A `BaseEmbedder` (or one of the backend wrappers listed above)
               around the selected embedding model.

    Raises:
        ValueError: If `language` is given but is not English, "multilingual",
                    or one of the entries in `languages`.
    """
    # An object that is already a backend needs no wrapping
    if isinstance(embedding_model, BaseEmbedder):
        return embedding_model

    # Scikit-learn pipelines
    if isinstance(embedding_model, ScikitPipeline):
        return SklearnEmbedder(embedding_model)

    # The remaining libraries are detected through the textual name of the
    # model's type, so that none of them has to be imported just to run an
    # isinstance check. Each backend is imported only when it is selected.
    model_type = str(type(embedding_model))

    # Flair word embeddings
    if "flair" in model_type:
        from bertopic.backend._flair import FlairBackend
        return FlairBackend(embedding_model)

    # spaCy embeddings
    if "spacy" in model_type:
        from bertopic.backend._spacy import SpacyBackend
        return SpacyBackend(embedding_model)

    # Gensim embeddings
    if "gensim" in model_type:
        from bertopic.backend._gensim import GensimBackend
        return GensimBackend(embedding_model)

    # USE embeddings: both substrings must be present. The previous
    # `"tensorflow" and "saved_model" in ...` only ever tested "saved_model",
    # because a non-empty string literal is always truthy.
    if "tensorflow" in model_type and "saved_model" in model_type:
        from bertopic.backend._use import USEBackend
        return USEBackend(embedding_model)

    # Sentence-Transformers models, or a model name given as a string.
    # This check must stay ahead of the "transformers" check below, since
    # "sentence_transformers" also contains "transformers".
    if "sentence_transformers" in model_type or isinstance(embedding_model, str):
        from ._sentencetransformers import SentenceTransformerBackend
        return SentenceTransformerBackend(embedding_model)

    # Hugging Face pipelines: same truthy-literal fix as for USE above, so an
    # unrelated object whose type merely contains "pipeline" no longer matches.
    if "transformers" in model_type and "pipeline" in model_type:
        from ._hftransformers import HFTransformerBackend
        return HFTransformerBackend(embedding_model)

    # Select an embedding model based on language
    if language:
        # Normalize once so every comparison below is case-insensitive,
        # including the "multilingual" keyword.
        requested = language.lower()
        try:
            from ._sentencetransformers import SentenceTransformerBackend
            if requested in ("english", "en"):
                return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
            elif requested in languages or requested == "multilingual":
                return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
            else:
                raise ValueError(f"{language} is currently not supported. However, you can "
                                 f"create any embeddings yourself and pass it through fit_transform(docs, embeddings)\n"
                                 "Else, please select a language from the following list:\n"
                                 f"{languages}")

        # If the Sentence-Transformers backend cannot be imported, fall back
        # to a TF-IDF + truncated SVD pipeline built from scikit-learn.
        # NOTE(review): presumably this supports installs without
        # sentence-transformers -- confirm.
        except ModuleNotFoundError:
            pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(100))
            return SklearnEmbedder(pipe)

    # Neither a recognized model nor a language: default English model
    from ._sentencetransformers import SentenceTransformerBackend
    return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
|
|