use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};

use crate::token::PyToken;
use crate::trainers::PyTrainer;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Serialize};
use tk::models::bpe::{BpeBuilder, Merges, Vocab, BPE};
use tk::models::unigram::Unigram;
use tk::models::wordlevel::WordLevel;
use tk::models::wordpiece::{WordPiece, WordPieceBuilder};
use tk::models::ModelWrapper;
use tk::{Model, Token};
use tokenizers as tk;

use super::error::{deprecation_warning, ToPyResult};

/// Base class for all models
///
/// The model represents the actual tokenization algorithm. This is the part that
/// will contain and manage the learned vocabulary.
///
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass(module = "tokenizers.models", name = "Model", subclass)]
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct PyModel {
    pub model: Arc<RwLock<ModelWrapper>>,
}

impl PyModel {
    pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
        let base = self.clone();
        Ok(match *self.model.as_ref().read().unwrap() {
            ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base))?.into_py(py),
            ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base))?.into_py(py),
            ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base))?.into_py(py),
            ModelWrapper::Unigram(_) => Py::new(py, (PyUnigram {}, base))?.into_py(py),
        })
    }
}

impl Model for PyModel {
    type Trainer = PyTrainer;

    fn tokenize(&self, tokens: &str) -> tk::Result<Vec<Token>> {
        self.model.read().unwrap().tokenize(tokens)
    }

    fn token_to_id(&self, token: &str) -> Option<u32> {
        self.model.read().unwrap().token_to_id(token)
    }

    fn id_to_token(&self, id: u32) -> Option<String> {
        self.model.read().unwrap().id_to_token(id)
    }

    fn get_vocab(&self) -> HashMap<String, u32> {
        self.model.read().unwrap().get_vocab()
    }

    fn get_vocab_size(&self) -> usize {
        self.model.read().unwrap().get_vocab_size()
    }

    fn save(&self, folder: &Path, name: Option<&str>) -> tk::Result<Vec<PathBuf>> {
        self.model.read().unwrap().save(folder, name)
    }

    fn get_trainer(&self) -> Self::Trainer {
        self.model.read().unwrap().get_trainer().into()
    }
}

impl<I> From<I> for PyModel
where
    I: Into<ModelWrapper>,
{
    fn from(model: I) -> Self {
        Self {
            model: Arc::new(RwLock::new(model.into())),
        }
    }
}

#[pymethods]
impl PyModel {
    #[new]
    #[pyo3(text_signature = None)]
    fn __new__() -> Self {
        // Instantiate a default empty model. This doesn't really make sense, but we need
        // to be able to instantiate an empty model for pickle capabilities.
        PyModel {
            model: Arc::new(RwLock::new(BPE::default().into())),
        }
    }

    fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
        let data = serde_json::to_string(&self.model).map_err(|e| {
            exceptions::PyException::new_err(format!(
                "Error while attempting to pickle Model: {}",
                e
            ))
        })?;
        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
    }

    fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
        match state.extract::<&PyBytes>(py) {
            Ok(s) => {
                self.model = serde_json::from_slice(s.as_bytes()).map_err(|e| {
                    exceptions::PyException::new_err(format!(
                        "Error while attempting to unpickle Model: {}",
                        e
                    ))
                })?;
                Ok(())
            }
            Err(e) => Err(e),
        }
    }
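
    // Pickle support above round-trips the wrapped model through its JSON
    // serialization. A minimal Python-side sketch of what this enables
    // (illustrative only, not part of this file):
    //
    //     import pickle
    //     from tokenizers.models import BPE
    //
    //     model = BPE()
    //     restored = pickle.loads(pickle.dumps(model))  # uses __getstate__/__setstate__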

    /// Tokenize a sequence
    ///
    /// Args:
    ///     sequence (:obj:`str`):
    ///         A sequence to tokenize
    ///
    /// Returns:
    ///     A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
    #[pyo3(text_signature = "(self, sequence)")]
    fn tokenize(&self, sequence: &str) -> PyResult<Vec<PyToken>> {
        Ok(ToPyResult(self.model.read().unwrap().tokenize(sequence))
            .into_py()?
            .into_iter()
            .map(|t| t.into())
            .collect())
    }

    /// Get the ID associated with a token
    ///
    /// Args:
    ///     token (:obj:`str`):
    ///         A token to convert to an ID
    ///
    /// Returns:
    ///     :obj:`int`: The ID associated with the token
    #[pyo3(text_signature = "(self, token)")]
    fn token_to_id(&self, token: &str) -> Option<u32> {
        self.model.read().unwrap().token_to_id(token)
    }

    /// Get the token associated with an ID
    ///
    /// Args:
    ///     id (:obj:`int`):
    ///         An ID to convert to a token
    ///
    /// Returns:
    ///     :obj:`str`: The token associated with the ID
    #[pyo3(text_signature = "(self, id)")]
    fn id_to_token(&self, id: u32) -> Option<String> {
        self.model.read().unwrap().id_to_token(id)
    }

    /// Save the current model
    ///
    /// Save the current model in the given folder, using the given prefix for the various
    /// files that will get created.
    /// Any file with the same name that already exists in this folder will be overwritten.
    ///
    /// Args:
    ///     folder (:obj:`str`):
    ///         The path to the target folder in which to save the various files
    ///
    ///     prefix (:obj:`str`, `optional`):
    ///         An optional prefix, used to prefix each file name
    ///
    /// Returns:
    ///     :obj:`List[str]`: The list of saved files
    #[pyo3(text_signature = "(self, folder, prefix)")]
    fn save<'a>(
        &self,
        py: Python<'_>,
        folder: &str,
        mut prefix: Option<&'a str>,
        name: Option<&'a str>,
    ) -> PyResult<Vec<String>> {
        if name.is_some() {
            deprecation_warning(
                py,
                "0.10.0",
                "Parameter `name` of Model.save has been renamed `prefix`",
            )?;
            if prefix.is_none() {
                prefix = name;
            }
        }

        let saved: PyResult<Vec<_>> =
            ToPyResult(self.model.read().unwrap().save(Path::new(folder), prefix)).into();

        Ok(saved?
            .into_iter()
            .map(|path| path.to_string_lossy().into_owned())
            .collect())
    }

    /// Get the associated :class:`~tokenizers.trainers.Trainer`
    ///
    /// Retrieve the :class:`~tokenizers.trainers.Trainer` associated with this
    /// :class:`~tokenizers.models.Model`.
    ///
    /// Returns:
    ///     :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
    #[pyo3(text_signature = "(self)")]
    fn get_trainer(&self, py: Python<'_>) -> PyResult<PyObject> {
        PyTrainer::from(self.model.read().unwrap().get_trainer()).get_as_subtype(py)
    }

    fn __repr__(&self) -> PyResult<String> {
        crate::utils::serde_pyo3::repr(self)
            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
    }

    fn __str__(&self) -> PyResult<String> {
        crate::utils::serde_pyo3::to_string(self)
            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
    }
}
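
// A short, illustrative sketch of the Python-side `Model` API defined above
// (not part of this file; the file names and output directory are placeholders):
//
//     from tokenizers.models import BPE
//
//     model = BPE.from_file("vocab.json", "merges.txt")
//     tokens = model.tokenize("hello world")      # list of Token
//     token_id = model.token_to_id("hello")       # int or None
//     saved = model.save("out_dir", prefix="my")  # list of saved file paths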

/// An implementation of the BPE (Byte-Pair Encoding) algorithm
///
/// Args:
///     vocab (:obj:`Dict[str, int]`, `optional`):
///         A dictionary of string keys and their ids :obj:`{"am": 0,...}`
///
///     merges (:obj:`List[Tuple[str, str]]`, `optional`):
///         A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
///
///     cache_capacity (:obj:`int`, `optional`):
///         The number of words that the BPE cache can contain. The cache speeds up
///         the process by keeping the result of the merge operations for a number
///         of words.
///
///     dropout (:obj:`float`, `optional`):
///         A float between 0 and 1 that represents the BPE dropout to use.
///
///     unk_token (:obj:`str`, `optional`):
///         The unknown token to be used by the model.
///
///     continuing_subword_prefix (:obj:`str`, `optional`):
///         The prefix to attach to subword units that don't represent a beginning of word.
///
///     end_of_word_suffix (:obj:`str`, `optional`):
///         The suffix to attach to subword units that represent an end of word.
///
///     fuse_unk (:obj:`bool`, `optional`):
///         Whether to fuse any subsequent unknown tokens into a single one
///
///     byte_fallback (:obj:`bool`, `optional`):
///         Whether to use the spm byte-fallback trick (defaults to False)
///
///     ignore_merges (:obj:`bool`, `optional`):
///         Whether or not to match tokens with the vocab before using merges.
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
pub struct PyBPE {}

impl PyBPE {
    fn with_builder(
        mut builder: BpeBuilder,
        kwargs: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<(Self, PyModel)> {
        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "cache_capacity" => builder = builder.cache_capacity(value.extract()?),
                    "dropout" => {
                        if let Some(dropout) = value.extract()? {
                            builder = builder.dropout(dropout);
                        }
                    }
                    "unk_token" => {
                        if let Some(unk) = value.extract()? {
                            builder = builder.unk_token(unk);
                        }
                    }
                    "continuing_subword_prefix" => {
                        builder = builder.continuing_subword_prefix(value.extract()?)
                    }
                    "end_of_word_suffix" => builder = builder.end_of_word_suffix(value.extract()?),
                    "fuse_unk" => builder = builder.fuse_unk(value.extract()?),
                    "byte_fallback" => builder = builder.byte_fallback(value.extract()?),
                    "ignore_merges" => builder = builder.ignore_merges(value.extract()?),
                    _ => println!("Ignored unknown kwarg option {}", key),
                };
            }
        }

        match builder.build() {
            Err(e) => Err(exceptions::PyException::new_err(format!(
                "Error while initializing BPE: {}",
                e
            ))),
            Ok(bpe) => Ok((PyBPE {}, bpe.into())),
        }
    }
}

macro_rules! getter {
    ($self: ident, $variant: ident, $($name: tt)+) => {{
        let super_ = $self.as_ref();
        let model = super_.model.read().unwrap();
        if let ModelWrapper::$variant(ref mo) = *model {
            mo.$($name)+
        } else {
            unreachable!()
        }
    }};
}

macro_rules! setter {
    ($self: ident, $variant: ident, $name: ident, $value: expr) => {{
        let super_ = $self.as_ref();
        let mut model = super_.model.write().unwrap();
        if let ModelWrapper::$variant(ref mut mo) = *model {
            mo.$name = $value;
        }
    }};
}
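
// Note on the two macros above (an informal description of their expansion): a
// call like `getter!(self_, BPE, dropout)` borrows the parent `PyModel`, takes the
// read lock on the shared `ModelWrapper`, matches the `BPE` variant, and evaluates
// `mo.dropout`; `setter!` does the same with the write lock and assigns the field.
// The `unreachable!()` arm is sound because each Python subclass is only ever
// constructed around its matching `ModelWrapper` variant.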

#[derive(FromPyObject)]
enum PyVocab {
    Vocab(Vocab),
    Filename(String),
}

#[derive(FromPyObject)]
enum PyMerges {
    Merges(Merges),
    Filename(String),
}

#[pymethods]
impl PyBPE {
    #[getter]
    fn get_dropout(self_: PyRef<Self>) -> Option<f32> {
        getter!(self_, BPE, dropout)
    }

    #[setter]
    fn set_dropout(self_: PyRef<Self>, dropout: Option<f32>) {
        setter!(self_, BPE, dropout, dropout);
    }

    #[getter]
    fn get_unk_token(self_: PyRef<Self>) -> Option<String> {
        getter!(self_, BPE, unk_token.clone())
    }

    #[setter]
    fn set_unk_token(self_: PyRef<Self>, unk_token: Option<String>) {
        setter!(self_, BPE, unk_token, unk_token);
    }

    #[getter]
    fn get_continuing_subword_prefix(self_: PyRef<Self>) -> Option<String> {
        getter!(self_, BPE, continuing_subword_prefix.clone())
    }

    #[setter]
    fn set_continuing_subword_prefix(
        self_: PyRef<Self>,
        continuing_subword_prefix: Option<String>,
    ) {
        setter!(
            self_,
            BPE,
            continuing_subword_prefix,
            continuing_subword_prefix
        );
    }

    #[getter]
    fn get_end_of_word_suffix(self_: PyRef<Self>) -> Option<String> {
        getter!(self_, BPE, end_of_word_suffix.clone())
    }

    #[setter]
    fn set_end_of_word_suffix(self_: PyRef<Self>, end_of_word_suffix: Option<String>) {
        setter!(self_, BPE, end_of_word_suffix, end_of_word_suffix);
    }

    #[getter]
    fn get_fuse_unk(self_: PyRef<Self>) -> bool {
        getter!(self_, BPE, fuse_unk)
    }

    #[setter]
    fn set_fuse_unk(self_: PyRef<Self>, fuse_unk: bool) {
        setter!(self_, BPE, fuse_unk, fuse_unk);
    }

    #[getter]
    fn get_byte_fallback(self_: PyRef<Self>) -> bool {
        getter!(self_, BPE, byte_fallback)
    }

    #[setter]
    fn set_byte_fallback(self_: PyRef<Self>, byte_fallback: bool) {
        setter!(self_, BPE, byte_fallback, byte_fallback);
    }

    #[getter]
    fn get_ignore_merges(self_: PyRef<Self>) -> bool {
        getter!(self_, BPE, ignore_merges)
    }

    #[setter]
    fn set_ignore_merges(self_: PyRef<Self>, ignore_merges: bool) {
        setter!(self_, BPE, ignore_merges, ignore_merges);
    }

    #[new]
    #[pyo3(
        signature = (vocab=None, merges=None, **kwargs),
        text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False, ignore_merges=False)")]
    fn new(
        py: Python<'_>,
        vocab: Option<PyVocab>,
        merges: Option<PyMerges>,
        kwargs: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<(Self, PyModel)> {
        if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
            return Err(exceptions::PyValueError::new_err(
                "`vocab` and `merges` must both be specified",
            ));
        }

        let mut builder = BPE::builder();
        if let (Some(vocab), Some(merges)) = (vocab, merges) {
            match (vocab, merges) {
                (PyVocab::Vocab(vocab), PyMerges::Merges(merges)) => {
                    builder = builder.vocab_and_merges(vocab, merges);
                }
                (PyVocab::Filename(vocab_filename), PyMerges::Filename(merges_filename)) => {
                    deprecation_warning(
                        py,
                        "0.9.0",
                        "BPE.__init__ will not create from files anymore, try `BPE.from_file` instead",
                    )?;
                    builder =
                        builder.files(vocab_filename.to_string(), merges_filename.to_string());
                }
                _ => {
                    return Err(exceptions::PyValueError::new_err(
                        "`vocab` and `merges` must both be from memory or both be filenames",
                    ));
                }
            }
        }

        PyBPE::with_builder(builder, kwargs)
    }
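
    // Illustrative Python-side construction from in-memory values (not part of this
    // file; the tiny vocab and merges are made-up):
    //
    //     from tokenizers.models import BPE
    //
    //     bpe = BPE(
    //         vocab={"[UNK]": 0, "a": 1, "b": 2, "ab": 3},
    //         merges=[("a", "b")],
    //         unk_token="[UNK]",
    //     )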

    /// Read a :obj:`vocab.json` and a :obj:`merges.txt` files
    ///
    /// This method provides a way to read and parse the content of these files,
    /// returning the relevant data structures. If you want to instantiate some BPE models
    /// from memory, this method gives you the expected input from the standard files.
    ///
    /// Args:
    ///     vocab (:obj:`str`):
    ///         The path to a :obj:`vocab.json` file
    ///
    ///     merges (:obj:`str`):
    ///         The path to a :obj:`merges.txt` file
    ///
    /// Returns:
    ///     A :obj:`Tuple` with the vocab and the merges:
    ///         The vocabulary and merges loaded into memory
    #[staticmethod]
    #[pyo3(text_signature = "(vocab, merges)")]
    fn read_file(vocab: &str, merges: &str) -> PyResult<(Vocab, Merges)> {
        BPE::read_file(vocab, merges).map_err(|e| {
            exceptions::PyException::new_err(format!(
                "Error while reading vocab & merges files: {}",
                e
            ))
        })
    }

    /// Instantiate a BPE model from the given files.
    ///
    /// This method is roughly equivalent to doing::
    ///
    ///    vocab, merges = BPE.read_file(vocab_filename, merges_filename)
    ///    bpe = BPE(vocab, merges)
    ///
    /// If you don't need to keep the :obj:`vocab, merges` values lying around,
    /// this method is more optimized than manually calling
    /// :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
    ///
    /// Args:
    ///     vocab (:obj:`str`):
    ///         The path to a :obj:`vocab.json` file
    ///
    ///     merges (:obj:`str`):
    ///         The path to a :obj:`merges.txt` file
    ///
    /// Returns:
    ///     :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
    #[classmethod]
    #[pyo3(signature = (vocab, merges, **kwargs))]
    #[pyo3(text_signature = "(cls, vocab, merges, **kwargs)")]
    fn from_file(
        _cls: &Bound<'_, PyType>,
        py: Python,
        vocab: &str,
        merges: &str,
        kwargs: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<Py<Self>> {
        let (vocab, merges) = BPE::read_file(vocab, merges).map_err(|e| {
            exceptions::PyException::new_err(format!("Error while reading BPE files: {}", e))
        })?;
        Py::new(
            py,
            PyBPE::new(
                py,
                Some(PyVocab::Vocab(vocab)),
                Some(PyMerges::Merges(merges)),
                kwargs,
            )?,
        )
    }
}
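
// Illustrative Python-side loading from the standard files (not part of this file;
// the paths are placeholders):
//
//     from tokenizers.models import BPE
//
//     vocab, merges = BPE.read_file("vocab.json", "merges.txt")
//     bpe = BPE(vocab, merges)
//     # or, in one optimized step:
//     bpe = BPE.from_file("vocab.json", "merges.txt")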

/// An implementation of the WordPiece algorithm
///
/// Args:
///     vocab (:obj:`Dict[str, int]`, `optional`):
///         A dictionary of string keys and their ids :obj:`{"am": 0,...}`
///
///     unk_token (:obj:`str`, `optional`):
///         The unknown token to be used by the model.
///
///     max_input_chars_per_word (:obj:`int`, `optional`):
///         The maximum number of characters to allow in a single word.
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordPiece")]
pub struct PyWordPiece {}

impl PyWordPiece {
    fn with_builder(
        mut builder: WordPieceBuilder,
        kwargs: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<(Self, PyModel)> {
        if let Some(kwargs) = kwargs {
            for (key, val) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "unk_token" => {
                        builder = builder.unk_token(val.extract()?);
                    }
                    "max_input_chars_per_word" => {
                        builder = builder.max_input_chars_per_word(val.extract()?);
                    }
                    "continuing_subword_prefix" => {
                        builder = builder.continuing_subword_prefix(val.extract()?);
                    }
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        match builder.build() {
            Err(e) => Err(exceptions::PyException::new_err(format!(
                "Error while initializing WordPiece: {}",
                e
            ))),
            Ok(wordpiece) => Ok((PyWordPiece {}, wordpiece.into())),
        }
    }
}

#[pymethods]
impl PyWordPiece {
    #[getter]
    fn get_unk_token(self_: PyRef<Self>) -> String {
        getter!(self_, WordPiece, unk_token.clone())
    }

    #[setter]
    fn set_unk_token(self_: PyRef<Self>, unk_token: String) {
        setter!(self_, WordPiece, unk_token, unk_token);
    }

    #[getter]
    fn get_continuing_subword_prefix(self_: PyRef<Self>) -> String {
        getter!(self_, WordPiece, continuing_subword_prefix.clone())
    }

    #[setter]
    fn set_continuing_subword_prefix(self_: PyRef<Self>, continuing_subword_prefix: String) {
        setter!(
            self_,
            WordPiece,
            continuing_subword_prefix,
            continuing_subword_prefix
        );
    }

    #[getter]
    fn get_max_input_chars_per_word(self_: PyRef<Self>) -> usize {
        getter!(self_, WordPiece, max_input_chars_per_word)
    }

    #[setter]
    fn set_max_input_chars_per_word(self_: PyRef<Self>, max: usize) {
        setter!(self_, WordPiece, max_input_chars_per_word, max);
    }

    #[new]
    #[pyo3(signature = (vocab=None, **kwargs), text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
    fn new(
        py: Python<'_>,
        vocab: Option<PyVocab>,
        kwargs: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<(Self, PyModel)> {
        let mut builder = WordPiece::builder();

        if let Some(vocab) = vocab {
            match vocab {
                PyVocab::Vocab(vocab) => {
                    builder = builder.vocab(vocab);
                }
                PyVocab::Filename(vocab_filename) => {
                    deprecation_warning(
                        py,
                        "0.9.0",
                        "WordPiece.__init__ will not create from files anymore, try `WordPiece.from_file` instead",
                    )?;
                    builder = builder.files(vocab_filename.to_string());
                }
            }
        }

        PyWordPiece::with_builder(builder, kwargs)
    }
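
    // Illustrative Python-side construction (not part of this file; the tiny vocab
    // is made-up):
    //
    //     from tokenizers.models import WordPiece
    //
    //     wp = WordPiece(vocab={"[UNK]": 0, "un": 1, "##able": 2}, unk_token="[UNK]")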

    /// Read a :obj:`vocab.txt` file
    ///
    /// This method provides a way to read and parse the content of a standard `vocab.txt`
    /// file as used by the WordPiece Model, returning the relevant data structures. If you
    /// want to instantiate some WordPiece models from memory, this method gives you the
    /// expected input from the standard files.
    ///
    /// Args:
    ///     vocab (:obj:`str`):
    ///         The path to a :obj:`vocab.txt` file
    ///
    /// Returns:
    ///     :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
    #[staticmethod]
    #[pyo3(text_signature = "(vocab)")]
    fn read_file(vocab: &str) -> PyResult<Vocab> {
        WordPiece::read_file(vocab).map_err(|e| {
            exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e))
        })
    }

    /// Instantiate a WordPiece model from the given file
    ///
    /// This method is roughly equivalent to doing::
    ///
    ///     vocab = WordPiece.read_file(vocab_filename)
    ///     wordpiece = WordPiece(vocab)
    ///
    /// If you don't need to keep the :obj:`vocab` values lying around, this method is
    /// more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
    /// initialize a :class:`~tokenizers.models.WordPiece`
    ///
    /// Args:
    ///     vocab (:obj:`str`):
    ///         The path to a :obj:`vocab.txt` file
    ///
    /// Returns:
    ///     :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
    #[classmethod]
    #[pyo3(signature = (vocab, **kwargs))]
    #[pyo3(text_signature = "(cls, vocab, **kwargs)")]
    fn from_file(
        _cls: &Bound<'_, PyType>,
        py: Python,
        vocab: &str,
        kwargs: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<Py<Self>> {
        let vocab = WordPiece::read_file(vocab).map_err(|e| {
            exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e))
        })?;
        Py::new(
            py,
            PyWordPiece::new(py, Some(PyVocab::Vocab(vocab)), kwargs)?,
        )
    }
}

/// An implementation of the WordLevel algorithm
///
/// The simplest tokenizer model, based on mapping tokens to their corresponding IDs.
///
/// Args:
///     vocab (:obj:`Dict[str, int]`, `optional`):
///         A dictionary of string keys and their ids :obj:`{"am": 0,...}`
///
///     unk_token (:obj:`str`, `optional`):
///         The unknown token to be used by the model.
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordLevel")]
pub struct PyWordLevel {}

#[pymethods]
impl PyWordLevel {
    #[getter]
    fn get_unk_token(self_: PyRef<Self>) -> String {
        getter!(self_, WordLevel, unk_token.clone())
    }

    #[setter]
    fn set_unk_token(self_: PyRef<Self>, unk_token: String) {
        setter!(self_, WordLevel, unk_token, unk_token);
    }

    #[new]
    #[pyo3(signature = (vocab=None, unk_token = None), text_signature = "(self, vocab, unk_token)")]
    fn new(
        py: Python<'_>,
        vocab: Option<PyVocab>,
        unk_token: Option<String>,
    ) -> PyResult<(Self, PyModel)> {
        let mut builder = WordLevel::builder();

        if let Some(vocab) = vocab {
            match vocab {
                PyVocab::Vocab(vocab) => {
                    builder = builder.vocab(vocab);
                }
                PyVocab::Filename(vocab_filename) => {
                    deprecation_warning(
                        py,
                        "0.9.0",
                        "WordLevel.__init__ will not create from files anymore, \
                         try `WordLevel.from_file` instead",
                    )?;
                    builder = builder.files(vocab_filename.to_string());
                }
            };
        }
        if let Some(unk_token) = unk_token {
            builder = builder.unk_token(unk_token);
        }

        Ok((
            PyWordLevel {},
            builder
                .build()
                .map_err(|e| exceptions::PyException::new_err(e.to_string()))?
                .into(),
        ))
    }
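
    // Illustrative Python-side construction (not part of this file; the tiny vocab
    // is made-up):
    //
    //     from tokenizers.models import WordLevel
    //
    //     wl = WordLevel(vocab={"[UNK]": 0, "hello": 1}, unk_token="[UNK]")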

    /// Read a :obj:`vocab.json`
    ///
    /// This method provides a way to read and parse the content of a vocabulary file,
    /// returning the relevant data structures. If you want to instantiate some WordLevel models
    /// from memory, this method gives you the expected input from the standard files.
    ///
    /// Args:
    ///     vocab (:obj:`str`):
    ///         The path to a :obj:`vocab.json` file
    ///
    /// Returns:
    ///     :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
    #[staticmethod]
    #[pyo3(text_signature = "(vocab)")]
    fn read_file(vocab: &str) -> PyResult<Vocab> {
        WordLevel::read_file(vocab).map_err(|e| {
            exceptions::PyException::new_err(format!("Error while reading WordLevel file: {}", e))
        })
    }

    /// Instantiate a WordLevel model from the given file
    ///
    /// This method is roughly equivalent to doing::
    ///
    ///     vocab = WordLevel.read_file(vocab_filename)
    ///     wordlevel = WordLevel(vocab)
    ///
    /// If you don't need to keep the :obj:`vocab` values lying around, this method is
    /// more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
    /// initialize a :class:`~tokenizers.models.WordLevel`
    ///
    /// Args:
    ///     vocab (:obj:`str`):
    ///         The path to a :obj:`vocab.json` file
    ///
    /// Returns:
    ///     :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
    #[classmethod]
    #[pyo3(signature = (vocab, unk_token = None))]
    #[pyo3(text_signature = "(cls, vocab, unk_token)")]
    fn from_file(
        _cls: &Bound<'_, PyType>,
        py: Python,
        vocab: &str,
        unk_token: Option<String>,
    ) -> PyResult<Py<Self>> {
        let vocab = WordLevel::read_file(vocab).map_err(|e| {
            exceptions::PyException::new_err(format!("Error while reading WordLevel file: {}", e))
        })?;
        Py::new(
            py,
            PyWordLevel::new(py, Some(PyVocab::Vocab(vocab)), unk_token)?,
        )
    }
}

/// An implementation of the Unigram algorithm
///
/// Args:
///     vocab (:obj:`List[Tuple[str, float]]`, `optional`):
///         A list of vocabulary items and their relative score [("am", -0.2442),...]
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
pub struct PyUnigram {}

#[pymethods]
impl PyUnigram {
    #[new]
    #[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
    fn new(
        vocab: Option<Vec<(String, f64)>>,
        unk_id: Option<usize>,
        byte_fallback: Option<bool>,
    ) -> PyResult<(Self, PyModel)> {
        match (vocab, unk_id, byte_fallback) {
            (Some(vocab), unk_id, byte_fallback) => {
                let model =
                    Unigram::from(vocab, unk_id, byte_fallback.unwrap_or(false)).map_err(|e| {
                        exceptions::PyException::new_err(format!(
                            "Error while loading Unigram: {}",
                            e
                        ))
                    })?;
                Ok((PyUnigram {}, model.into()))
            }
            (None, None, _) => Ok((PyUnigram {}, Unigram::default().into())),
            _ => Err(exceptions::PyValueError::new_err(
                "`vocab` and `unk_id` must both be specified",
            )),
        }
    }
}
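
// Illustrative Python-side construction (not part of this file; the scores are
// made-up values, and `unk_id` indexes into the vocab list):
//
//     from tokenizers.models import Unigram
//
//     uni = Unigram(vocab=[("<unk>", 0.0), ("hello", -2.5)], unk_id=0)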

/// Models Module
#[pymodule]
pub fn models(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<PyModel>()?;
    m.add_class::<PyBPE>()?;
    m.add_class::<PyWordPiece>()?;
    m.add_class::<PyWordLevel>()?;
    m.add_class::<PyUnigram>()?;
    Ok(())
}

#[cfg(test)]
mod test {
    use crate::models::PyModel;
    use pyo3::prelude::*;
    use tk::models::bpe::BPE;
    use tk::models::ModelWrapper;

    #[test]
    fn get_subtype() {
        Python::with_gil(|py| {
            let py_model = PyModel::from(BPE::default());
            let py_bpe = py_model.get_as_subtype(py).unwrap();
            assert_eq!("BPE", py_bpe.bind(py).get_type().qualname().unwrap());
        })
    }

    #[test]
    fn serialize() {
        let rs_bpe = BPE::default();
        let rs_bpe_ser = serde_json::to_string(&rs_bpe).unwrap();
        let rs_wrapper: ModelWrapper = rs_bpe.into();
        let rs_wrapper_ser = serde_json::to_string(&rs_wrapper).unwrap();

        let py_model = PyModel::from(rs_wrapper);
        let py_ser = serde_json::to_string(&py_model).unwrap();
        assert_eq!(py_ser, rs_bpe_ser);
        assert_eq!(py_ser, rs_wrapper_ser);

        let py_model: PyModel = serde_json::from_str(&rs_bpe_ser).unwrap();
        match *py_model.model.as_ref().read().unwrap() {
            ModelWrapper::BPE(_) => (),
            _ => panic!("Expected a BPE model."),
        };

        let py_model: PyModel = serde_json::from_str(&rs_wrapper_ser).unwrap();
        match *py_model.model.as_ref().read().unwrap() {
            ModelWrapper::BPE(_) => (),
            _ => panic!("Expected a BPE model."),
        };
    }
}