import pandas as pd
import calamancy

# Read the corpus CSV into a DataFrame.
df = pd.read_csv("corpus_with_group.csv")

# Build the calamanCy pipeline that tokenize_and_lemmatize() relies on.
nlp = calamancy.load("tl_calamancy_md-0.2.0")

# Sanity check: show the first rows to confirm the corpus was read.
print(df.head(n=5))

def tokenize_and_lemmatize(text):
    """Return the lowercase lemmas of *text* as a flat list of strings.

    The text is run through the module-level ``nlp`` pipeline. Punctuation
    and whitespace tokens are dropped; every remaining token contributes its
    lemma (or its surface text when the lemma is empty), lowercased.

    Args:
        text: Value to process. It is converted with ``str()`` before being
            handed to ``nlp``. Missing values (``None``, NaN, ``pd.NA``)
            produce an empty list.

    Returns:
        A list of lowercase token strings, possibly empty.
    """
    # An empty CSV cell is read as NaN; str(NaN) is "nan", which would be
    # tokenized into a bogus "nan" token, so missing values are skipped.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    tokens = []
    for token in nlp(str(text)):
        if token.is_punct or token.is_space:
            continue

        # Fall back to the surface form when the pipeline gives no lemma.
        lemma = (token.lemma_ or token.text).lower()

        # A lemma may contain internal whitespace (tokens merged during
        # tokenization); split it so each word is its own entry.
        tokens.extend(lemma.split())
    return tokens


# Tokenize and lemmatize every entry of the "text" column, one row at a time.
token_lists = df["text"].apply(tokenize_and_lemmatize)
df["tokens"] = token_lists

# Write the augmented corpus to disk without the index column.
# NOTE(review): the output name has no ".csv" extension - confirm this is intended.
df.to_csv(path_or_buf="Tokenized_Corpus", index=False)

# Show the first rows, now including the new "tokens" column.
print(df.head(n=5))