import pandas as pd
import calamancy

# Load the corpus.
df = pd.read_csv("corpus_with_group.csv")

# Initialize the calamancy NLP pipeline once; it is reused for every row.
nlp = calamancy.load("tl_calamancy_md-0.2.0")

# Sanity check that the corpus loaded.
print(df.head())


def tokenize_and_lemmatize(text):
    """Return the lowercased lemmas of *text* as a list of strings.

    The text is run through the module-level ``nlp`` pipeline. Punctuation
    and whitespace tokens are dropped. Each remaining token contributes its
    lemma (or its surface text when the lemma is empty), lowercased and
    split on whitespace.

    Args:
        text: The cell value to process. Non-string values are converted
            with ``str()``; missing values (``None``/NaN) yield ``[]``.

    Returns:
        A list of lowercase token strings.
    """
    # A missing cell would otherwise be stringified to "nan"/"none" and
    # produce a bogus token.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    tokens = []
    for token in nlp(str(text)):
        if token.is_punct or token.is_space:
            continue
        # Fall back to the surface form when the pipeline gives no lemma.
        lemma = (token.lemma_ or token.text).lower()
        # A lemma may contain whitespace (tokens merged during
        # tokenization); split it so each piece is its own token.
        tokens.extend(lemma.split())
    return tokens


df["tokens"] = df["text"].apply(tokenize_and_lemmatize)

# NOTE(review): the output path has no ".csv" extension; kept as-is because
# other code may read this exact name -- confirm before renaming.
df.to_csv("Tokenized_Corpus", index=False)
print(df.head())