# Alalay / models / calamancy_tokenizer.py
# Hosting-page metadata: user "Jandayl", commit b052258 ("Deploy Filipino NLP"), 731 bytes.
import pandas as pd
import calamancy
# Load the corpus from a CSV file given by a path relative to the working directory.
df = pd.read_csv("corpus_with_group.csv")
# Load the calamanCy pipeline; tokenize_and_lemmatize reads it as the global `nlp`.
nlp = calamancy.load("tl_calamancy_md-0.2.0")
# Sanity check: print the first rows to confirm the corpus was loaded.
print(df.head())
def tokenize_and_lemmatize(text):
    """Return the lowercased lemmas of *text* as a flat list of strings.

    The text is run through the module-level ``nlp`` pipeline. Punctuation
    and whitespace tokens are dropped. A token whose lemma is empty falls
    back to its surface text. Lemmas that contain internal whitespace are
    split so that every returned item is a single whitespace-free string.

    Args:
        text: Value to tokenize. Non-string values are converted with
            ``str``. Missing scalar values (``None``, NaN) yield an empty
            list instead of being tokenized as the words "None" / "nan".

    Returns:
        A list of lowercase token strings in document order.
    """
    # Empty CSV cells arrive from pandas as NaN; str() would turn them into
    # the bogus token "nan", so treat missing scalars as "no tokens".
    if (
        not isinstance(text, str)
        and pd.api.types.is_scalar(text)
        and pd.isna(text)
    ):
        return []

    tokens = []
    for token in nlp(str(text)):
        if token.is_punct or token.is_space:
            continue
        lemma = (token.lemma_ or token.text).lower()
        # A single lemma may still contain whitespace (merged tokens);
        # split it so each piece becomes its own entry.
        tokens.extend(lemma.split())
    return tokens
# Tokenize and lemmatize every value of the "text" column into a new "tokens" column.
df["tokens"] = df["text"].apply(tokenize_and_lemmatize)
# Write the result without the index column.
# NOTE(review): the output name has no ".csv" extension, and the list-valued
# "tokens" column is presumably written as its string representation --
# confirm that downstream readers expect both.
df.to_csv("Tokenized_Corpus", index=False)
# Print the first rows again, now including the "tokens" column.
print(df.head())