Spaces:

Jandayl
/

Alalay

Sleeping

Alalay / models /calamancy_tokenizer.py

Deploy Filipino NLP

b052258 2 months ago

731 Bytes

	import pandas as pd
	import calamancy

	# Locations of the input corpus and the calamanCY pipeline to load.
	CORPUS_PATH = "corpus_with_group.csv"
	MODEL_NAME = "tl_calamancy_md-0.2.0"

	# Read the corpus CSV into a DataFrame.
	df = pd.read_csv(CORPUS_PATH)

	# Load the calamanCY pipeline; the rest of the script refers to it as `nlp`.
	nlp = calamancy.load(MODEL_NAME)

	# Sanity check: show the first rows to confirm the corpus was read.
	print(df.head())

	def tokenize_and_lemmatize(text):
	    """Return the lowercased lemmas of *text* as a flat list of strings.

	    The text is run through the module-level ``nlp`` pipeline. Punctuation
	    and whitespace tokens are dropped; every remaining token contributes
	    its lemma (or its surface text when the lemma is empty), lowercased.

	    Args:
	        text: The value to tokenize. Non-missing values are converted
	            with ``str()`` before processing. Missing values (``None``,
	            NaN, ``pd.NA``) yield an empty list instead of being
	            tokenized as their string form (e.g. ``"nan"``).

	    Returns:
	        A list of lowercase token strings, in document order.
	    """
	    # A missing cell would otherwise be stringified and emitted as a
	    # bogus token, so short-circuit before touching the pipeline.
	    if pd.api.types.is_scalar(text) and pd.isna(text):
	        return []

	    tokens = []
	    for token in nlp(str(text)):
	        if token.is_punct or token.is_space:
	            continue

	        # Fall back to the surface form when the pipeline gives no lemma.
	        lemma = (token.lemma_ or token.text).lower()

	        # A lemma may contain internal whitespace (merged tokens), so
	        # split it to keep every list entry a single word.
	        tokens.extend(lemma.split())
	    return tokens


	# Tokenize every entry of the "text" column and store the resulting
	# token lists in a new "tokens" column.
	token_lists = df["text"].apply(tokenize_and_lemmatize)
	df["tokens"] = token_lists

	# Write the augmented corpus out as CSV without the index column; the
	# target file name carries no extension.
	OUTPUT_PATH = "Tokenized_Corpus"
	df.to_csv(OUTPUT_PATH, index=False)

	# Show the first rows so the new column can be inspected.
	print(df.head())