| import pandas as pd |
| import calamancy |
|
|
| |
# Input corpus; the script later reads its "text" column.
corpus_path = "corpus_with_group.csv"
df = pd.read_csv(corpus_path)

# Pipeline loaded once at module level and reused by tokenize_and_lemmatize().
model_name = "tl_calamancy_md-0.2.0"
nlp = calamancy.load(model_name)

# Quick sanity check of the loaded data before processing.
preview = df.head()
print(preview)
|
|
def tokenize_and_lemmatize(text):
    """Return the lowercased lemmas of *text* as a flat list of strings.

    The text is run through the module-level ``nlp`` pipeline. Punctuation
    and whitespace tokens are dropped. Each remaining token contributes its
    lemma, or its surface text when the lemma is empty, lowercased and split
    on whitespace so that every list element is a single word.

    Args:
        text: The value to process. Non-string values are converted with
            ``str()``. Missing values (``None`` or a float NaN) are treated
            as empty input.

    Returns:
        A list of lowercase token strings; empty for missing input.
    """
    # Missing cells would otherwise be stringified to "None"/"nan" and end up
    # in the output as spurious tokens. ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    tokens = []
    for token in nlp(str(text)):
        if token.is_punct or token.is_space:
            continue

        # Fall back to the surface form when no lemma is available.
        lemma = (token.lemma_ or token.text).lower()

        # A lemma may contain internal whitespace; emit each piece separately.
        tokens.extend(lemma.split())
    return tokens
|
|
|
|
# Tokenize every entry of the "text" column and store the resulting lists
# in a new "tokens" column.
token_lists = df["text"].apply(tokenize_and_lemmatize)
df["tokens"] = token_lists

# Persist the augmented frame without the index column.
# NOTE(review): the output path has no ".csv" extension — confirm intended.
output_path = "Tokenized_Corpus"
df.to_csv(output_path, index=False)

# Show the first rows so the new column can be checked by eye.
result_preview = df.head()
print(result_preview)