# NOTE(review): lines of web-scraper residue (page chrome, file-size banner,
# line-number gutter) were found here; they are not part of the source and
# have been commented out so the module parses.
import spacy
from nltk.tokenize import sent_tokenize
import pandas as pd
from ast import literal_eval
import os
import sys
import pathlib
# Make the project root importable so `utils` resolves no matter which
# directory the script is launched from. The previous form,
# `pathlib.Path().parent.resolve()`, resolved against the *current working
# directory*, so the import broke when run from outside this folder.
folder_path = pathlib.Path(__file__).parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset
class NamedEntityRecognizer:
    """Extract PERSON named entities (first names only) from episode scripts.

    Uses NLTK to split a script into sentences and a spaCy transformer
    pipeline to tag entities in each sentence. Results can be cached to a
    CSV via :meth:`get_ners` and transparently reloaded on later runs.
    """

    def __init__(self):
        # Load the (expensive) spaCy pipeline once and reuse it for every call.
        self.nlp_model = self.load_model()

    def load_model(self):
        """Load and return the spaCy English transformer pipeline."""
        nlp = spacy.load("en_core_web_trf")
        return nlp

    def get_ners_inference(self, script):
        """Run NER over *script*, one sentence at a time.

        Returns a list with one ``set`` per sentence, each set holding the
        first token of every PERSON entity found in that sentence (may be
        empty when a sentence mentions no people).
        """
        # NOTE(review): sent_tokenize requires the NLTK 'punkt' data to be
        # downloaded — confirm it is provisioned in the runtime environment.
        script_sentences = sent_tokenize(script)
        ner_output = []
        for sentence in script_sentences:
            doc = self.nlp_model(sentence)
            ners = set()
            for entity in doc.ents:
                if entity.label_ == "PERSON":
                    # Keep only the first name: "Naruto Uzumaki" -> "Naruto".
                    full_name = entity.text
                    first_name = full_name.split(" ")[0].strip()
                    ners.add(first_name)
            ner_output.append(ners)
        return ner_output

    @staticmethod
    def _parse_ners(value):
        """Parse a CSV cell (stringified list of sets) back into Python objects.

        ``ast.literal_eval`` alone cannot parse ``"set()"`` — the repr that
        ``to_csv`` writes for an *empty* set — so any cached row whose script
        had a sentence with no PERSON entities would raise ``ValueError`` on
        reload. This helper walks the AST and maps empty ``set()`` calls to
        ``set()`` objects while delegating everything else to ``literal_eval``.
        """
        import ast
        if not isinstance(value, str):
            return value
        node = ast.parse(value, mode="eval").body

        def convert(n):
            if (isinstance(n, ast.Call) and isinstance(n.func, ast.Name)
                    and n.func.id == "set" and not n.args and not n.keywords):
                return set()
            if isinstance(n, ast.List):
                return [convert(e) for e in n.elts]
            if isinstance(n, ast.Set):
                return {convert(e) for e in n.elts}
            return ast.literal_eval(n)

        return convert(node)

    def get_ners(self, dataset_path, save_path=None):
        """Return the subtitles dataset with a ``ners`` column added.

        If *save_path* exists it is treated as a cache and loaded instead of
        re-running inference; otherwise the dataset at *dataset_path* is
        loaded, NER inference is run per script, and the result is written to
        *save_path* (when given) before being returned.
        """
        if save_path is not None and os.path.exists(save_path):
            # Cached run: reload and re-hydrate the stringified list-of-sets.
            df = pd.read_csv(save_path)
            df['ners'] = df['ners'].apply(self._parse_ners)
            return df

        # load dataset
        df = load_subtitles_dataset(dataset_path)
        # Run Inference (slow path: one spaCy call per sentence of every script)
        df['ners'] = df['script'].apply(self.get_ners_inference)
        if save_path is not None:
            df.to_csv(save_path, index=False)
        return df