import spacy from nltk.tokenize import sent_tokenize import pandas as pd from ast import literal_eval import os import sys import pathlib folder_path = pathlib.Path().parent.resolve() sys.path.append(os.path.join(folder_path, '../')) from utils import load_subtitles_dataset class NamedEntityRecognizer: def __init__(self): self.nlp_model = self.load_model() pass def load_model(self): nlp = spacy.load("en_core_web_trf") return nlp def get_ners_inference(self,script): script_sentences = sent_tokenize(script) ner_output = [] for sentence in script_sentences: doc = self.nlp_model(sentence) ners = set() for entity in doc.ents: if entity.label_ =="PERSON": full_name = entity.text first_name = full_name.split(" ")[0] first_name = first_name.strip() ners.add(first_name) ner_output.append(ners) return ner_output def get_ners(self,dataset_path,save_path=None): if save_path is not None and os.path.exists(save_path): df = pd.read_csv(save_path) df['ners'] = df['ners'].apply(lambda x: literal_eval(x) if isinstance(x,str) else x) return df # load dataset df = load_subtitles_dataset(dataset_path) # Run Inference df['ners'] = df['script'].apply(self.get_ners_inference) if save_path is not None: df.to_csv(save_path,index=False) return df