Analysis_System / character_network /named_entity_recognizer.py
kankur0007's picture
Add application file
4475241
import spacy
from nltk.tokenize import sent_tokenize
import pandas as pd
from ast import literal_eval
import os
import sys
import pathlib
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset
class NamedEntityRecognizer:
def __init__(self):
self.nlp_model = self.load_model()
pass
def load_model(self):
nlp = spacy.load("en_core_web_trf")
return nlp
def get_ners_inference(self,script):
script_sentences = sent_tokenize(script)
ner_output = []
for sentence in script_sentences:
doc = self.nlp_model(sentence)
ners = set()
for entity in doc.ents:
if entity.label_ =="PERSON":
full_name = entity.text
first_name = full_name.split(" ")[0]
first_name = first_name.strip()
ners.add(first_name)
ner_output.append(ners)
return ner_output
def get_ners(self,dataset_path,save_path=None):
if save_path is not None and os.path.exists(save_path):
df = pd.read_csv(save_path)
df['ners'] = df['ners'].apply(lambda x: literal_eval(x) if isinstance(x,str) else x)
return df
# load dataset
df = load_subtitles_dataset(dataset_path)
# Run Inference
df['ners'] = df['script'].apply(self.get_ners_inference)
if save_path is not None:
df.to_csv(save_path,index=False)
return df