# AISOP-oop-classifier / recognize.py
# recognize: performs two-level topic recognition
# Authors: Paul Libbrecht, Pierre Günthner and Alexander Gantikow from the AISOP project
# Installation: Install spacy then...
# Usage: python recognize.py l1-model l2-models "this is a text"
# l1-model path: a relative path (starting from this script) pointing to the level-1 model folder
# l2-model path: a relative path (starting from this script) pointing to the folder containing a folder for each L2-label
# "this is a text": the text to recognize
import sys # System-specific parameters and functions, part of Py
import spacy # Natural language processing
from pathlib import Path # Object-oriented filesystem paths, part of Py
import json # JSON object dumping functions
RoundTo = 2 # Round scores to this many decimal places (matches filterDoc's default)
Encoding = 'utf8' # Encoding of the html file to be read and parsed via BeautifulSoup (unused here; presumably shared with a sibling script — verify)
ScoreThreshold = 0.2 # Min. spacy probability value for a category to be kept by filterDoc
MaxResults = 3 # Max. number of concepts/labels kept per classification pass
ParagraphMinLetters = 10 # Min. number of letters of paragraph to be considered in analysis (unused in this script)
ListMinLetters = 10 # Min. number of letters of <ul> and <ol> to be considered in analysis (unused in this script)
SubModels = {} # NOTE(review): declared but never used in this file — confirm before removing
Nlp = spacy.load(sys.argv[1]) # Level-1 spacy text-categorizer model, loaded from argv path
SubModelDir = Path(__file__).parent.joinpath(sys.argv[2]).absolute() # Folder holding one level-2 model folder per L1 label
input = sys.argv[3] # NOTE(review): shadows the builtin input(); the text to classify
def filterDoc(doc, scoreThreshold, maxResults, roundTo=2):
    """Keep the top-scoring categories of a classified spacy doc.

    Categories whose score is strictly above *scoreThreshold* are
    ordered by descending score, truncated to at most *maxResults*
    entries, and returned as a {label: rounded_score} dict.
    """
    kept = [(label, score)
            for label, score in doc.cats.items()
            if score > scoreThreshold]
    # Descending by score; sort() is stable, just like the original sorted().
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return {label: round(score, roundTo)
            for label, score in kept[:maxResults]}
def recognize(text):
    """Classify *text* with the level-1 model, then refine each found L1
    label with its level-2 sub-model when one exists on disk.

    Returns a dict mapping each (stripped) L1 label to
    {'score': l1_score, 'subs': {l2_label: score, ...}} — the 'subs' key
    is present only when a sub-model folder was found. When one or more
    sub-models are missing, a 'messages' key describes every missing path
    (the original overwrote it each iteration, reporting only the last).
    """
    # Level-1 classification with the globally loaded model.
    labels = filterDoc(Nlp(text), ScoreThreshold, MaxResults)
    relabels = dict()
    missing = []  # accumulate ALL missing-submodel notices
    for label in labels.keys():
        label2 = label.strip()
        subModelPath = SubModelDir.joinpath(label2).absolute()
        if subModelPath.exists():
            # Load the per-label L2 model into a LOCAL variable: the
            # original assigned the global Nlp here, silently replacing
            # the level-1 model for any later recognize() call.
            nlpSub = spacy.load(subModelPath)
            docSub = filterDoc(nlpSub(text), ScoreThreshold, MaxResults)
            relabels[label2] = {'score': labels[label], 'subs': docSub}
        else:
            relabels[label2] = {'score': labels[label]}
            missing.append("Submodel path \"" + str(subModelPath) + "\" not found")
    if missing:
        # Still a single string, as downstream consumers expect.
        relabels["messages"] = "; ".join(missing)
    return relabels
# Entry point: classify the command-line text and emit the result as JSON.
result = recognize(input)
print(json.dumps(result))