# AISOP-oop-classifier / recognize.py
# recognize: performs two-level topic recognition
# Authors: Paul Libbrecht, Pierre Günthner and Alexander Gantikow from the AISOP project
# Installation: Install spacy then...
# Usage: python recognize.py l1-model l2-models "this is a text"
# l1-model path: a relative path (starting from this script) pointing to the level-1 model folder
# l2-model path: a relative path (starting from this script) pointing to the folder containing a folder for each L2-label
# "this is a text": the text to recognize
import sys # System-specific parameters and functions, part of Py
import spacy # Natural language processing
from pathlib import Path # Object-oriented filesystem paths, part of Py
import json # JSON object dumping functions
RoundTo = 2 # Round scores to this many decimal places (matches filterDoc's default)
Encoding = 'utf8' # Encoding of the html file to be read and parsed via BeautifulSoup (unused here; presumably shared with a sibling script — verify)
ScoreThreshold = 0.2 # Min. spacy probability value for a category to be kept by filterDoc
MaxResults = 3 # Max. number of concepts/labels kept per classification pass
ParagraphMinLetters = 10 # Min. number of letters of paragraph to be considered in analysis (unused in this script)
ListMinLetters = 10 # Min. number of letters of <ul> and <ol> to be considered in analysis (unused in this script)
SubModels = {} # NOTE(review): declared but never used in this file — confirm before removing
Nlp = spacy.load(sys.argv[1]) # Level-1 spacy text-categorizer model, loaded from argv path
SubModelDir = Path(__file__).parent.joinpath(sys.argv[2]).absolute() # Folder holding one level-2 model folder per L1 label
input = sys.argv[3] # NOTE(review): shadows the builtin input(); the text to classify
def filterDoc(doc, scoreThreshold, maxResults, roundTo=2):
    """Keep the top-scoring categories of a classified spacy doc.

    Categories whose score is strictly above *scoreThreshold* are
    ordered by descending score, truncated to at most *maxResults*
    entries, and returned as a {label: rounded_score} dict.
    """
    kept = [(label, score)
            for label, score in doc.cats.items()
            if score > scoreThreshold]
    # Descending by score; sort() is stable, just like the original sorted().
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return {label: round(score, roundTo)
            for label, score in kept[:maxResults]}
def recognize(text):
    """Classify *text* with the level-1 model, then refine each found L1
    label with its level-2 sub-model when one exists on disk.

    Returns a dict mapping each (stripped) L1 label to
    {'score': l1_score, 'subs': {l2_label: score, ...}} — the 'subs' key
    is present only when a sub-model folder was found. When one or more
    sub-models are missing, a 'messages' key describes every missing path
    (the original overwrote it each iteration, reporting only the last).
    """
    # Level-1 classification with the globally loaded model.
    labels = filterDoc(Nlp(text), ScoreThreshold, MaxResults)
    relabels = dict()
    missing = []  # accumulate ALL missing-submodel notices
    for label in labels.keys():
        label2 = label.strip()
        subModelPath = SubModelDir.joinpath(label2).absolute()
        if subModelPath.exists():
            # Load the per-label L2 model into a LOCAL variable: the
            # original assigned the global Nlp here, silently replacing
            # the level-1 model for any later recognize() call.
            nlpSub = spacy.load(subModelPath)
            docSub = filterDoc(nlpSub(text), ScoreThreshold, MaxResults)
            relabels[label2] = {'score': labels[label], 'subs': docSub}
        else:
            relabels[label2] = {'score': labels[label]}
            missing.append("Submodel path \"" + str(subModelPath) + "\" not found")
    if missing:
        # Still a single string, as downstream consumers expect.
        relabels["messages"] = "; ".join(missing)
    return relabels
# Entry point: classify the command-line text and emit the result as JSON.
result = recognize(input)
print(json.dumps(result))