ppolx committed on
Commit
470a58e
·
1 Parent(s): 7c46213

Runnable.

Browse files
Files changed (3) hide show
  1. .gitignore +4 -0
  2. README.md +18 -2
  3. recognize.py +61 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Created by venv; see https://docs.python.org/3/library/venv.html
2
+ .idea
3
+ .venv
4
+ .DS_Store
README.md CHANGED
@@ -14,10 +14,26 @@ tags:
14
  # AISOP-fundid-classifiers
15
 
16
  This is a series of spacy models for the classification tasks.
17
- It is part of the AISOP-domain-fundid https://gitlab.com/aisop/aisop-domain-fundid which is designed to serve for the [AISOP-webapp](https://gitlab.com/aisop/aisop-webapp).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  The [python scripts](https://gitlab.com/aisop/aisop-webapp/-/tree/main/scripts/python?ref_type=heads) there use the models and the spaCy library to classify each "paragraph" of e-portfolios stored in HTML and even generate words using tesseract (if the picture is available) and annotate them too.
20
 
21
  The scripts enrich the HTML with `data-topic-*` attributes, indicating the presence of topics in the paragraphs.
22
 
23
- The scripts can be tested in the web-app in the `/debug/` road.
 
14
  # AISOP-fundid-classifiers
15
 
16
  This is a series of spacy models for the classification tasks.
17
+ ## Try it here
18
+
19
+ - Install spaCy
20
+ - `python recognize.py l1-model l2-models "this is a text"`
21
+
22
+ ...outputs the recognition in JSON e.g.:
23
+ {
24
+ "Algorithmization": {"score": 0.39,
25
+ "subs": {"Algorithmization": 0.94, "Variable": 0.58, "programming concepts": 0.49}},
26
+ "Encryption": {"score": 0.26,
27
+ "subs": {"Encryption": 0.66, "Symmetric key systems": 0.5, "Substitution ciphers": 0.44}}
28
+ }
29
+
30
+
31
+ ## Web-App Packaging
32
+
33
+ This model is part of [AISOP-domain-fundid](https://gitlab.com/aisop/aisop-domain-fundid), which is designed to serve the [AISOP-webapp](https://gitlab.com/aisop/aisop-webapp).
34
 
35
  The [python scripts](https://gitlab.com/aisop/aisop-webapp/-/tree/main/scripts/python?ref_type=heads) there use the models and the spaCy library to classify each "paragraph" of e-portfolios stored in HTML and even generate words using tesseract (if the picture is available) and annotate them too.
36
 
37
  The scripts enrich the HTML with `data-topic-*` attributes, indicating the presence of topics in the paragraphs.
38
 
39
+ The scripts can be tested in the web-app under the `/debug/` route.
recognize.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # recognize: processes two-levels-topic-recognition
2
+ # Authors: Paul Libbrecht, Pierre Günthner and Alexander Gantikow from the AISOP project
3
+
4
+ # Installation: Install spacy then...
5
+ # Usage: python recognize.py l1-model l2-models "this is a text"
6
+ # l1-model path: a relative path (starting from this script) pointing to the level-1 model folder
7
+ # l2-model path: a relative path (starting from this script) pointing to the folder containing a folder for each L2-label
8
+ # "this is a text": the text to recognize
9
+
10
+
11
+ import sys # System-specific parameters and functions, part of Py
12
+ import spacy # Natural language processing
13
+ from pathlib import Path # Object-oriented filesystem paths, part of Py
14
+ import json # JSON object dumping functions
15
+
16
+
17
+ RoundTo = 2 # Number of decimals for rounded scores; NOTE(review): not referenced in the visible script (filterDoc has its own roundTo default)
18
+ Encoding = 'utf8' # Encoding of the html file to be read and parsed via BeautifulSoup; NOTE(review): not referenced in the visible script
19
+ ScoreThreshold = 0.2 # Min. category score (exclusive) for a label to be kept; passed to filterDoc by recognize
20
+ MaxResults = 3 # Max. number of labels kept per model run; passed to filterDoc by recognize
21
+ ParagraphMinLetters = 10 # Min. number of letters of paragraph to be considered in analysis; NOTE(review): not referenced in the visible script
22
+ ListMinLetters = 10 # Min. number of letters of <ul> and <ol> to be considered in analysis; NOTE(review): not referenced in the visible script
23
+
24
+ SubModels = {} # Module-level dict for level-2 models (presumably a cache keyed by label - confirm usage in recognize)
25
+
26
+ Nlp = spacy.load(sys.argv[1]) # Level-1 model, loaded at import time from the first command-line argument
27
+ SubModelDir = Path(__file__).parent.joinpath(sys.argv[2]).absolute() # Folder of level-2 models, resolved relative to this script's directory
28
+ input = sys.argv[3] # Text to recognize (third command-line argument); NOTE(review): this name shadows the built-in input()
29
+
30
+
31
def filterDoc(doc, scoreThreshold, maxResults, roundTo=2):
    """Return the best-scoring categories of *doc* as a ``{label: score}`` dict.

    Args:
        doc: Object exposing a ``cats`` mapping of label to numeric score
            (presumably a spaCy ``Doc`` produced by a text classifier).
        scoreThreshold: Only categories scoring strictly above this value
            are kept.
        maxResults: Maximum number of categories to return.
        roundTo: Number of decimals the returned scores are rounded to.

    Returns:
        A dict of at most ``maxResults`` entries, ordered from the highest
        to the lowest score, with scores rounded to ``roundTo`` decimals.
    """
    # Drop everything that does not clear the threshold.
    kept = [(name, score) for name, score in doc.cats.items()
            if score > scoreThreshold]
    # Best scores first; the sort is stable, so ties keep their original order.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return {name: round(score, roundTo) for name, score in kept[:maxResults]}
38
+
39
+
40
+
41
def recognize(text):
    """Run the two-level topic recognition on *text*.

    The level-1 model ``Nlp`` picks the top-level labels. For each of them,
    a level-2 model is looked up in the folder ``SubModelDir/<label>`` and,
    when present, used to score the sub-topics of that label.

    Args:
        text: The text to classify.

    Returns:
        A dict mapping each retained level-1 label (stripped of surrounding
        whitespace) to ``{'score': ..., 'subs': {...}}``. The ``'subs'`` entry
        is omitted when no sub-model folder exists for the label. In that
        case the extra key ``"messages"`` holds a human-readable note for
        every missing sub-model path, joined with ``"; "``.
    """
    # Level 1: the module-level model is only read here. It must never be
    # rebound to a sub-model, otherwise a later call would classify level 1
    # with the wrong model.
    labels = filterDoc(Nlp(text), ScoreThreshold, MaxResults)

    # Level 2: refine each level-1 label with its own sub-model, if any.
    relabels = {}
    missing = []
    for label, score in labels.items():
        label2 = label.strip()
        subModelPath = SubModelDir.joinpath(label2).absolute()

        # Load each sub-model at most once and keep it in the module cache.
        subNlp = SubModels.get(label2)
        if subNlp is None and subModelPath.exists():
            subNlp = spacy.load(subModelPath)
            SubModels[label2] = subNlp

        if subNlp is None:
            relabels[label2] = {'score': score}
            missing.append("Submodel path \"" + str(subModelPath) + "\" not found")
        else:
            docSub = filterDoc(subNlp(text), ScoreThreshold, MaxResults)
            relabels[label2] = {'score': score, 'subs': docSub}

    # Report every missing sub-model instead of only the last one.
    if missing:
        relabels["messages"] = "; ".join(missing)

    return relabels
60
+
61
+ print(json.dumps(recognize(input))) # Print the recognition result for the command-line text as one JSON document on stdout