bauerfriederike committed on
Commit
750faf9
·
1 Parent(s): 92cc354

initial model

Browse files
Files changed (5) hide show
  1. README.md +1 -0
  2. data/2024-08-16_patterns_1192.json +0 -0
  3. pipeline.py +83 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +15 -0
README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Frauenerwerbstätigkeit
data/2024-08-16_patterns_1192.json ADDED
The diff for this file is too large to render. See raw diff
 
pipeline.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import spacy
3
+ from spacy.language import Language
4
+
5
class PipelineWrapper:
    """
    Pipeline wrapper for the project 'Frauenerwerbstätigkeit' with the
    Bertelsmann Foundation team.

    Loads pre-defined patterns from a JSON file and registers them with a
    span ruler in a spaCy pipeline.
    """

    def __init__(self) -> None:
        # Populated by load_patterns() before the span ruler is added.
        self.patterns: list = []
        self.nlp: Language = spacy.load("de_core_news_sm")
        self.load_patterns("data/2024-08-16_patterns_1192.json")
        self.add_span_ruler()

    def load_patterns(self, path: str) -> None:
        """
        Load patterns from a JSON file in spaCy pattern format.

        Parameters
        ----------
        path: path to the pattern JSON file
        """
        # Explicit encoding: pattern labels are German text and may contain
        # umlauts; relying on the platform default encoding is fragile.
        with open(file=path, mode="r", encoding="utf-8") as fp:
            self.patterns = json.load(fp=fp)

    def add_span_ruler(self) -> None:
        """
        Add a span ruler loaded with self.patterns to the nlp pipeline.

        With the component's default configuration, matches are written to
        doc.spans["ruler"], which is exactly what bulk_predict() reads.
        """
        # NOTE(review): the original built a config dict
        # ({"spans_key": None, "annotate_ents": True, "overwrite": False})
        # but never passed it to add_pipe. Passing it would redirect matches
        # into doc.ents instead of doc.spans["ruler"] and break
        # bulk_predict, so the dead variable was removed rather than wired in.
        ruler = self.nlp.add_pipe("span_ruler")
        ruler.add_patterns(self.patterns)

    def __call__(self, queries: list) -> list:
        """
        Convenience entry point; delegates to bulk_predict().
        """
        # Fixed: the return annotation was `-> None` although the call
        # clearly returns bulk_predict's list.
        return self.bulk_predict(queries=queries)

    def bulk_predict(self, queries: list) -> list:
        """
        Bulk-predict the concepts found in each query text.

        Parameters
        ----------
        queries: list of dictionaries with this structure:
            {"posting_id": uuid, "text": str}

        Returns
        ----------
        list of dictionaries with this structure:
            [{"posting_id": ..., "concept": ...}, {"posting_id": ..., "concept": ...}]
            One entry per found concept; a single entry with concept=None
            when nothing matched for a posting.
        """
        extractions = []

        for entry in queries:
            # Lower-case so the ruler patterns match case-insensitively;
            # NER is disabled since only the span ruler output is used.
            doc = self.nlp(entry["text"].lower(), disable=["ner"])
            concepts = [span.label_ for span in doc.spans["ruler"]]

            if not concepts:
                # No concept found for this posting.
                extractions.append({"posting_id": entry["posting_id"], "concept": None})
            else:
                # One output entry per found concept (duplicates are kept,
                # matching the original behavior).
                for concept in concepts:
                    extractions.append({"posting_id": entry["posting_id"], "concept": concept})

        return extractions
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "family-compatibility-extractor"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["bauerfriederike <141726622+bauerfriederike@users.noreply.github.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.11"
10
+ spacy = "^3.7.6"
11
+
12
+
13
+ [build-system]
14
+ requires = ["poetry-core"]
15
+ build-backend = "poetry.core.masonry.api"