bauerfriederike committed on
Commit ·
750faf9
1
Parent(s): 92cc354
initial model
Browse files- README.md +1 -0
- data/2024-08-16_patterns_1192.json +0 -0
- pipeline.py +83 -0
- poetry.lock +0 -0
- pyproject.toml +15 -0
README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Frauenerwerbstätigkeit
|
data/2024-08-16_patterns_1192.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pipeline.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import spacy
|
| 3 |
+
from spacy.language import Language
|
| 4 |
+
|
| 5 |
+
class PipelineWrapper:
    """
    Pipeline wrapper for the project 'Frauenerwerbstätigkeit' with the Bertelsmann Foundation team.

    Loads pre-defined patterns from a JSON file and adds them to a span ruler in a spaCy
    pipeline. Calling an instance runs :meth:`bulk_predict` on the given queries.
    """

    # Key under which the span ruler stores its matches in ``doc.spans``.
    # ``add_span_ruler`` writes under this key and ``bulk_predict`` reads from it,
    # so both must always agree. "ruler" is also spaCy's default for the component.
    _SPANS_KEY = "ruler"

    def __init__(
        self,
        patterns_path: str = "data/2024-08-16_patterns_1192.json",
        model: str = "de_core_news_sm",
    ) -> None:
        """
        Load the spaCy model, read the patterns and register the span ruler.

        Parameters
        ----------
        patterns_path: path to the pattern JSON file; a relative path is resolved by
            ``open`` against the current working directory
        model: name of the spaCy pipeline package passed to ``spacy.load``
        """
        self.patterns = []
        self.nlp: Language = spacy.load(model)
        self.load_patterns(patterns_path)
        self.add_span_ruler()

    def load_patterns(self, path: str) -> None:
        """
        Load patterns from a JSON file in spaCy pattern format.

        The parsed content replaces ``self.patterns``.

        Parameters
        ----------
        path: path to pattern json file
        """
        # Explicit UTF-8: without it the platform's locale encoding is used, which
        # can mis-decode non-ASCII characters (e.g. German umlauts) in the patterns.
        with open(file=path, mode="r", encoding="utf-8") as fp:
            self.patterns = json.load(fp=fp)

    def add_span_ruler(self) -> None:
        """
        Add a span ruler holding ``self.patterns`` to the nlp pipeline.
        """
        # The spans key is passed explicitly so the key read in ``bulk_predict``
        # cannot silently drift from the one the ruler writes to.
        ruler = self.nlp.add_pipe("span_ruler", config={"spans_key": self._SPANS_KEY})
        ruler.add_patterns(self.patterns)

    def __call__(self, queries: list) -> list:
        """
        Call method for the pipeline; delegates to :meth:`bulk_predict`.
        """
        return self.bulk_predict(queries=queries)

    def bulk_predict(self, queries: list) -> list:
        """
        Bulk predicts the classes.

        Parameters
        ----------
        queries: list of dictionaries containing this structure:
            {"posting_id": uuid, "text": str}

        Returns
        ----------
        list of dictionaries containing this structure:
            [{"posting_id": ..., "concept": ...}, {"posting_id": ..., "concept": ...}]
        One entry is produced per matched span (duplicates are kept); a posting
        without any match yields a single entry whose "concept" is None.
        """
        extractions = []

        for entry in queries:
            # The text is lower-cased before matching — presumably the patterns are
            # stored in lower case; confirm against the pattern file.
            doc = self.nlp(entry["text"].lower(), disable=["ner"])
            labels = [span.label_ for span in doc.spans[self._SPANS_KEY]]
            posting_id = entry["posting_id"]

            if not labels:
                # no concept found
                extractions.append({"posting_id": posting_id, "concept": None})
                continue

            # one entry for each found concept
            extractions.extend(
                {"posting_id": posting_id, "concept": label} for label in labels
            )

        return extractions
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
|
| 2 |
+
name = "family-compatibility-extractor"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = ""
|
| 5 |
+
authors = ["bauerfriederike <141726622+bauerfriederike@users.noreply.github.com>"]
|
| 6 |
+
readme = "README.md"
|
| 7 |
+
|
| 8 |
+
[tool.poetry.dependencies]
|
| 9 |
+
python = "^3.11"
|
| 10 |
+
spacy = "^3.7.6"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
[build-system]
|
| 14 |
+
requires = ["poetry-core"]
|
| 15 |
+
build-backend = "poetry.core.masonry.api"
|