initial model

Files changed (5) hide show

README.md +1 -0
data/2024-08-16_patterns_1192.json +0 -0
pipeline.py +83 -0
poetry.lock +0 -0
pyproject.toml +15 -0

README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Frauenerwerbstätigkeit

data/2024-08-16_patterns_1192.json ADDED Viewed

The diff for this file is too large to render. See raw diff

pipeline.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import json
+import spacy
+from spacy.language import Language
class PipelineWrapper:
    """
    Pipeline Wrapper for the project 'Frauenerwerbstätigkeit' with the Bertelsmann Foundation team.
    Loads pre-defined patterns from a json file and adds them to a span ruler in a spacy pipeline

    Calling an instance is equivalent to calling ``bulk_predict``.
    """

    # Defaults used when the wrapper is constructed without arguments.
    DEFAULT_MODEL = "de_core_news_sm"
    DEFAULT_PATTERNS_PATH = "data/2024-08-16_patterns_1192.json"
    # Key under which the span ruler stores its matches in ``doc.spans``.
    # It is shared by ``add_span_ruler`` (writer) and ``bulk_predict`` (reader)
    # so the two cannot drift apart.
    SPANS_KEY = "ruler"

    def __init__(
        self,
        patterns_path: str = DEFAULT_PATTERNS_PATH,
        model_name: str = DEFAULT_MODEL,
    ) -> None:
        """
        Load the spacy model, read the patterns and register the span ruler

        Parameters
        ----------
        patterns_path: path to the pattern json file
        model_name: name of the spacy model passed to ``spacy.load``
        """
        self.patterns = []
        self.nlp: Language = spacy.load(model_name)
        self.load_patterns(patterns_path)
        self.add_span_ruler()

    def load_patterns(self, path: str) -> None:
        """
        load patterns from json file in spacy pattern format

        The parsed content is stored in ``self.patterns``.

        Parameters
        ----------
        path: path to pattern json file
        """
        # Explicit UTF-8: without it the platform locale decides how the file
        # is decoded, which can corrupt non-ASCII characters such as umlauts.
        with open(file=path, mode="r", encoding="utf-8") as fp:
            self.patterns = json.load(fp=fp)

    def add_span_ruler(self) -> None:
        """
        Add a span ruler to the nlp pipeline and feed it ``self.patterns``
        """
        # The spans key is passed explicitly because ``bulk_predict`` reads the
        # matches back from ``doc.spans[SPANS_KEY]``.
        ruler = self.nlp.add_pipe("span_ruler", config={"spans_key": self.SPANS_KEY})
        ruler.add_patterns(self.patterns)

    def __call__(self, queries: list) -> list:
        """
        call method for pipeline

        Delegates to ``bulk_predict`` and returns its result.
        """
        return self.bulk_predict(queries=queries)

    def bulk_predict(self, queries: list) -> list:
        """
        Bulk predicts the classes

        Parameters
        ----------
        queries: list of dictionaries containing this structure:
        {"posting_id": uuid, "text": str}

        Returns
        ----------
        list of dictionaries containing this structure:
        [{"posting_id": , "concept": , }, {"posting_id": , "concept": ...}]
        One dictionary is emitted per matched span (duplicates are kept);
        a query without any match yields a single dictionary whose
        "concept" is None.
        """
        extractions = []
        for entry in queries:
            posting_id = entry["posting_id"]
            # The text is lower-cased before matching; presumably the patterns
            # are written in lower case -- confirm against the pattern file.
            doc = self.nlp(entry["text"].lower(), disable=["ner"])
            labels = [span.label_ for span in doc.spans[self.SPANS_KEY]]
            if not labels:
                # no concept found
                extractions.append({"posting_id": posting_id, "concept": None})
                continue
            # one entry for each found concept
            extractions.extend(
                {"posting_id": posting_id, "concept": label} for label in labels
            )
        return extractions

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,15 @@

+[tool.poetry]
+name = "family-compatibility-extractor"
+version = "0.1.0"
+description = ""
+authors = ["bauerfriederike <141726622+bauerfriederike@users.noreply.github.com>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "^3.11"
+spacy = "^3.7.6"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"