Formalize handler process
Browse files- README.md +7 -0
- handler.py +12 -4
- src/{extract → dtos}/__init__.py +0 -0
- src/{format → dtos/input}/__init__.py +0 -0
- src/dtos/input/basic.py +4 -0
- src/{predict → dtos/output}/__init__.py +0 -0
- src/dtos/output/basic.py +17 -0
- src/extract/README.md +0 -1
- src/format.py +7 -0
- src/models/_shared/__init__.py +0 -0
- src/models/_shared/object/__init__.py +0 -0
- src/{predict → models/_shared/object}/output.py +0 -0
- src/{extract/core.py → models/bag_of_words/extractor.py} +0 -0
- src/{format/core.py → models/bag_of_words/formatter.py} +0 -0
- src/{predict/core.py → models/bag_of_words/predictor.py} +1 -1
README.md
CHANGED
|
@@ -1,3 +1,10 @@
|
|
| 1 |
# relevance-inference
|
| 2 |
|
| 3 |
A machine learning model to be synced to Hugging Face. For use in their Inference API to evaluate whether a URL is relevant.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# relevance-inference
|
| 2 |
|
| 3 |
A machine learning model to be synced to Hugging Face. For use in their Inference API to evaluate whether a URL is relevant.
|
| 4 |
+
|
| 5 |
+
# Common Files
|
| 6 |
+
|
| 7 |
+
- *model.py* - model container
|
| 8 |
+
- *extractor.py* - extracts relevant data from handler input
|
| 9 |
+
- *formatter.py* - formats data for model ingestion
|
| 10 |
+
- *predictor.py* - predicts relevance based on formatted data
|
handler.py
CHANGED
|
@@ -5,10 +5,13 @@ from environs import Env
|
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from joblib import load
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from src.models.bag_of_words.model import BagOfWordsModelContainer
|
| 9 |
-
from src.extract.core import BagOfWordsExtractor
|
| 10 |
-
from src.format.core import BagOfWordsFormatter
|
| 11 |
-
from src.predict.core import RelevancePredictor
|
| 12 |
|
| 13 |
SPACY_MODEL = spacy.load('en_core_web_trf', disable=['parser']) # Largest, slowest, most accurate model
|
| 14 |
|
|
@@ -19,6 +22,7 @@ class EndpointHandler:
|
|
| 19 |
env.read_env()
|
| 20 |
|
| 21 |
model_path = env.str("MODEL_PATH")
|
|
|
|
| 22 |
downloaded_model_path = hf_hub_download(
|
| 23 |
repo_id="PDAP/url-relevance-models",
|
| 24 |
subfolder=model_path,
|
|
@@ -34,4 +38,8 @@ class EndpointHandler:
|
|
| 34 |
bag_of_words = self.extractor.extract_bag_of_words(html)
|
| 35 |
csr = self.formatter.format_bag_of_words(bag_of_words)
|
| 36 |
output = self.predictor.predict_relevance(csr)
|
| 37 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from joblib import load
|
| 7 |
|
| 8 |
+
from src.dtos.output.basic import BasicOutput
|
| 9 |
+
|
| 10 |
+
from src.format import format_model_name_from_path
|
| 11 |
+
from src.models.bag_of_words.extractor import BagOfWordsExtractor
|
| 12 |
+
from src.models.bag_of_words.formatter import BagOfWordsFormatter
|
| 13 |
from src.models.bag_of_words.model import BagOfWordsModelContainer
|
| 14 |
+
from src.models.bag_of_words.predictor import RelevancePredictor
|
|
|
|
|
|
|
| 15 |
|
| 16 |
SPACY_MODEL = spacy.load('en_core_web_trf', disable=['parser']) # Largest, slowest, most accurate model
|
| 17 |
|
|
|
|
| 22 |
env.read_env()
|
| 23 |
|
| 24 |
model_path = env.str("MODEL_PATH")
|
| 25 |
+
self.model_name = format_model_name_from_path(model_path)
|
| 26 |
downloaded_model_path = hf_hub_download(
|
| 27 |
repo_id="PDAP/url-relevance-models",
|
| 28 |
subfolder=model_path,
|
|
|
|
| 38 |
bag_of_words = self.extractor.extract_bag_of_words(html)
|
| 39 |
csr = self.formatter.format_bag_of_words(bag_of_words)
|
| 40 |
output = self.predictor.predict_relevance(csr)
|
| 41 |
+
return BasicOutput(
|
| 42 |
+
annotation=output.is_relevant,
|
| 43 |
+
confidence=output.probability,
|
| 44 |
+
model=self.model_name
|
| 45 |
+
).model_dump(mode="json")
|
src/{extract → dtos}/__init__.py
RENAMED
|
File without changes
|
src/{format → dtos/input}/__init__.py
RENAMED
|
File without changes
|
src/dtos/input/basic.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
|
| 3 |
+
class BasicInput(BaseModel):
|
| 4 |
+
html: str
|
src/{predict → dtos/output}/__init__.py
RENAMED
|
File without changes
|
src/dtos/output/basic.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field, confloat
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class BasicOutput(BaseModel):
|
| 5 |
+
annotation: bool = Field(
|
| 6 |
+
description="Whether the annotation is relevant or not"
|
| 7 |
+
)
|
| 8 |
+
confidence: float = Field(
|
| 9 |
+
description="The confidence of the prediction",
|
| 10 |
+
ge=0,
|
| 11 |
+
le=1
|
| 12 |
+
)
|
| 13 |
+
model: str = Field(
|
| 14 |
+
# Characters with underscores, followed by two underscores, followed by a timestamp
|
| 15 |
+
pattern="^\w+__[\d\-\_]+$",
|
| 16 |
+
description="The model used to make the prediction"
|
| 17 |
+
)
|
src/extract/README.md
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
Directory for extracting data from raw HTML files
|
|
|
|
|
|
src/format.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
def format_model_name_from_path(path: str) -> str:
|
| 3 |
+
# Remove the `models` prefix
|
| 4 |
+
model_name = path.split("models/")[1]
|
| 5 |
+
# Replace slashes with double underscores
|
| 6 |
+
model_name = model_name.replace("/", "__")
|
| 7 |
+
return model_name
|
src/models/_shared/__init__.py
ADDED
|
File without changes
|
src/models/_shared/object/__init__.py
ADDED
|
File without changes
|
src/{predict → models/_shared/object}/output.py
RENAMED
|
File without changes
|
src/{extract/core.py → models/bag_of_words/extractor.py}
RENAMED
|
File without changes
|
src/{format/core.py → models/bag_of_words/formatter.py}
RENAMED
|
File without changes
|
src/{predict/core.py → models/bag_of_words/predictor.py}
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from scipy.sparse import csr_matrix
|
| 2 |
|
| 3 |
-
from src.predict.output import PredictionOutput
|
| 4 |
from types_ import FitPredictor
|
| 5 |
|
| 6 |
|
|
|
|
| 1 |
from scipy.sparse import csr_matrix
|
| 2 |
|
| 3 |
+
from src.models._shared.object.output import PredictionOutput
|
| 4 |
from types_ import FitPredictor
|
| 5 |
|
| 6 |
|