| | from spacy.tokens import Doc, Span, Token |
| | from spacy.language import Language |
| | import pandas as pd |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
# Default aggregation strategy per entity label, in the shape
# {label: {'method': <strategy>}}. The strategy names used here ('first',
# 'join', 'expand') are the ones FeatureAggregatorComponent.features
# dispatches on. Each label gets its own inner dict, so editing one entry
# never affects another.
default_feature_aggregation_config = {
    label: {'method': method}
    for label, method in (
        ('INCH', 'first'),
        ('MOUNTING_FEATURE', 'join'),
        ('OS', 'first'),
        ('REFRESH_RATE', 'first'),
        ('RESOLUTION', 'first'),
        ('SOFTWARE_FEATURE', 'expand'),
        ('VIDEO_FEATURE', 'expand'),
        ('AUDIO_FEATURE', 'expand'),
        ('COLOR', 'join'),
        ('WIRELESS_FEATURE', 'expand'),
    )
}
| |
|
@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent(object):
    """Pipeline component exposing aggregated entity features on a ``Doc``.

    The component leaves the document untouched when called; all of its work
    happens lazily through the ``Doc`` extensions registered in ``__init__``:

    * ``doc._.raw_features`` -- un-aggregated features (see ``raw_features``)
    * ``doc._.features`` -- features aggregated per ``config``
    * ``doc._.add_to_dataframe(df)`` -- append the features as one row
    * ``doc._.feature_aggregation_config`` -- read / replace ``config``

    It relies on the ``count`` and ``text`` extension attributes of entity
    spans (``ent._.count`` / ``ent._.text``), which are not defined in this
    class and must be registered elsewhere.
    """

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the ``Doc`` extensions.

        Args:
            nlp: Pipeline object handed in by the factory; not used here.
            name: Component name handed in by the factory; not used here.
            config: Mapping ``label -> {'method': 'first' | 'join' | 'expand'}``.
                NOTE: the default is the shared module-level dict and is
                stored by reference, so mutating it in place affects every
                component that uses the default.
        """
        self.config = config
        # The extensions are bound to *this* instance's methods. force=True
        # means constructing another component re-registers them, pointing
        # them at the newest instance.
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return ``doc`` unchanged; features are computed on attribute access."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Extension getter: return the component-wide config (``doc`` is ignored)."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Extension setter: replace the component-wide config (``doc`` is ignored)."""
        self.config = config

    def _ordered_raw_features(self, doc):
        """Collect entity features while remembering first-seen order.

        Returns a dict whose values are either

        * a dict used as an ordered set of texts (keyed by ``ent.label_``),
          for entities whose ``ent._.count`` is ``None``; or
        * a running numeric total (keyed by ``ent._.text``), for entities
          that carry a count.
        """
        collected = {}
        for ent in doc.ents:
            count = ent._.count
            text = ent._.text
            if count is None:
                # dict keys give set semantics plus insertion order
                collected.setdefault(ent.label_, {})[text] = None
            else:
                collected[text] = collected.get(text, 0) + count
        return collected

    def raw_features(self, doc):
        """Return the un-aggregated features of ``doc``.

        Returns:
            dict mapping ``ent.label_`` to the ``set`` of entity texts seen
            under that label (entities without a count), and ``ent._.text``
            to the summed ``ent._.count`` (entities with a count).
        """
        return {
            key: set(value) if isinstance(value, dict) else value
            for key, value in self._ordered_raw_features(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated according to ``self.config``.

        For each label with a configured ``'method'``:

        * ``'first'``  -> the first text seen for that label in ``doc.ents``
          order (``nan`` if there is none);
        * ``'join'``   -> the texts joined with ``','`` in first-seen order;
        * ``'expand'`` -> one key per text, each with value ``1``.

        Labels without a config entry, without a ``'method'`` key, or with an
        unknown method keep their raw ``set`` of texts. Count totals are
        always passed through unchanged.
        """
        aggregated = {}
        for name, values in self._ordered_raw_features(doc).items():
            if not isinstance(values, dict):
                # Count totals are plain numbers; the set-based strategies
                # below do not apply to them even if the key happens to
                # match a configured label.
                aggregated[name] = values
                continue

            if name not in self.config or 'method' not in self.config[name]:
                aggregated[name] = set(values)
                continue

            texts = list(values)  # first-seen order
            method = self.config[name]["method"]
            if method == 'first':
                aggregated[name] = texts[0] if texts else float('nan')
            elif method == 'join':
                aggregated[name] = ','.join(texts)
            elif method == 'expand':
                for text in texts:
                    aggregated[text] = 1
            else:
                print(f"unknown feature aggregation method: {method}, skipping...")
                aggregated[name] = set(values)
        return aggregated

    def add_to_dataframe(self, doc, df):
        """Return ``df`` with the aggregated features of ``doc`` appended as one row.

        ``df`` itself is not modified. ``ignore_index`` is not passed to
        ``pd.concat``, so the result's index is not renumbered.
        """
        # Wrap every value in a list so pandas builds exactly one row, even
        # when a value is itself a collection (e.g. a pass-through set).
        row = {name: [value] for name, value in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(row)])
| |
|