Spaces:
Sleeping
Sleeping
Update app/src/linguistic.py
Browse files- app/src/linguistic.py +21 -8
app/src/linguistic.py
CHANGED
|
@@ -1,29 +1,38 @@
|
|
| 1 |
import spacy
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
-
import math
|
| 5 |
from collections import Counter
|
| 6 |
from nltk import word_tokenize
|
| 7 |
from nltk.corpus import wordnet
|
| 8 |
-
from nltk.corpus.reader.wordnet import NOUN, ADJ, ADV, VERB
|
| 9 |
|
| 10 |
-
# Load the spaCy English model once, at module import time.
|
| 11 |
-
nlp = spacy.load("en_core_web_sm")
|
| 12 |
-
|
| 13 |
-
# Optional: load word frequency data (e.g., SUBTLEX frequency file)
|
| 14 |
-
# If not available, we use a simple fallback (all words equally frequent).
|
| 15 |
# Word-frequency table; stays empty when the optional JSON file is absent.
FREQ_DICT = {}
FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
if os.path.exists(FREQ_PATH):
    # JSON text is UTF-8; pin the encoding so decoding does not depend on the
    # platform's locale default.
    with open(FREQ_PATH, 'r', encoding='utf-8') as f:
        FREQ_DICT = json.load(f)
|
| 20 |
|
|
|
|
| 21 |
class LinguisticAnalyzer:
|
| 22 |
"""
|
| 23 |
Extracts rich linguistic features from a trademark string.
|
| 24 |
Features include POS tags, dependency relations, dictionary membership,
|
| 25 |
word frequency, n‑gram overlap with goods description, and named entities.
|
| 26 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def __init__(self, descriptive_keywords_path=None):
|
| 29 |
self.descriptive_keywords = {}
|
|
@@ -36,11 +45,13 @@ class LinguisticAnalyzer:
|
|
| 36 |
|
| 37 |
def pos_tags(self, text):
    """Tag each token of ``text`` with the module-level spaCy pipeline.

    Returns a list of ``(token text, POS, detailed tag)`` tuples, one per
    token, in document order.
    """
    tagged = []
    for tok in nlp(text):
        tagged.append((tok.text, tok.pos_, tok.tag_))
    return tagged
|
| 41 |
|
| 42 |
def dependency_relations(self, text):
|
| 43 |
"""Extract adjective‑noun and other modifier relations."""
|
|
|
|
| 44 |
doc = nlp(text)
|
| 45 |
modifiers = []
|
| 46 |
for token in doc:
|
|
@@ -91,7 +102,7 @@ class LinguisticAnalyzer:
|
|
| 91 |
"""
|
| 92 |
if not self.descriptive_keywords or not goods_class:
|
| 93 |
return 0.0
|
| 94 |
-
|
| 95 |
doc = nlp(mark)
|
| 96 |
mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
|
| 97 |
desc_words = set(self.descriptive_keywords.get(goods_class, []))
|
|
@@ -106,6 +117,7 @@ class LinguisticAnalyzer:
|
|
| 106 |
|
| 107 |
def extract_entities(self, text):
    """Collect the named entities spaCy finds in ``text``.

    Returns a list of ``(entity text, label)`` tuples, where the label is a
    spaCy entity type such as PERSON, ORG or GPE.
    """
    entities = []
    for span in nlp(text).ents:
        entities.append((span.text, span.label_))
    return entities
|
| 111 |
|
|
@@ -113,6 +125,7 @@ class LinguisticAnalyzer:
|
|
| 113 |
"""
|
| 114 |
Main method: returns a dictionary of linguistic features.
|
| 115 |
"""
|
|
|
|
| 116 |
doc = nlp(mark)
|
| 117 |
tokens = [token.text.lower() for token in doc if token.is_alpha]
|
| 118 |
if not tokens:
|
|
|
|
| 1 |
import spacy
|
| 2 |
+
from spacy.cli import download
|
| 3 |
import json
|
| 4 |
import os
|
|
|
|
| 5 |
from collections import Counter
|
| 6 |
from nltk import word_tokenize
|
| 7 |
from nltk.corpus import wordnet
|
|
|
|
| 8 |
|
| 9 |
+
# Optional: load word frequency data (if available)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
# Word-frequency table; stays empty when the optional JSON file is absent.
FREQ_DICT = {}
FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
if os.path.exists(FREQ_PATH):
    # JSON text is UTF-8; pin the encoding so decoding does not depend on the
    # platform's locale default.
    with open(FREQ_PATH, 'r', encoding='utf-8') as f:
        FREQ_DICT = json.load(f)
|
| 15 |
|
| 16 |
+
|
| 17 |
class LinguisticAnalyzer:
|
| 18 |
"""
|
| 19 |
Extracts rich linguistic features from a trademark string.
|
| 20 |
Features include POS tags, dependency relations, dictionary membership,
|
| 21 |
word frequency, n‑gram overlap with goods description, and named entities.
|
| 22 |
"""
|
| 23 |
+
_nlp = None
|
| 24 |
+
|
| 25 |
+
@classmethod
def _get_nlp(cls):
    """Return the shared spaCy pipeline, loading it on first use.

    The loaded ``en_core_web_sm`` pipeline is cached on the class attribute
    ``_nlp``, so later calls return the cached object without reloading.
    If ``spacy.load`` raises ``OSError``, the model is fetched with
    ``spacy.cli.download`` and the load is retried once; an error from the
    retry propagates to the caller.
    """
    if cls._nlp is not None:
        return cls._nlp

    model_name = "en_core_web_sm"
    try:
        cls._nlp = spacy.load(model_name)
    except OSError:
        # Local import: keeps this block independent of the file's import list.
        import importlib

        print("Downloading spaCy model 'en_core_web_sm'...")
        download(model_name)
        # The model package was installed after the interpreter started;
        # refresh the import system's caches so the retry can see it.
        importlib.invalidate_caches()
        cls._nlp = spacy.load(model_name)
    return cls._nlp
|
| 36 |
|
| 37 |
def __init__(self, descriptive_keywords_path=None):
|
| 38 |
self.descriptive_keywords = {}
|
|
|
|
| 45 |
|
| 46 |
def pos_tags(self, text):
    """Tag each token of ``text`` with the lazily loaded spaCy pipeline.

    Returns a list of ``(token text, POS, detailed tag)`` tuples, one per
    token, in document order.
    """
    pipeline = self.__class__._get_nlp()
    tagged = []
    for tok in pipeline(text):
        tagged.append((tok.text, tok.pos_, tok.tag_))
    return tagged
|
| 51 |
|
| 52 |
def dependency_relations(self, text):
|
| 53 |
"""Extract adjective‑noun and other modifier relations."""
|
| 54 |
+
nlp = self.__class__._get_nlp()
|
| 55 |
doc = nlp(text)
|
| 56 |
modifiers = []
|
| 57 |
for token in doc:
|
|
|
|
| 102 |
"""
|
| 103 |
if not self.descriptive_keywords or not goods_class:
|
| 104 |
return 0.0
|
| 105 |
+
nlp = self.__class__._get_nlp()
|
| 106 |
doc = nlp(mark)
|
| 107 |
mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
|
| 108 |
desc_words = set(self.descriptive_keywords.get(goods_class, []))
|
|
|
|
| 117 |
|
| 118 |
def extract_entities(self, text):
    """Collect the named entities spaCy finds in ``text``.

    Returns a list of ``(entity text, label)`` tuples, where the label is a
    spaCy entity type such as PERSON, ORG or GPE.
    """
    pipeline = self.__class__._get_nlp()
    entities = []
    for span in pipeline(text).ents:
        entities.append((span.text, span.label_))
    return entities
|
| 123 |
|
|
|
|
| 125 |
"""
|
| 126 |
Main method: returns a dictionary of linguistic features.
|
| 127 |
"""
|
| 128 |
+
nlp = self.__class__._get_nlp()
|
| 129 |
doc = nlp(mark)
|
| 130 |
tokens = [token.text.lower() for token in doc if token.is_alpha]
|
| 131 |
if not tokens:
|