DIVYA-NSHU99 committed on
Commit
23345cd
·
verified ·
1 Parent(s): 049099a

Update app/src/linguistic.py

Browse files
Files changed (1) hide show
  1. app/src/linguistic.py +21 -8
app/src/linguistic.py CHANGED
@@ -1,29 +1,38 @@
1
  import spacy
 
2
  import json
3
  import os
4
- import math
5
  from collections import Counter
6
  from nltk import word_tokenize
7
  from nltk.corpus import wordnet
8
- from nltk.corpus.reader.wordnet import NOUN, ADJ, ADV, VERB
9
 
10
- # Load spaCy model (download if not present: python -m spacy download en_core_web_sm)
11
- nlp = spacy.load("en_core_web_sm")
12
-
13
- # Optional: load word frequency data (e.g., SUBTLEX frequency file)
14
- # If not available, we use a simple fallback (all words equally frequent).
15
  FREQ_DICT = {}
16
  FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
17
  if os.path.exists(FREQ_PATH):
18
  with open(FREQ_PATH, 'r') as f:
19
  FREQ_DICT = json.load(f)
20
 
 
21
  class LinguisticAnalyzer:
22
  """
23
  Extracts rich linguistic features from a trademark string.
24
  Features include POS tags, dependency relations, dictionary membership,
25
  word frequency, n‑gram overlap with goods description, and named entities.
26
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def __init__(self, descriptive_keywords_path=None):
29
  self.descriptive_keywords = {}
@@ -36,11 +45,13 @@ class LinguisticAnalyzer:
36
 
37
  def pos_tags(self, text):
38
  """Return list of (token, POS, detailed tag) using spaCy."""
 
39
  doc = nlp(text)
40
  return [(token.text, token.pos_, token.tag_) for token in doc]
41
 
42
  def dependency_relations(self, text):
43
  """Extract adjective‑noun and other modifier relations."""
 
44
  doc = nlp(text)
45
  modifiers = []
46
  for token in doc:
@@ -91,7 +102,7 @@ class LinguisticAnalyzer:
91
  """
92
  if not self.descriptive_keywords or not goods_class:
93
  return 0.0
94
- # Lemmatize mark words
95
  doc = nlp(mark)
96
  mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
97
  desc_words = set(self.descriptive_keywords.get(goods_class, []))
@@ -106,6 +117,7 @@ class LinguisticAnalyzer:
106
 
107
  def extract_entities(self, text):
108
  """Return list of named entities (PERSON, ORG, GPE, etc.)."""
 
109
  doc = nlp(text)
110
  return [(ent.text, ent.label_) for ent in doc.ents]
111
 
@@ -113,6 +125,7 @@ class LinguisticAnalyzer:
113
  """
114
  Main method: returns a dictionary of linguistic features.
115
  """
 
116
  doc = nlp(mark)
117
  tokens = [token.text.lower() for token in doc if token.is_alpha]
118
  if not tokens:
 
1
  import spacy
2
+ from spacy.cli import download
3
  import json
4
  import os
 
5
  from collections import Counter
6
  from nltk import word_tokenize
7
  from nltk.corpus import wordnet
 
8
 
9
# Optional word-frequency table. A missing file leaves the dict empty,
# which downstream code treats as "all words equally frequent".
FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
if os.path.exists(FREQ_PATH):
    with open(FREQ_PATH, 'r') as fh:
        FREQ_DICT = json.load(fh)
else:
    FREQ_DICT = {}
15
 
16
+
17
  class LinguisticAnalyzer:
18
  """
19
  Extracts rich linguistic features from a trademark string.
20
  Features include POS tags, dependency relations, dictionary membership,
21
  word frequency, n‑gram overlap with goods description, and named entities.
22
  """
23
+ _nlp = None
24
+
25
+ @classmethod
26
+ def _get_nlp(cls):
27
+ """Lazy load or download the spaCy model."""
28
+ if cls._nlp is None:
29
+ try:
30
+ cls._nlp = spacy.load("en_core_web_sm")
31
+ except OSError:
32
+ print("Downloading spaCy model 'en_core_web_sm'...")
33
+ download("en_core_web_sm")
34
+ cls._nlp = spacy.load("en_core_web_sm")
35
+ return cls._nlp
36
 
37
  def __init__(self, descriptive_keywords_path=None):
38
  self.descriptive_keywords = {}
 
45
 
46
  def pos_tags(self, text):
47
  """Return list of (token, POS, detailed tag) using spaCy."""
48
+ nlp = self.__class__._get_nlp()
49
  doc = nlp(text)
50
  return [(token.text, token.pos_, token.tag_) for token in doc]
51
 
52
  def dependency_relations(self, text):
53
  """Extract adjective‑noun and other modifier relations."""
54
+ nlp = self.__class__._get_nlp()
55
  doc = nlp(text)
56
  modifiers = []
57
  for token in doc:
 
102
  """
103
  if not self.descriptive_keywords or not goods_class:
104
  return 0.0
105
+ nlp = self.__class__._get_nlp()
106
  doc = nlp(mark)
107
  mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
108
  desc_words = set(self.descriptive_keywords.get(goods_class, []))
 
117
 
118
  def extract_entities(self, text):
119
  """Return list of named entities (PERSON, ORG, GPE, etc.)."""
120
+ nlp = self.__class__._get_nlp()
121
  doc = nlp(text)
122
  return [(ent.text, ent.label_) for ent in doc.ents]
123
 
 
125
  """
126
  Main method: returns a dictionary of linguistic features.
127
  """
128
+ nlp = self.__class__._get_nlp()
129
  doc = nlp(mark)
130
  tokens = [token.text.lower() for token in doc if token.is_alpha]
131
  if not tokens: