Spaces:

DIVYA-NSHU99
/

disk

Sleeping

App Files Community

DIVYA-NSHU99 committed 11 days ago

Commit

23345cd

verified ·

1 Parent(s): 049099a

Update app/src/linguistic.py

Browse files

Files changed (1) hide show

app/src/linguistic.py +21 -8

app/src/linguistic.py CHANGED Viewed

@@ -1,29 +1,38 @@
 import spacy
 import json
 import os
-import math
 from collections import Counter
 from nltk import word_tokenize
 from nltk.corpus import wordnet
-from nltk.corpus.reader.wordnet import NOUN, ADJ, ADV, VERB
-# Load spaCy model (download if not present: python -m spacy download en_core_web_sm)
-nlp = spacy.load("en_core_web_sm")
-# Optional: load word frequency data (e.g., SUBTLEX frequency file)
-# If not available, we use a simple fallback (all words equally frequent).
 FREQ_DICT = {}
 FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
 if os.path.exists(FREQ_PATH):
     with open(FREQ_PATH, 'r') as f:
         FREQ_DICT = json.load(f)
 class LinguisticAnalyzer:
     """
     Extracts rich linguistic features from a trademark string.
     Features include POS tags, dependency relations, dictionary membership,
     word frequency, n‑gram overlap with goods description, and named entities.
     """
     def __init__(self, descriptive_keywords_path=None):
         self.descriptive_keywords = {}
@@ -36,11 +45,13 @@ class LinguisticAnalyzer:
     def pos_tags(self, text):
         """Return list of (token, POS, detailed tag) using spaCy."""
         doc = nlp(text)
         return [(token.text, token.pos_, token.tag_) for token in doc]
     def dependency_relations(self, text):
         """Extract adjective‑noun and other modifier relations."""
         doc = nlp(text)
         modifiers = []
         for token in doc:
@@ -91,7 +102,7 @@ class LinguisticAnalyzer:
         """
         if not self.descriptive_keywords or not goods_class:
             return 0.0
-        # Lemmatize mark words
         doc = nlp(mark)
         mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
         desc_words = set(self.descriptive_keywords.get(goods_class, []))
@@ -106,6 +117,7 @@ class LinguisticAnalyzer:
     def extract_entities(self, text):
         """Return list of named entities (PERSON, ORG, GPE, etc.)."""
         doc = nlp(text)
         return [(ent.text, ent.label_) for ent in doc.ents]
@@ -113,6 +125,7 @@ class LinguisticAnalyzer:
         """
         Main method: returns a dictionary of linguistic features.
         """
         doc = nlp(mark)
         tokens = [token.text.lower() for token in doc if token.is_alpha]
         if not tokens:

 import spacy
+from spacy.cli import download
 import json
 import os
 from collections import Counter
 from nltk import word_tokenize
 from nltk.corpus import wordnet
+# Optional: load word frequency data (if available)
 FREQ_DICT = {}
 FREQ_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'word_freq.json')
 if os.path.exists(FREQ_PATH):
     with open(FREQ_PATH, 'r') as f:
         FREQ_DICT = json.load(f)
 class LinguisticAnalyzer:
     """
     Extracts rich linguistic features from a trademark string.
     Features include POS tags, dependency relations, dictionary membership,
     word frequency, n‑gram overlap with goods description, and named entities.
     """
+    _nlp = None
+    @classmethod
+    def _get_nlp(cls):
+        """Lazy load or download the spaCy model."""
+        if cls._nlp is None:
+            try:
+                cls._nlp = spacy.load("en_core_web_sm")
+            except OSError:
+                print("Downloading spaCy model 'en_core_web_sm'...")
+                download("en_core_web_sm")
+                cls._nlp = spacy.load("en_core_web_sm")
+        return cls._nlp
     def __init__(self, descriptive_keywords_path=None):
         self.descriptive_keywords = {}
     def pos_tags(self, text):
         """Return list of (token, POS, detailed tag) using spaCy."""
+        nlp = self.__class__._get_nlp()
         doc = nlp(text)
         return [(token.text, token.pos_, token.tag_) for token in doc]
     def dependency_relations(self, text):
         """Extract adjective‑noun and other modifier relations."""
+        nlp = self.__class__._get_nlp()
         doc = nlp(text)
         modifiers = []
         for token in doc:
         """
         if not self.descriptive_keywords or not goods_class:
             return 0.0
+        nlp = self.__class__._get_nlp()
         doc = nlp(mark)
         mark_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}
         desc_words = set(self.descriptive_keywords.get(goods_class, []))
     def extract_entities(self, text):
         """Return list of named entities (PERSON, ORG, GPE, etc.)."""
+        nlp = self.__class__._get_nlp()
         doc = nlp(text)
         return [(ent.text, ent.label_) for ent in doc.ents]
         """
         Main method: returns a dictionary of linguistic features.
         """
+        nlp = self.__class__._get_nlp()
         doc = nlp(mark)
         tokens = [token.text.lower() for token in doc if token.is_alpha]
         if not tokens: