asritha22bce committed on
Commit
bb6b974
·
verified ·
1 Parent(s): 07a8a78

Update preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +14 -6
preprocessing.py CHANGED
@@ -3,9 +3,9 @@ import pandas as pd
3
 
4
  class NegativeWordReplacer:
5
  def __init__(self, excel_path):
6
- """Initialize replacement dictionary from Excel file."""
 
7
  self.replacement_dict = self.load_replacement_dict(excel_path)
8
- self.nlp = spacy.load("en_core_web_sm") # Load spaCy model once
9
 
10
  def normalize_pos(self, pos):
11
  """Convert spaCy POS tags to match the Excel sheet."""
@@ -31,7 +31,7 @@ class NegativeWordReplacer:
31
  lemma = doc[0].lemma_
32
 
33
  replacement_dict[(neg_word, pos_tag)] = neutral_word
34
- replacement_dict[(lemma, pos_tag)] = neutral_word
35
 
36
  return replacement_dict
37
 
@@ -43,17 +43,25 @@ class NegativeWordReplacer:
43
  for token in doc:
44
  lemma = token.lemma_.lower()
45
  pos = self.normalize_pos(token.pos_)
 
46
 
47
- if token.pos_ == "VERB" and token.dep_ == "amod":
 
48
  pos = "adjective"
49
 
50
  if pos:
51
- replacement = self.replacement_dict.get((lemma, pos), None)
 
 
52
 
53
  if replacement:
 
 
 
 
54
  new_tokens.append(replacement)
55
  elif pos in ["adjective", "adverb"]:
56
- continue # Remove unnecessary exaggerated words
57
  else:
58
  new_tokens.append(token.text)
59
  else:
 
3
 
4
  class NegativeWordReplacer:
5
  def __init__(self, excel_path):
6
+ """Initialize spaCy NLP model and load replacement dictionary."""
7
+ self.nlp = spacy.load("en_core_web_sm") # ✅ Load spaCy before calling it
8
  self.replacement_dict = self.load_replacement_dict(excel_path)
 
9
 
10
  def normalize_pos(self, pos):
11
  """Convert spaCy POS tags to match the Excel sheet."""
 
31
  lemma = doc[0].lemma_
32
 
33
  replacement_dict[(neg_word, pos_tag)] = neutral_word
34
+ replacement_dict[(lemma, pos_tag)] = neutral_word # Store root word too
35
 
36
  return replacement_dict
37
 
 
43
  for token in doc:
44
  lemma = token.lemma_.lower()
45
  pos = self.normalize_pos(token.pos_)
46
+ word_lower = token.text.lower() # Convert to lowercase for lookup
47
 
48
+ # Fix: "amod" dependency only applies to adjectives
49
+ if token.dep_ == "amod":
50
  pos = "adjective"
51
 
52
  if pos:
53
+ # Check both lowercase word and its lemma
54
+ replacement = self.replacement_dict.get((word_lower, pos)) or \
55
+ self.replacement_dict.get((lemma, pos))
56
 
57
  if replacement:
58
+ # Keep original capitalization
59
+ if token.text[0].isupper():
60
+ replacement = replacement.capitalize()
61
+
62
  new_tokens.append(replacement)
63
  elif pos in ["adjective", "adverb"]:
64
+ continue # Remove exaggerated words
65
  else:
66
  new_tokens.append(token.text)
67
  else: