Spaces:
Build error
Build error
Update preprocessing.py
Browse files- preprocessing.py +14 -6
preprocessing.py
CHANGED
|
@@ -3,9 +3,9 @@ import pandas as pd
|
|
| 3 |
|
| 4 |
class NegativeWordReplacer:
|
| 5 |
def __init__(self, excel_path):
|
| 6 |
-
"""Initialize
|
|
|
|
| 7 |
self.replacement_dict = self.load_replacement_dict(excel_path)
|
| 8 |
-
self.nlp = spacy.load("en_core_web_sm") # Load spaCy model once
|
| 9 |
|
| 10 |
def normalize_pos(self, pos):
|
| 11 |
"""Convert spaCy POS tags to match the Excel sheet."""
|
|
@@ -31,7 +31,7 @@ class NegativeWordReplacer:
|
|
| 31 |
lemma = doc[0].lemma_
|
| 32 |
|
| 33 |
replacement_dict[(neg_word, pos_tag)] = neutral_word
|
| 34 |
-
replacement_dict[(lemma, pos_tag)] = neutral_word
|
| 35 |
|
| 36 |
return replacement_dict
|
| 37 |
|
|
@@ -43,17 +43,25 @@ class NegativeWordReplacer:
|
|
| 43 |
for token in doc:
|
| 44 |
lemma = token.lemma_.lower()
|
| 45 |
pos = self.normalize_pos(token.pos_)
|
|
|
|
| 46 |
|
| 47 |
-
|
|
|
|
| 48 |
pos = "adjective"
|
| 49 |
|
| 50 |
if pos:
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
|
| 53 |
if replacement:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
new_tokens.append(replacement)
|
| 55 |
elif pos in ["adjective", "adverb"]:
|
| 56 |
-
continue # Remove
|
| 57 |
else:
|
| 58 |
new_tokens.append(token.text)
|
| 59 |
else:
|
|
|
|
| 3 |
|
| 4 |
class NegativeWordReplacer:
|
| 5 |
def __init__(self, excel_path):
|
| 6 |
+
"""Initialize spaCy NLP model and load replacement dictionary."""
|
| 7 |
+
self.nlp = spacy.load("en_core_web_sm") # ✅ Load spaCy before calling it
|
| 8 |
self.replacement_dict = self.load_replacement_dict(excel_path)
|
|
|
|
| 9 |
|
| 10 |
def normalize_pos(self, pos):
|
| 11 |
"""Convert spaCy POS tags to match the Excel sheet."""
|
|
|
|
| 31 |
lemma = doc[0].lemma_
|
| 32 |
|
| 33 |
replacement_dict[(neg_word, pos_tag)] = neutral_word
|
| 34 |
+
replacement_dict[(lemma, pos_tag)] = neutral_word # Store root word too
|
| 35 |
|
| 36 |
return replacement_dict
|
| 37 |
|
|
|
|
| 43 |
for token in doc:
|
| 44 |
lemma = token.lemma_.lower()
|
| 45 |
pos = self.normalize_pos(token.pos_)
|
| 46 |
+
word_lower = token.text.lower() # Convert to lowercase for lookup
|
| 47 |
|
| 48 |
+
# ✅ Fix: "amod" dependency only applies to adjectives
|
| 49 |
+
if token.dep_ == "amod":
|
| 50 |
pos = "adjective"
|
| 51 |
|
| 52 |
if pos:
|
| 53 |
+
# ✅ Check both lowercase word and its lemma
|
| 54 |
+
replacement = self.replacement_dict.get((word_lower, pos)) or \
|
| 55 |
+
self.replacement_dict.get((lemma, pos))
|
| 56 |
|
| 57 |
if replacement:
|
| 58 |
+
# Keep original capitalization
|
| 59 |
+
if token.text[0].isupper():
|
| 60 |
+
replacement = replacement.capitalize()
|
| 61 |
+
|
| 62 |
new_tokens.append(replacement)
|
| 63 |
elif pos in ["adjective", "adverb"]:
|
| 64 |
+
continue # Remove exaggerated words
|
| 65 |
else:
|
| 66 |
new_tokens.append(token.text)
|
| 67 |
else:
|