# preprocessing.py — NegativeWordReplacer: replaces exaggerated/negative words
# in text with neutral alternatives, using a spaCy pipeline and an
# Excel-backed replacement dictionary.
import subprocess
import sys

import pandas as pd
import spacy
# Ensure the spaCy English model is installed before NegativeWordReplacer
# tries to load it below.
try:
    spacy.load("en_core_web_sm")
except OSError:
    # Use the current interpreter (sys.executable) rather than whatever
    # "python" happens to be on PATH, so the model is installed into the
    # active environment (venv/conda) that will actually load it.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
class NegativeWordReplacer:
    """Replace exaggerated/negative words in text with neutral alternatives.

    The replacement table is loaded from an Excel sheet with the columns
    'Exaggerated Word', 'POS', and 'Neutral Word'.
    """

    # Map spaCy coarse POS tags to the lowercase labels used in the sheet.
    _POS_MAPPING = {
        "NOUN": "noun",
        "VERB": "verb",
        "ADJ": "adjective",
        "ADV": "adverb",
    }

    def __init__(self, excel_path):
        """Load the spaCy model, then build the replacement dictionary.

        Args:
            excel_path: Path to the Excel sheet with the replacement table.
        """
        # nlp must exist before load_replacement_dict, which lemmatizes keys.
        self.nlp = spacy.load("en_core_web_sm")
        self.replacement_dict = self.load_replacement_dict(excel_path)

    def normalize_pos(self, pos):
        """Return the sheet-style POS label for a spaCy tag, or None if unmapped."""
        return self._POS_MAPPING.get(pos)

    def load_replacement_dict(self, excel_path):
        """Read the Excel sheet into a ``{(word, pos): neutral_word}`` dict.

        Each entry is stored twice — under the surface form and under its
        lemma — so lookups also succeed for inflected forms.
        """
        df = pd.read_excel(excel_path)
        replacement_dict = {}
        for _, row in df.iterrows():
            # Skip incomplete rows: str(NaN) would otherwise become the
            # literal "nan", polluting keys and even output sentences.
            if row[['Exaggerated Word', 'POS', 'Neutral Word']].isna().any():
                continue
            neg_word = str(row['Exaggerated Word']).lower().strip()
            pos_tag = str(row['POS']).lower().strip()
            neutral_word = str(row['Neutral Word']).lower().strip()
            lemma = self.nlp(neg_word)[0].lemma_
            replacement_dict[(neg_word, pos_tag)] = neutral_word
            replacement_dict[(lemma, pos_tag)] = neutral_word  # root form too
        return replacement_dict

    def replace_negative_words(self, sentence):
        """Return *sentence* with dictionary words swapped for neutral ones.

        Adjectives with no dictionary entry are removed entirely (treated
        as pure exaggeration with no neutral counterpart).

        NOTE(review): joining with " " detaches punctuation from the
        preceding word ("Hello ,") — confirm whether callers accept that.
        """
        doc = self.nlp(sentence)
        new_tokens = []
        for token in doc:
            lemma = token.lemma_.lower()
            pos = self.normalize_pos(token.pos_)
            word_lower = token.text.lower()  # dictionary keys are lowercase
            # An adjectival modifier ("amod") is treated as an adjective
            # regardless of the coarse tag spaCy assigned.
            if token.dep_ == "amod":
                pos = "adjective"
            if pos is None:
                # POS not covered by the sheet — keep the token untouched.
                new_tokens.append(token.text)
                continue
            # Try the surface form first, then fall back to the lemma.
            replacement = (self.replacement_dict.get((word_lower, pos))
                           or self.replacement_dict.get((lemma, pos)))
            if replacement:
                # Preserve the original word's leading capitalization.
                if token.text[0].isupper():
                    replacement = replacement.capitalize()
                new_tokens.append(replacement)
            elif pos == "adjective":
                continue  # drop unmatched (exaggerated) adjectives
            else:
                new_tokens.append(token.text)
        return " ".join(new_tokens)