# preprocessing.py — NegativeWordReplacer: replaces exaggerated/negative words
# in text with neutral alternatives, using a spaCy pipeline and an
# Excel-backed replacement dictionary.
import subprocess
import sys

import pandas as pd
import spacy
# Ensure the spaCy English model is installed before NegativeWordReplacer
# tries to load it below.
try:
    spacy.load("en_core_web_sm")
except OSError:
    # Use the current interpreter (sys.executable) rather than whatever
    # "python" happens to be on PATH, so the model is installed into the
    # active environment (venv/conda) that will actually load it.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
class NegativeWordReplacer:
    """Replace exaggerated/negative words in text with neutral alternatives.

    The replacement table is loaded from an Excel sheet with the columns
    'Exaggerated Word', 'POS', and 'Neutral Word'.
    """

    # Map spaCy coarse POS tags to the lowercase labels used in the sheet.
    _POS_MAPPING = {
        "NOUN": "noun",
        "VERB": "verb",
        "ADJ": "adjective",
        "ADV": "adverb",
    }

    def __init__(self, excel_path):
        """Load the spaCy model, then build the replacement dictionary.

        Args:
            excel_path: Path to the Excel sheet with the replacement table.
        """
        # nlp must exist before load_replacement_dict, which lemmatizes keys.
        self.nlp = spacy.load("en_core_web_sm")
        self.replacement_dict = self.load_replacement_dict(excel_path)

    def normalize_pos(self, pos):
        """Return the sheet-style POS label for a spaCy tag, or None if unmapped."""
        return self._POS_MAPPING.get(pos)

    def load_replacement_dict(self, excel_path):
        """Read the Excel sheet into a ``{(word, pos): neutral_word}`` dict.

        Each entry is stored twice — under the surface form and under its
        lemma — so lookups also succeed for inflected forms.
        """
        df = pd.read_excel(excel_path)
        replacement_dict = {}
        for _, row in df.iterrows():
            # Skip incomplete rows: str(NaN) would otherwise become the
            # literal "nan", polluting keys and even output sentences.
            if row[['Exaggerated Word', 'POS', 'Neutral Word']].isna().any():
                continue
            neg_word = str(row['Exaggerated Word']).lower().strip()
            pos_tag = str(row['POS']).lower().strip()
            neutral_word = str(row['Neutral Word']).lower().strip()
            lemma = self.nlp(neg_word)[0].lemma_
            replacement_dict[(neg_word, pos_tag)] = neutral_word
            replacement_dict[(lemma, pos_tag)] = neutral_word  # root form too
        return replacement_dict

    def replace_negative_words(self, sentence):
        """Return *sentence* with dictionary words swapped for neutral ones.

        Adjectives with no dictionary entry are removed entirely (treated
        as pure exaggeration with no neutral counterpart).

        NOTE(review): joining with " " detaches punctuation from the
        preceding word ("Hello ,") — confirm whether callers accept that.
        """
        doc = self.nlp(sentence)
        new_tokens = []
        for token in doc:
            lemma = token.lemma_.lower()
            pos = self.normalize_pos(token.pos_)
            word_lower = token.text.lower()  # dictionary keys are lowercase
            # An adjectival modifier ("amod") is treated as an adjective
            # regardless of the coarse tag spaCy assigned.
            if token.dep_ == "amod":
                pos = "adjective"
            if pos is None:
                # POS not covered by the sheet — keep the token untouched.
                new_tokens.append(token.text)
                continue
            # Try the surface form first, then fall back to the lemma.
            replacement = (self.replacement_dict.get((word_lower, pos))
                           or self.replacement_dict.get((lemma, pos)))
            if replacement:
                # Preserve the original word's leading capitalization.
                if token.text[0].isupper():
                    replacement = replacement.capitalize()
                new_tokens.append(replacement)
            elif pos == "adjective":
                continue  # drop unmatched (exaggerated) adjectives
            else:
                new_tokens.append(token.text)
        return " ".join(new_tokens)