import ssl
import random
import warnings

import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer, util

warnings.filterwarnings("ignore", category=FutureWarning)

NLP_GLOBAL = spacy.load("en_core_web_sm")


def download_nltk_resources():
    """
    Download required NLTK resources if not already installed.
    """
    # Work around the SSL certificate errors that nltk.download() hits on
    # some systems (notably macOS) by falling back to an unverified context.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    resources = [
        'punkt', 'punkt_tab', 'wordnet',
        'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
    ]
    for resource in resources:
        try:
            nltk.download(resource, quiet=True)
        except Exception as e:
            print(f"Error downloading {resource}: {e}")


# TextHumanizer rewrites text in a more formal, academic register; the
# individual transformations are listed in the class docstring below.
class TextHumanizer:
    """
    Transforms text into a more formal (academic) style:
    - Expands contractions
    - Adds academic transitions
    - Optionally converts some sentences to passive voice
    - Optionally replaces words with synonyms for more formality
    """

    def __init__(
        self,
        model_name='paraphrase-MiniLM-L6-v2',
        p_passive=0.2,
        p_synonym_replacement=0.3,
        p_academic_transition=0.3,
        seed=None
    ):
        if seed is not None:
            random.seed(seed)
        # Reuse the pipeline loaded at module level rather than loading a
        # second copy of en_core_web_sm.
        self.nlp = NLP_GLOBAL
        self.model = SentenceTransformer(model_name)

        # Transformation probabilities
        self.p_passive = p_passive
        self.p_synonym_replacement = p_synonym_replacement
        self.p_academic_transition = p_academic_transition

        # Common academic transitions
        self.academic_transitions = [
            "Moreover,", "Additionally,", "Furthermore,", "Hence,",
            "Therefore,", "Consequently,", "Nonetheless,", "Nevertheless,",
        ]

    def humanize_text(self, text, use_passive=False, use_synonyms=False):
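        """
        Split `text` into sentences and rewrite each one: contractions are
        always expanded, while passive conversion and synonym replacement run
        only when enabled and when their configured probability fires.
        """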
        doc = self.nlp(text)
        transformed_sentences = []
        for sent in doc.sents:
            sentence_str = sent.text.strip()

            # 1. Expand contractions
            sentence_str = self.expand_contractions(sentence_str)

            # 2. Possibly add academic transitions
            # if random.random() < self.p_academic_transition:
            #     sentence_str = self.add_academic_transitions(sentence_str)

            # 3. Optionally convert to passive
            if use_passive and random.random() < self.p_passive:
                sentence_str = self.convert_to_passive(sentence_str)

            # 4. Optionally replace words with synonyms
            if use_synonyms and random.random() < self.p_synonym_replacement:
                sentence_str = self.replace_with_synonyms(sentence_str)

            transformed_sentences.append(sentence_str)
        return ' '.join(transformed_sentences)

    def expand_contractions(self, sentence):
        # word_tokenize splits contractions into separate tokens, e.g. "isn't"
        # -> ["is", "n't"], so each suffix below arrives as its own token.
        # (Known limitation: "can't" tokenizes to ["ca", "n't"] -> "ca not".)
        contraction_map = {
            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
            "'ve": " have", "'d": " would", "'m": " am"
        }
        tokens = word_tokenize(sentence)
        expanded_tokens = []
        for token in tokens:
            lower_token = token.lower()
            replaced = False
            for contraction, expansion in contraction_map.items():
                if lower_token.endswith(contraction):
                    # strip() drops the expansion's leading space, since the
                    # tokens are re-joined with single spaces below.
                    new_token = lower_token.replace(contraction, expansion).strip()
                    if token[0].isupper():
                        new_token = new_token.capitalize()
                    expanded_tokens.append(new_token)
                    replaced = True
                    break
            if not replaced:
                expanded_tokens.append(token)
        return ' '.join(expanded_tokens)

    def add_academic_transitions(self, sentence):
        transition = random.choice(self.academic_transitions)
        return f"{transition} {sentence}"

    def convert_to_passive(self, sentence):
        doc = self.nlp(sentence)
        subj_tokens = [t for t in doc if t.dep_ == 'nsubj' and t.head.dep_ == 'ROOT']
        dobj_tokens = [t for t in doc if t.dep_ == 'dobj']
        if subj_tokens and dobj_tokens:
            subject = subj_tokens[0]
            dobj = dobj_tokens[0]
            verb = subject.head
            if subject.i < verb.i < dobj.i:
                # Naive passive construction: choose the auxiliary from the
                # verb's tense and build a regular past participle. Irregular
                # verbs (e.g. "take" -> "taked") will be mis-inflected.
                aux = "was" if verb.tag_ in ("VBD", "VBN") else "is"
                participle = verb.lemma_ + ("d" if verb.lemma_.endswith("e") else "ed")
                passive_str = f"{dobj.text} {aux} {participle} by {subject.text}"
                original_str = ' '.join(token.text for token in doc)
                chunk = f"{subject.text} {verb.text} {dobj.text}"
                if chunk in original_str:
                    sentence = original_str.replace(chunk, passive_str)
        return sentence

    def replace_with_synonyms(self, sentence):
        tokens = word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        new_tokens = []
        for (word, pos) in pos_tags:
            # Only adjectives (J*), nouns (N*), verbs (V*), and adverbs (R*)
            # that WordNet knows about are candidates for replacement.
            if pos.startswith(('J', 'N', 'V', 'R')) and wordnet.synsets(word):
                if random.random() < 0.5:
                    synonyms = self._get_synonyms(word, pos)
                    if synonyms:
                        best_synonym = self._select_closest_synonym(word, synonyms)
                        new_tokens.append(best_synonym if best_synonym else word)
                    else:
                        new_tokens.append(word)
                else:
                    new_tokens.append(word)
            else:
                new_tokens.append(word)

        # Join cleanly with punctuation fix
        sentence = " ".join(new_tokens)
        sentence = (
            sentence.replace(" ,", ",")
            .replace(" .", ".")
            .replace(" !", "!")
            .replace(" ?", "?")
            .replace(" :", ":")
            .replace(" '", "'")
        )
        return sentence

    def _get_synonyms(self, word, pos):
        # Map Penn Treebank tag prefixes to WordNet POS constants; leaving
        # wn_pos as None makes wordnet.synsets() search every part of speech.
        wn_pos = None
        if pos.startswith('J'):
            wn_pos = wordnet.ADJ
        elif pos.startswith('N'):
            wn_pos = wordnet.NOUN
        elif pos.startswith('R'):
            wn_pos = wordnet.ADV
        elif pos.startswith('V'):
            wn_pos = wordnet.VERB

        synonyms = set()
        for syn in wordnet.synsets(word, pos=wn_pos):
            for lemma in syn.lemmas():
                lemma_name = lemma.name().replace('_', ' ')
                if lemma_name.lower() != word.lower():
                    synonyms.add(lemma_name)
        return list(synonyms)

    def _select_closest_synonym(self, original_word, synonyms):
        if not synonyms:
            return None
        # Embed the original word and every candidate, keep the most similar
        # candidate, and accept it only above a 0.5 cosine-similarity floor.
        original_emb = self.model.encode(original_word, convert_to_tensor=True)
        synonym_embs = self.model.encode(synonyms, convert_to_tensor=True)
        cos_scores = util.cos_sim(original_emb, synonym_embs)[0]
        max_score_index = cos_scores.argmax().item()
        max_score = cos_scores[max_score_index].item()
        if max_score >= 0.5:
            return synonyms[max_score_index]
        return None
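

# Minimal usage sketch (an assumption, not part of the original module): the
# sample sentence and seed are illustrative, and the exact output depends on
# the spaCy parse, WordNet coverage, and sentence-transformer scores.
if __name__ == "__main__":
    humanizer = TextHumanizer(seed=42)
    sample = "We can't ignore the results because they show a clear trend."
    print(humanizer.humanize_text(sample, use_passive=True, use_synonyms=True))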