Spaces:

zArabi
/

Persian-Sentiment-Analysis

Runtime error

App Files Files Community

zArabi commited on Nov 6, 2022

Commit

3238f2f

1 Parent(s): 1a8dfbe

Update preprocessing.py

Browse files

Files changed (1) hide show

preprocessing.py +73 -0

preprocessing.py CHANGED Viewed

	@@ -0,0 +1,73 @@

+import hazm
+from cleantext import clean
+import regex as re
+def cleanhtml(raw_html):
+    cleanr = re.compile('<.*?>')
+    cleantext = re.sub(cleanr, '', raw_html)
+    return cleantext
+def cleaning(text):
+    text = text.strip()
+    # regular cleaning
+    # https://pypi.org/project/clean-text/ >> works well for eng and de languages
+    text = clean(text,
+        fix_unicode=True,
+        to_ascii=False,
+        lower=True,
+        no_line_breaks=True,
+        no_urls=True,
+        no_emails=True,
+        no_phone_numbers=True,
+        no_numbers=False,
+        no_digits=False,
+        no_currency_symbols=True,
+        no_punct=False, #Keep the punc
+        replace_with_url="",
+        replace_with_email="",
+        replace_with_phone_number="",
+        replace_with_number="",
+        replace_with_digit="0",
+        replace_with_currency_symbol="",
+    )
+    # cleaning htmls
+    text = cleanhtml(text)
+    # normalizing > https://github.com/sobhe/hazm
+    normalizer = hazm.Normalizer()
+    text = normalizer.normalize(text)
+    # removing wierd patterns
+    wierd_pattern = re.compile("["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        u"\U0001f926-\U0001f937"
+        u'\U00010000-\U0010ffff'
+        u"\u200d"
+        u"\u2640-\u2642"
+        u"\u2600-\u2B55"
+        u"\u23cf"
+        u"\u23e9"
+        u"\u231a"
+        u"\u3030"
+        u"\ufe0f"
+        u"\u2069"
+        u"\u2066"
+        # u"\u200c"
+        u"\u2068"
+        u"\u2067"
+        "]+", flags=re.UNICODE)
+    text = wierd_pattern.sub(r'', text)
+    # removing extra spaces, hashtags
+    text = re.sub("#", "", text)
+    text = re.sub("\s+", " ", text)
+    return text