Spaces:

zArabi
/

Persian-Sentiment-Analysis

Runtime error

App Files Files Community

zArabi committed on Nov 5, 2022

Commit

1635166

1 Parent(s): fdabac1

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -6

app.py CHANGED Viewed

@@ -1,12 +1,86 @@
 import gradio as gr
-from transformers import BertModel, BertConfig
 import torch.nn as nn
 import torch.nn.functional as F
 import huggingface_hub
 from huggingface_hub import hf_hub_download
 huggingface_hub.Repository = 'zArabi/Persian-Sentiment-Analysis'
 class SentimentModel(nn.Module):
     def __init__(self, config):
         super(SentimentModel, self).__init__()
@@ -34,16 +108,17 @@ config = BertConfig.from_pretrained(
     label2id=label2id)
 downloadedModelFile = hf_hub_download(repo_id="zArabi/Persian-Sentiment-Analysis", filename="persianModel")
-loaded_model = torch.load(downloadedModelFile)
-max_len=512
-pipeline = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
 def predict(text):
   text = cleaning(text)
   encoding = tokenizer.encode_plus(
-    sample_text,
     max_length=max_len,
     truncation=True,
     padding="max_length",
@@ -58,7 +133,7 @@ def predict(text):
   probs = F.softmax(outputs,dim=1)
   values, indices = torch.max(probs, dim=1)
   data = {
-    'comments': sample_text,
     'preds': indices.cpu().numpy()[0],
     'label': class_names[indices.cpu().numpy()[0]],
     'probablities': {class_names[i] : round(probs[0][i].item(),3) for i in range(len(probs[0]))}

 import gradio as gr
+from transformers import BertModel, BertConfig, BertTokenizer
+import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import huggingface_hub
 from huggingface_hub import hf_hub_download
+import hazm
+from cleantext import clean
+import regex as re
 huggingface_hub.Repository = 'zArabi/Persian-Sentiment-Analysis'
# Matches one markup tag: "<", any run of characters that are not ">", then
# ">".  Unlike "<.*?>", the negated class also matches line breaks, so a tag
# whose attributes wrap onto a second line is still removed.  Compiled once
# at import time instead of on every call.
_HTML_TAG_RE = re.compile(r"<[^>]*>")


def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag removed.

    Only the tag spans themselves are deleted; the text between tags is kept
    as is, and character entities such as ``&amp;`` are not decoded.

    Args:
        raw_html: String that may contain HTML/XML markup.

    Returns:
        The input string with all tag spans removed.
    """
    return _HTML_TAG_RE.sub("", raw_html)
# Characters stripped from the text: emoji, pictographs, dingbats and
# invisible directional / joiner marks.  Compiled once at import time.
# U+200C (zero-width non-joiner) is deliberately NOT in the class: it is part
# of ordinary Persian orthography.
# NOTE(review): U+24C2-U+1F251 and U+10000-U+10FFFF are very broad (the first
# also spans the CJK and Arabic presentation-form blocks).  They are kept
# exactly as they were because narrowing them would change the cleaned text
# -- confirm this matches the preprocessing the model was trained with.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)

# One or more whitespace characters.  Raw string: "\s" in a plain string
# literal is an invalid escape sequence that newer Python versions warn about.
_WHITESPACE_RE = re.compile(r"\s+")


def cleaning(text):
    """Clean a raw comment string before it is handed to the tokenizer.

    The steps run in this order:

    1. Strip leading/trailing whitespace.
    2. Run clean-text's ``clean`` with the options below (lower-casing,
       no line breaks, URLs / e-mails / phone numbers / currency symbols
       replaced by the empty string, punctuation and digits kept).
    3. Remove markup tags with ``cleanhtml``.
    4. Normalise the text with ``hazm.Normalizer``.
    5. Delete every character matched by ``_WEIRD_PATTERN``.
    6. Delete ``#`` characters and collapse each whitespace run to a single
       space.

    The result is not stripped again after step 6, so it can still start or
    end with a single space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # Regular cleaning.
    # https://pypi.org/project/clean-text/ >> works well for eng and de languages
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,  # keep the punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # Remove HTML tags.
    text = cleanhtml(text)

    # Normalise with hazm > https://github.com/sobhe/hazm
    text = hazm.Normalizer().normalize(text)

    # Remove emoji and invisible formatting characters.
    text = _WEIRD_PATTERN.sub("", text)

    # Remove hashtag signs, then collapse runs of whitespace.
    text = text.replace("#", "")
    return _WHITESPACE_RE.sub(" ", text)
 class SentimentModel(nn.Module):
     def __init__(self, config):
         super(SentimentModel, self).__init__()
     label2id=label2id)
 downloadedModelFile = hf_hub_download(repo_id="zArabi/Persian-Sentiment-Analysis", filename="persianModel")
+loaded_model = torch.load(downloadedModelFile,map_location="cpu")
+tokenizer = BertTokenizer.from_pretrained(modelName)
+max_len=512
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def predict(text):
   text = cleaning(text)
   encoding = tokenizer.encode_plus(
+    text,
     max_length=max_len,
     truncation=True,
     padding="max_length",
   probs = F.softmax(outputs,dim=1)
   values, indices = torch.max(probs, dim=1)
   data = {
+    'comments': text,
     'preds': indices.cpu().numpy()[0],
     'label': class_names[indices.cpu().numpy()[0]],
     'probablities': {class_names[i] : round(probs[0][i].item(),3) for i in range(len(probs[0]))}