Zayed13
/

solutionfactory

Model card Files Files and versions

Zayed13 committed on Jun 23, 2023

Commit

5e28a48

·

1 Parent(s): a425f9a

Create Untitled0.ipynb

Files changed (1) hide show

Untitled0.ipynb +68 -0

Untitled0.ipynb ADDED Viewed

	@@ -0,0 +1,68 @@

+!pip install transformers
+import pandas as pd
+from wordcloud import WordCloud
+import seaborn as sns
+import re
+import string
+from collections import Counter, defaultdict
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+import plotly.express as px
+from plotly.subplots import make_subplots
+import plotly.graph_objects as go
+from plotly.offline import plot
+import matplotlib.gridspec as gridspec
+from matplotlib.ticker import MaxNLocator
+import matplotlib.patches as mpatches
+import matplotlib.pyplot as plt
+import warnings
+# Suppress every warning for the rest of the session.
+warnings.filterwarnings('ignore')
+import nltk
+# Fetch the NLTK 'stopwords' resource; this must run before stopwords.words()
+# is called below (presumably needs network access on first run - confirm).
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+# English stopword list materialized as a set.
+stopWords_nltk = set(stopwords.words('english'))
+import re
+from typing import Union, List
+class CleanText():
+    """ clearing text except digits () . , word character """
+    def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
+        self.clean_pattern =clean_pattern
+    def __call__(self, text: Union[str, list]) -> List[List[str]]:
+        if isinstance(text, str):
+            docs = [[text]]
+        if isinstance(text, list):
+            docs = text
+        text = [[re.sub(self.clean_pattern, " ", sent) for sent in sents] for sents in docs]
+        return text
+def remove_emoji(data):
+    emoj = re.compile("["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002500-\U00002BEF"
+        u"\U00002702-\U000027B0"
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        u"\U0001f926-\U0001f937"
+        u"\U00010000-\U0010ffff"
+        u"\u2640-\u2642"
+        u"\u2600-\u2B55"
+        u"\u200d"
+        u"\u23cf"
+        u"\u23e9"
+        u"\u231a"
+        u"\ufe0f"  # dingbats
+        u"\u3030"
+                      "]+", re.UNICODE)
+    return re.sub(emoj, '', data)