Create Untitled0.ipynb
Browse files- Untitled0.ipynb +68 -0
Untitled0.ipynb
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
!pip install transformers
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from wordcloud import WordCloud
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
import re
|
| 6 |
+
import string
|
| 7 |
+
from collections import Counter, defaultdict
|
| 8 |
+
|
| 9 |
+
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
| 10 |
+
|
| 11 |
+
import plotly.express as px
|
| 12 |
+
from plotly.subplots import make_subplots
|
| 13 |
+
import plotly.graph_objects as go
|
| 14 |
+
from plotly.offline import plot
|
| 15 |
+
|
| 16 |
+
import matplotlib.gridspec as gridspec
|
| 17 |
+
from matplotlib.ticker import MaxNLocator
|
| 18 |
+
import matplotlib.patches as mpatches
|
| 19 |
+
import matplotlib.pyplot as plt
|
| 20 |
+
import warnings
|
| 21 |
+
warnings.filterwarnings('ignore')
|
| 22 |
+
import nltk
|
| 23 |
+
nltk.download('stopwords')
|
| 24 |
+
from nltk.corpus import stopwords
|
| 25 |
+
stopWords_nltk = set(stopwords.words('english'))
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
import re
|
| 29 |
+
from typing import Union, List
|
| 30 |
+
class CleanText():
    """Replace every character outside an allow-list with a space.

    The default pattern keeps ASCII letters, the Turkish letters listed in
    the character class, digits, and the punctuation ``. , " ' ( )``.
    Every other character (including whitespace) is replaced one-for-one
    by a single space, so string lengths are preserved.

    Args:
        clean_pattern: Regular expression matching the characters to blank
            out. Stored unchanged on ``self.clean_pattern``.
    """

    def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
        self.clean_pattern = clean_pattern

    def __call__(self, text: Union[str, list]) -> List[List[str]]:
        """Clean a string or a list of documents.

        Args:
            text: Either a single string, or a list whose elements are
                documents. A document is an iterable of sentence strings;
                a bare string element is treated as a one-sentence
                document.

        Returns:
            A list of documents, each a list of cleaned sentence strings.
            A single input string yields ``[[cleaned]]``.

        Raises:
            TypeError: If ``text`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(text, str):
            docs = [[text]]
        elif isinstance(text, list):
            # A bare string inside the list would otherwise be iterated
            # character by character; wrap it as a one-sentence document.
            docs = [[doc] if isinstance(doc, str) else doc for doc in text]
        else:
            # Previously this fell through and failed with an
            # UnboundLocalError on `docs`.
            raise TypeError(
                "text must be a str or a list, got " + type(text).__name__
            )

        return [
            [re.sub(self.clean_pattern, " ", sent) for sent in sents]
            for sents in docs
        ]
|
| 47 |
+
# Compiled once at import time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F926-\U0001F937"
    # Sweep of every supplementary-plane code point; it subsumes the
    # astral ranges above, which are kept for readability.
    "\U00010000-\U0010FFFF"
    "\u2500-\u2BEF"  # box drawing through misc symbols and arrows
    "\u2702-\u27B0"  # dingbats
    "\u2640-\u2642"
    "\u2600-\u2B55"
    # The original range U+24C2-U+1F251 also covered all CJK, kana,
    # Hangul and fullwidth text. Only the BMP emoji it was meant to
    # reach are listed individually here.
    "\u24C2"  # circled M
    "\u303D"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "\u200d"  # zero-width joiner used in emoji sequences
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"
    "]+",
    re.UNICODE,
)


def remove_emoji(data):
    """Return ``data`` with emoji and pictographic symbols removed.

    Matching characters are deleted, not replaced, so surrounding
    whitespace is left as it was. Ordinary text in any script encoded in
    the Basic Multilingual Plane (Latin, Turkish, CJK, Hangul, ...) is
    preserved; every supplementary-plane character is removed.

    Args:
        data: The string to clean.

    Returns:
        The string with all characters matched by the emoji pattern
        stripped out.
    """
    return _EMOJI_PATTERN.sub('', data)
|