import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import re
from pythainlp.util import normalize
from pythainlp.tokenize import word_tokenize
from pythainlp import word_vector
import numpy as np
import keras
import plotly.express as px
#################
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import chromedriver_autoinstaller
import sys
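# Leftover Colab-style path tweak; redundant once chromedriver_autoinstaller
# installs a matching driver below, but harmless.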
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Install a chromedriver matching the local Chrome/Chromium version
chromedriver_autoinstaller.install()

wv = word_vector.WordVector()
word2vec = wv.get_model()
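# word2vec is a gensim KeyedVectors of pretrained Thai word embeddings bundled
# with PyThaiNLP; its key_to_index mapping is used for lookups below.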

model = keras.models.load_model('my_model3.h5')
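# Assumed: the model takes 128-token index sequences and outputs 3 class
# scores ordered (Neg, Neu, Pos), matching pretty_output() below.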

def get_comments(VIDEO_URL):
    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    # Your scraping code here
    #VIDEO_URL = 'https://www.youtube.com/watch?v=VIDEO_ID'
    driver.get(VIDEO_URL)

    # Wait for the comments to load
    time.sleep(5)

    # Scroll down to load more comments (optional, repeat as needed)
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(2)

    # Find and print comments
    comment_elements = driver.find_elements(By.XPATH, '//yt-formatted-string[@id="content-text"]')
    data = []
    for comment in comment_elements:
        # comment is a WebElement, so compare its text (the original compared
        # the element itself to '', which is always true).
        text = comment.text.strip()
        if text:
            data.append(text)
            print(text)


    # Close the WebDriver
    driver.quit()

    return data
def cosine_sim(u, v):
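    # Cosine similarity between two vectors; a small utility that the rest of
    # the app does not currently call.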
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def sentences_to_indices(X, word2vec, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()`.

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word2vec -- a trained Word2Vec model from gensim
    max_len -- maximum number of words in a sentence; longer sentences are truncated.

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """

    m = X.shape[0]                                   # number of training examples

    # Initialize X_indices as a numpy matrix of zeros and the correct shape
    X_indices = np.zeros((m, max_len))

    for i in range(m):                               # loop over training examples

        # Convert the ith sentence to lower case and split it into words,
        # truncated to max_len tokens.
        sentence_words = X[i].lower().split()[:max_len]

        # Initialize j to 0
        j = 0
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i, j)th entry of X_indices to the index of the word;
            # out-of-vocabulary words are skipped, so no try/except is needed.
            if w in word2vec.key_to_index:
                X_indices[i, j] = word2vec.key_to_index[w]
                j += 1


    return X_indices
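# Note: unfilled slots in X_indices stay 0, which implicitly treats index 0 as
# padding; in gensim vocabularies index 0 is usually a real (frequent) word,
# so this is an assumption inherited from how the model was trained.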


def deEmojify(text):
    # Strip emoji and other pictographic symbols before normalization.
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)


def clean_me(data):
    # Strip HTML tags.
    data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
    # Trim, lower-case, and collapse carriage returns / newlines / tabs
    # (regex=True is required for these patterns on current pandas).
    data['clean2_text'] = (data['clean_text'].str.strip().str.lower()
                           .str.replace(r'\r+', ' ', regex=True)
                           .str.replace(r'\n+', ' ', regex=True)
                           .str.replace(r'\t+', ' ', regex=True))
    data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
    # Normalize text
    data['clean4_text'] = data.apply(lambda row: normalize(row['clean3_text']), axis=1)
    # Word segmentation: it will take a while....
    data['wordseged_text'] = data.apply(lambda row: word_tokenize(row['clean4_text'], engine="newmm-safe"), axis=1)
    # Join the segmented words with spaces
    data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)

    return data

def pretty_output(lines, sentiment):
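    # Map each score vector to a label, build a Neg/Neu/Pos count bar chart,
    # and collect the formatted comments per class.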

    label = np.array(['Neg', 'Neu', 'Pos'])
    txt_sentiment = label[np.argmax(sentiment, axis=1)]
    seriesText = pd.Series(txt_sentiment).value_counts()
    df = pd.DataFrame({'Sentiment': seriesText.index, 'Count': seriesText.values})
    fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment')
    fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])

    txt_pos = ''
    txt_neu = ''
    txt_neg = ''
    for (x, y, score) in zip(lines, txt_sentiment, sentiment):
        txt_score = [f"{i:.2f}" for i in score]
        tmp = f'{y} {txt_score}:-{x} \n'
        if y == 'Pos':
            txt_pos += tmp
        elif y == 'Neu':
            txt_neu += tmp
        else:
            txt_neg += tmp

    return txt_pos, txt_neu, txt_neg, fig

# Parametric heart curve, drawn when the video/comments cannot be loaded.
def fx(t):
    return 16 * np.sin(t) ** 3

def fy(t):
    return 13 * np.cos(t) - 5 * np.cos(2 * t) - 2 * np.cos(3 * t) - np.cos(4 * t)

def combine(a, b):
    data = pd.DataFrame()
    # Free-text input: one comment per line (blank lines are dropped).
    lines = [ln for ln in a.split('\n') if ln.strip()]
    if b != "":
        # A YouTube URL takes precedence over the free-text input.
        lines = get_comments(b)
        if not lines:
            print('CANNOT GET DATA from YouTube')

    if not lines:
        # Nothing to analyze: draw the heart curve and embed a fallback video.
        t = np.linspace(-2 * np.pi, 2 * np.pi)
        xs = fx(t)
        ys = fy(t)
        plt.plot(xs, ys, "o")
        plt.title('My Heart')
        plt.xlabel(' CANNOT LOAD VDO ')
        plt.ylabel(' CANNOT LOAD VDO ')

        str_output = 'https://www.youtube.com/embed/KRhPBvrBhro?si=U8sOh4ighEG9hkTI'

        embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'

        return (embed_html, "", "", "", plt.gcf())
    data['text'] = lines
    data = clean_me(data)

    X_train_indices = sentences_to_indices(data['wordseged_space_text'].values, word2vec, 128)
    result = model.predict(X_train_indices)
    txt_pos, txt_neu, txt_neg, fig = pretty_output(lines, result)

    # Rewrite a watch URL into an embeddable one, e.g.
    # https://www.youtube.com/watch?v=ID -> https://www.youtube.com/embed/ID
    str_output = re.sub(r'youtube\.com/', 'youtube.com/embed/', b)
    str_output = re.sub(r'watch\?v=', '', str_output)
    embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'
    print(embed_html)
    return embed_html, txt_pos, txt_neu, txt_neg, fig


def mirror(x):
    return x
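# (mirror is an unused identity helper, kept from earlier experiments.)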



# Gradio UI: a URL box and a free-text box feed combine(); outputs are the
# embedded video, a sentiment bar chart, and per-class comment lists.
with gr.Blocks() as demo:
    with gr.Row():
        txt_2 = gr.Textbox(label="Input: Youtube URL")
        txt   = gr.Textbox(label="Input: TEXT", lines=2)

    with gr.Row():
        btn   = gr.Button(value="Submit")
    with gr.Row():
        HTM   = gr.HTML(value="", label="Youtube VDO")
        plot = gr.Plot(label="Plot")
    with gr.Row():
        txt_NEG = gr.Textbox(value="", label="Negative comments")
        txt_NEU = gr.Textbox(value="", label="Neutral comments")
        txt_POS = gr.Textbox(value="", label="Positive comments")

    btn.click(combine, inputs=[txt, txt_2], outputs=[HTM, txt_POS, txt_NEU, txt_NEG, plot])


if __name__ == "__main__":
    demo.launch()