import gradio as gr
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
from pythainlp.util import normalize
from pythainlp.tokenize import word_tokenize
from pythainlp import word_vector
import numpy as np
import keras
import plotly.express as px
#################
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import chromedriver_autoinstaller
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# auto-install a chromedriver matching the installed Chrome version
chromedriver_autoinstaller.install()
wv = word_vector.WordVector()
word2vec = wv.get_model()
model = keras.models.load_model('my_model3.h5')
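# Note: my_model3.h5 is assumed (based on its use below) to be a Keras sequence
# classifier that takes word-index sequences of length 128 and outputs three
# class scores, decoded in pretty_output() as ['Neg', 'Neu', 'Pos'].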
def get_comments(VIDEO_URL):
    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(VIDEO_URL)
    # Wait for the comments to load
    time.sleep(5)
    # Scroll down to load more comments (optional, repeat as needed)
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(2)
    # Collect the comment texts
    comment_elements = driver.find_elements(By.XPATH, '//yt-formatted-string[@id="content-text"]')
    data = []
    for comment in comment_elements:
        text = comment.text.strip()
        if text:  # skip empty comments (the original compared the WebElement itself to '')
            data.append(text)
            print(text)
    # Close the WebDriver
    driver.quit()
    return data
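# Illustrative usage (not executed at import; requires a working Chrome +
# chromedriver setup, and VIDEO_ID is a placeholder):
#   comments = get_comments('https://www.youtube.com/watch?v=VIDEO_ID')
#   print(f'scraped {len(comments)} comments')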
def cosine_sim(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
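# cosine_sim is the standard cosine similarity between two vectors. A minimal
# sketch with the loaded Thai word vectors (assumes both words exist in the
# vocabulary):
#   sim = cosine_sim(word2vec['แมว'], word2vec['หมา'])  # similarity of "cat" vs "dog"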
def sentences_to_indices(X, word2vec, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape is suitable as input to an `Embedding()` layer.

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word2vec -- gensim KeyedVectors (here: pythainlp's pretrained Thai word vectors)
    max_len -- maximum number of words in a sentence; longer sentences are truncated

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    m = X.shape[0]  # number of examples
    # Initialize X_indices as a numpy matrix of zeros with the correct shape
    X_indices = np.zeros((m, max_len))
    for i in range(m):  # loop over examples
        # Lower-case the ith sentence, split it into words, and truncate to max_len
        sentence_words = X[i].lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            # Out-of-vocabulary words keep index 0 (padding); this check makes
            # the original try/except around a KeyError unnecessary
            if w in word2vec.key_to_index:
                X_indices[i, j] = word2vec.key_to_index[w]
    return X_indices
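# Shape sketch (illustrative): for one space-joined, tokenized sentence,
#   idx = sentences_to_indices(np.array(['สวัสดี ครับ']), word2vec, max_len=128)
# yields idx.shape == (1, 128), with the first two entries set to the words'
# vocabulary indices (if present) and the remaining entries left as 0 (padding).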
def deEmojify(text):
    # Strip emoji and pictographic characters via Unicode ranges
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, geometric shapes & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)
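# Example (illustrative): deEmojify('ดีมาก 😀👍') returns 'ดีมาก '; the Thai text
# survives while the emoji, which fall in the ranges above, are stripped.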
def clean_me(data):
    # Strip HTML tags
    data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
    # Collapse whitespace and lower-case (regex=True is required for the '+'
    # quantifiers; plain str.replace would treat them literally)
    data['clean2_text'] = (data['clean_text'].str.strip().str.lower()
                           .str.replace(r'[\r\n\t]+', ' ', regex=True))
    data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
    # Normalize Thai text
    data['clean4_text'] = data.apply(lambda row: normalize(row['clean3_text']), axis=1)
    # Word segmentation: this can take a while...
    data['wordseged_text'] = data.apply(lambda row: word_tokenize(row['clean4_text'], engine="newmm-safe"), axis=1)
    # Join the segmented words with spaces
    data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)
    return data
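# Illustrative pipeline run (not executed at import; exact tokens depend on the
# newmm dictionary):
#   df = pd.DataFrame({'text': ['<b>สนุกมาก</b> 😀']})
#   clean_me(df)['wordseged_space_text'][0]  # e.g. 'สนุก มาก'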
def pretty_output(lines, sentiment):
    label = np.array(['Neg', 'Neu', 'Pos'])
    txt_sentiment = label[np.argmax(sentiment, axis=1)]
    seriesText = pd.Series(txt_sentiment).value_counts()
    df = pd.DataFrame({'Sentiment': seriesText.index, 'Count': seriesText.values})
    fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment')
    fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])
    txt_pos = ''
    txt_neu = ''
    txt_neg = ''
    # Bucket each comment, with its per-class scores, into the matching output box
    for (x, y, score) in zip(lines, txt_sentiment, sentiment):
        txt_score = [f"{i:.2f}" for i in score]
        tmp = f'{y} {txt_score}:-{x} \n'
        if y == 'Pos':
            txt_pos += tmp
        elif y == 'Neu':
            txt_neu += tmp
        else:
            txt_neg += tmp
    return txt_pos, txt_neu, txt_neg, fig
def fx(t):
    return 16 * np.sin(t) ** 3

def fy(t):
    return 13 * np.cos(t) - 5 * np.cos(2 * t) - 2 * np.cos(3 * t) - np.cos(4 * t)
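# fx/fy parametrize the classic heart curve; combine() below plots it as a
# placeholder figure when no comments could be retrieved, e.g.:
#   t = np.linspace(-2 * np.pi, 2 * np.pi)
#   plt.plot(fx(t), fy(t), 'o')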
def combine(a, b):
    data = pd.DataFrame()
    lines = str.split(a, '\n')
    if b != "":
        lines = get_comments(b)
        if not lines:
            print('CANNOT GET DATA from YouTube')
    if not lines:
        # Fallback: draw the parametric heart and embed a default video
        t = np.linspace(-2 * np.pi, 2 * np.pi)
        xs = fx(t)
        ys = fy(t)
        plt.plot(xs, ys, "o")
        plt.title('My Heart')
        plt.xlabel(' CANNOT LOAD VDO ')
        plt.ylabel(' CANNOT LOAD VDO ')
        str_output = 'https://www.youtube.com/embed/KRhPBvrBhro?si=U8sOh4ighEG9hkTI'
        embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'
        return (embed_html, "", "", "", plt.gcf())
    data['text'] = lines
    data = clean_me(data)
    X_train_indices = sentences_to_indices(data['wordseged_space_text'].values, word2vec, 128)
    result = model.predict(X_train_indices)
    txt_pos, txt_neu, txt_neg, fig = pretty_output(lines, result)
    # Rewrite a watch URL into an embeddable one
    str_output = re.sub(r'youtube\.com/', 'youtube.com/embed/', b)
    str_output = re.sub(r'watch\?v=', '', str_output)
    embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'
    print(embed_html)
    return embed_html, txt_pos, txt_neu, txt_neg, fig
def mirror(x):
    return x
with gr.Blocks() as demo:
    with gr.Row():
        txt_2 = gr.Textbox(label="Input: Youtube URL")
        txt = gr.Textbox(label="Input: TEXT", lines=2)
    with gr.Row():
        btn = gr.Button(value="Submit")
    with gr.Row():
        HTM = gr.HTML(value="", label="Youtube VDO")
        plot = gr.Plot(label="Plot")
    with gr.Row():
        txt_NEG = gr.Textbox(value="", label="Negative comments")
        txt_NEU = gr.Textbox(value="", label="Neutral comments")
        txt_POS = gr.Textbox(value="", label="Positive comments")
    btn.click(combine, inputs=[txt, txt_2], outputs=[HTM, txt_POS, txt_NEU, txt_NEG, plot])

if __name__ == "__main__":
    demo.launch()