import gradio as gr
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
from pythainlp.util import normalize
from pythainlp.tokenize import word_tokenize
from pythainlp import word_vector
import numpy as np
import keras
import plotly.express as px
#################
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import chromedriver_autoinstaller
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# auto-install a chromedriver matching the installed Chrome version
chromedriver_autoinstaller.install()
wv = word_vector.WordVector()
word2vec = wv.get_model()
model = keras.models.load_model('my_model3.h5')
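# Note: my_model3.h5 is assumed (based on its use below) to be a Keras sequence
# classifier that takes word-index sequences of length 128 and outputs three
# class scores, decoded in pretty_output() as ['Neg', 'Neu', 'Pos'].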
def get_comments(VIDEO_URL):
    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(VIDEO_URL)
    # Wait for the comments to load
    time.sleep(5)
    # Scroll down to load more comments (optional, repeat as needed)
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(2)
    # Collect the comment texts
    comment_elements = driver.find_elements(By.XPATH, '//yt-formatted-string[@id="content-text"]')
    data = []
    for comment in comment_elements:
        text = comment.text.strip()
        if text:  # skip empty comments (the original compared the WebElement itself to '')
            data.append(text)
            print(text)
    # Close the WebDriver
    driver.quit()
    return data
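# Illustrative usage (not executed at import; requires a working Chrome +
# chromedriver setup, and VIDEO_ID is a placeholder):
#   comments = get_comments('https://www.youtube.com/watch?v=VIDEO_ID')
#   print(f'scraped {len(comments)} comments')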
def cosine_sim(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
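# cosine_sim is the standard cosine similarity between two vectors. A minimal
# sketch with the loaded Thai word vectors (assumes both words exist in the
# vocabulary):
#   sim = cosine_sim(word2vec['แมว'], word2vec['หมา'])  # similarity of "cat" vs "dog"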
def sentences_to_indices(X, word2vec, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape is suitable as input to an `Embedding()` layer.

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word2vec -- gensim KeyedVectors (here: pythainlp's pretrained Thai word vectors)
    max_len -- maximum number of words in a sentence; longer sentences are truncated

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    m = X.shape[0]  # number of examples
    # Initialize X_indices as a numpy matrix of zeros with the correct shape
    X_indices = np.zeros((m, max_len))
    for i in range(m):  # loop over examples
        # Lower-case the ith sentence, split it into words, and truncate to max_len
        sentence_words = X[i].lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            # Out-of-vocabulary words keep index 0 (padding); this check makes
            # the original try/except around a KeyError unnecessary
            if w in word2vec.key_to_index:
                X_indices[i, j] = word2vec.key_to_index[w]
    return X_indices
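# Shape sketch (illustrative): for one space-joined, tokenized sentence,
#   idx = sentences_to_indices(np.array(['สวัสดี ครับ']), word2vec, max_len=128)
# yields idx.shape == (1, 128), with the first two entries set to the words'
# vocabulary indices (if present) and the remaining entries left as 0 (padding).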
def deEmojify(text):
    # Strip emoji and pictographic characters via Unicode ranges
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, geometric shapes & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)
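# Example (illustrative): deEmojify('ดีมาก 😀👍') returns 'ดีมาก '; the Thai text
# survives while the emoji, which fall in the ranges above, are stripped.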
def clean_me(data):
    # Strip HTML tags
    data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
    # Collapse whitespace and lower-case (regex=True is required for the '+'
    # quantifiers; plain str.replace would treat them literally)
    data['clean2_text'] = (data['clean_text'].str.strip().str.lower()
                           .str.replace(r'[\r\n\t]+', ' ', regex=True))
    data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
    # Normalize Thai text
    data['clean4_text'] = data.apply(lambda row: normalize(row['clean3_text']), axis=1)
    # Word segmentation: this can take a while...
    data['wordseged_text'] = data.apply(lambda row: word_tokenize(row['clean4_text'], engine="newmm-safe"), axis=1)
    # Join the segmented words with spaces
    data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)
    return data
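# Illustrative pipeline run (not executed at import; exact tokens depend on the
# newmm dictionary):
#   df = pd.DataFrame({'text': ['<b>สนุกมาก</b> 😀']})
#   clean_me(df)['wordseged_space_text'][0]  # e.g. 'สนุก มาก'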
def pretty_output(lines, sentiment):
    label = np.array(['Neg', 'Neu', 'Pos'])
    txt_sentiment = label[np.argmax(sentiment, axis=1)]
    seriesText = pd.Series(txt_sentiment).value_counts()
    df = pd.DataFrame({'Sentiment': seriesText.index, 'Count': seriesText.values})
    fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment')
    fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])
    txt_pos = ''
    txt_neu = ''
    txt_neg = ''
    # Bucket each comment, with its per-class scores, into the matching output box
    for (x, y, score) in zip(lines, txt_sentiment, sentiment):
        txt_score = [f"{i:.2f}" for i in score]
        tmp = f'{y} {txt_score}:-{x} \n'
        if y == 'Pos':
            txt_pos += tmp
        elif y == 'Neu':
            txt_neu += tmp
        else:
            txt_neg += tmp
    return txt_pos, txt_neu, txt_neg, fig
def fx(t):
    return 16 * np.sin(t) ** 3

def fy(t):
    return 13 * np.cos(t) - 5 * np.cos(2 * t) - 2 * np.cos(3 * t) - np.cos(4 * t)
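# fx/fy parametrize the classic heart curve; combine() below plots it as a
# placeholder figure when no comments could be retrieved, e.g.:
#   t = np.linspace(-2 * np.pi, 2 * np.pi)
#   plt.plot(fx(t), fy(t), 'o')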
def combine(a, b):
    data = pd.DataFrame()
    lines = str.split(a, '\n')
    if b != "":
        lines = get_comments(b)
        if not lines:
            print('CANNOT GET DATA from YouTube')
    if not lines:
        # Fallback: draw the parametric heart and embed a default video
        t = np.linspace(-2 * np.pi, 2 * np.pi)
        xs = fx(t)
        ys = fy(t)
        plt.plot(xs, ys, "o")
        plt.title('My Heart')
        plt.xlabel(' CANNOT LOAD VDO ')
        plt.ylabel(' CANNOT LOAD VDO ')
        str_output = 'https://www.youtube.com/embed/KRhPBvrBhro?si=U8sOh4ighEG9hkTI'
        embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'
        return (embed_html, "", "", "", plt.gcf())
    data['text'] = lines
    data = clean_me(data)
    X_train_indices = sentences_to_indices(data['wordseged_space_text'].values, word2vec, 128)
    result = model.predict(X_train_indices)
    txt_pos, txt_neu, txt_neg, fig = pretty_output(lines, result)
    # Rewrite a watch URL into an embeddable one
    str_output = re.sub(r'youtube\.com/', 'youtube.com/embed/', b)
    str_output = re.sub(r'watch\?v=', '', str_output)
    embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'
    print(embed_html)
    return embed_html, txt_pos, txt_neu, txt_neg, fig
def mirror(x):
    return x
with gr.Blocks() as demo:
    with gr.Row():
        txt_2 = gr.Textbox(label="Input: Youtube URL")
        txt = gr.Textbox(label="Input: TEXT", lines=2)
    with gr.Row():
        btn = gr.Button(value="Submit")
    with gr.Row():
        HTM = gr.HTML(value="", label="Youtube VDO")
        plot = gr.Plot(label="Plot")
    with gr.Row():
        txt_NEG = gr.Textbox(value="", label="Negative comments")
        txt_NEU = gr.Textbox(value="", label="Neutral comments")
        txt_POS = gr.Textbox(value="", label="Positive comments")
    btn.click(combine, inputs=[txt, txt_2], outputs=[HTM, txt_POS, txt_NEU, txt_NEG, plot])

if __name__ == "__main__":
    demo.launch()