import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import re
from pythainlp.util import normalize
from pythainlp.tokenize import word_tokenize
from pythainlp import word_vector
import numpy as np
import keras
import plotly.express as px
#################
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import chromedriver_autoinstaller
import sys
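# Leftover Colab-style path tweak; redundant once chromedriver_autoinstaller
# installs a matching driver below, but harmless.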
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Install a chromedriver matching the local Chrome/Chromium version
chromedriver_autoinstaller.install()

wv = word_vector.WordVector()
word2vec = wv.get_model()
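# word2vec is a gensim KeyedVectors of pretrained Thai word embeddings bundled
# with PyThaiNLP; its key_to_index mapping is used for lookups below.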

model = keras.models.load_model('my_model3.h5')
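# Assumed: the model takes 128-token index sequences and outputs 3 class
# scores ordered (Neg, Neu, Pos), matching pretty_output() below.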

def get_comments(VIDEO_URL):
    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    # Your scraping code here
    #VIDEO_URL = 'https://www.youtube.com/watch?v=VIDEO_ID'
    driver.get(VIDEO_URL)

    # Wait for the comments to load
    time.sleep(5)

    # Scroll down to load more comments (optional, repeat as needed)
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(2)

    # Find and print comments
    comment_elements = driver.find_elements(By.XPATH, '//yt-formatted-string[@id="content-text"]')
    data = []
    for comment in comment_elements:
        # comment is a WebElement, so compare its text (the original compared
        # the element itself to '', which is always true).
        text = comment.text.strip()
        if text:
            data.append(text)
            print(text)


    # Close the WebDriver
    driver.quit()

    return data
def cosine_sim(u, v):
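    # Cosine similarity between two vectors; a small utility that the rest of
    # the app does not currently call.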
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def sentences_to_indices(X, word2vec, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()`.

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word2vec -- a trained Word2Vec model from gensim
    max_len -- maximum number of words in a sentence; longer sentences are truncated.

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """

    m = X.shape[0]                                   # number of training examples

    # Initialize X_indices as a numpy matrix of zeros and the correct shape
    X_indices = np.zeros((m, max_len))

    for i in range(m):                               # loop over training examples

        # Convert the ith sentence to lower case and split it into words,
        # truncated to max_len tokens.
        sentence_words = X[i].lower().split()[:max_len]

        # Initialize j to 0
        j = 0
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i, j)th entry of X_indices to the index of the word;
            # out-of-vocabulary words are skipped, so no try/except is needed.
            if w in word2vec.key_to_index:
                X_indices[i, j] = word2vec.key_to_index[w]
                j += 1


    return X_indices
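# Note: unfilled slots in X_indices stay 0, which implicitly treats index 0 as
# padding; in gensim vocabularies index 0 is usually a real (frequent) word,
# so this is an assumption inherited from how the model was trained.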


def deEmojify(text):
    # Strip emoji and other pictographic symbols before normalization.
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)


def clean_me(data):
    # Strip HTML tags.
    data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
    # Trim, lower-case, and collapse carriage returns / newlines / tabs
    # (regex=True is required for these patterns on current pandas).
    data['clean2_text'] = (data['clean_text'].str.strip().str.lower()
                           .str.replace(r'\r+', ' ', regex=True)
                           .str.replace(r'\n+', ' ', regex=True)
                           .str.replace(r'\t+', ' ', regex=True))
    data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
    # Normalize text
    data['clean4_text'] = data.apply(lambda row: normalize(row['clean3_text']), axis=1)
    # Word segmentation: it will take a while....
    data['wordseged_text'] = data.apply(lambda row: word_tokenize(row['clean4_text'], engine="newmm-safe"), axis=1)
    # Join the segmented words with spaces
    data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)

    return data

def pretty_output(lines, sentiment):
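    # Map each score vector to a label, build a Neg/Neu/Pos count bar chart,
    # and collect the formatted comments per class.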

    label = np.array(['Neg', 'Neu', 'Pos'])
    txt_sentiment = label[np.argmax(sentiment, axis=1)]
    seriesText = pd.Series(txt_sentiment).value_counts()
    df = pd.DataFrame({'Sentiment': seriesText.index, 'Count': seriesText.values})
    fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment')
    fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])

    txt_pos = ''
    txt_neu = ''
    txt_neg = ''
    for (x, y, score) in zip(lines, txt_sentiment, sentiment):
        txt_score = [f"{i:.2f}" for i in score]
        tmp = f'{y} {txt_score}:-{x} \n'
        if y == 'Pos':
            txt_pos += tmp
        elif y == 'Neu':
            txt_neu += tmp
        else:
            txt_neg += tmp

    return txt_pos, txt_neu, txt_neg, fig

# Parametric heart curve, drawn when the video/comments cannot be loaded.
def fx(t):
    return 16 * np.sin(t) ** 3

def fy(t):
    return 13 * np.cos(t) - 5 * np.cos(2 * t) - 2 * np.cos(3 * t) - np.cos(4 * t)

def combine(a, b):
    data = pd.DataFrame()
    # Free-text input: one comment per line (blank lines are dropped).
    lines = [ln for ln in a.split('\n') if ln.strip()]
    if b != "":
        # A YouTube URL takes precedence over the free-text input.
        lines = get_comments(b)
        if not lines:
            print('CANNOT GET DATA from YouTube')

    if not lines:
        # Nothing to analyze: draw the heart curve and embed a fallback video.
        t = np.linspace(-2 * np.pi, 2 * np.pi)
        xs = fx(t)
        ys = fy(t)
        plt.plot(xs, ys, "o")
        plt.title('My Heart')
        plt.xlabel(' CANNOT LOAD VDO ')
        plt.ylabel(' CANNOT LOAD VDO ')

        str_output = 'https://www.youtube.com/embed/KRhPBvrBhro?si=U8sOh4ighEG9hkTI'

        embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'

        return (embed_html, "", "", "", plt.gcf())
    data['text'] = lines
    data = clean_me(data)

    X_train_indices = sentences_to_indices(data['wordseged_space_text'].values, word2vec, 128)
    result = model.predict(X_train_indices)
    txt_pos, txt_neu, txt_neg, fig = pretty_output(lines, result)

    # Rewrite a watch URL into an embeddable one, e.g.
    # https://www.youtube.com/watch?v=ID -> https://www.youtube.com/embed/ID
    str_output = re.sub(r'youtube\.com/', 'youtube.com/embed/', b)
    str_output = re.sub(r'watch\?v=', '', str_output)
    embed_html = f'<center> <iframe id="ytplayer" type="text/html" width="560" height="315" src="{str_output}" frameborder="0" allowfullscreen></iframe> </center>'
    print(embed_html)
    return embed_html, txt_pos, txt_neu, txt_neg, fig


def mirror(x):
    return x
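# (mirror is an unused identity helper, kept from earlier experiments.)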



# Gradio UI: a URL box and a free-text box feed combine(); outputs are the
# embedded video, a sentiment bar chart, and per-class comment lists.
with gr.Blocks() as demo:
    with gr.Row():
        txt_2 = gr.Textbox(label="Input: Youtube URL")
        txt   = gr.Textbox(label="Input: TEXT", lines=2)

    with gr.Row():
        btn   = gr.Button(value="Submit")
    with gr.Row():
        HTM   = gr.HTML(value="", label="Youtube VDO")
        plot = gr.Plot(label="Plot")
    with gr.Row():
        txt_NEG = gr.Textbox(value="", label="Negative comments")
        txt_NEU = gr.Textbox(value="", label="Neutral comments")
        txt_POS = gr.Textbox(value="", label="Positive comments")

    btn.click(combine, inputs=[txt, txt_2], outputs=[HTM, txt_POS, txt_NEU, txt_NEG, plot])


if __name__ == "__main__":
    demo.launch()