"""Streamlit app combining a fine-tuned BERT sentiment classifier with a
Selenium-based crawler that extracts an article body and its comments."""

import time

import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tensorflow.python.client import device_lib
from tqdm import tqdm
# Narrowed from `from transformers import *` to the names actually used.
from transformers import BertTokenizer, TFAutoModel

# Directory holding the fine-tuned BERT checkpoint.
PATH = './checkpoint-7500/'
# Fixed input sequence length the model was fine-tuned with.
SEQ_LEN = 128

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Lazily-built singleton model (see get_sentiment_model).
_sentiment_model = None


def create_sentiment_bert():
    """Build a binary-sentiment Keras model on top of the pretrained BERT.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[token_ids, attention_mask,
        segment_ids]`` — each of shape ``(batch, SEQ_LEN)`` — and producing a
        single sigmoid probability of positive sentiment.
    """
    # Load the fine-tuned BERT weights from the local checkpoint.
    model = TFAutoModel.from_pretrained(PATH, local_files_only=True)

    # Token, attention-mask and segment inputs.
    token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')

    # Model over [tokens, masks, segments]; index 1 is the pooled output.
    bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
    bert_outputs = bert_outputs[1]
    sentiment_first = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    )(bert_outputs)

    sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
    sentiment_model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return sentiment_model


def get_sentiment_model():
    """Return the sentiment model, building it once and reusing it afterwards.

    Fixes the original bug where ``movie_evaluation_predict`` read a global
    ``sentiment_model`` that only ever existed as a local inside ``main()``
    (a guaranteed ``NameError``), and avoids rebuilding the model on every
    Streamlit rerun.
    """
    global _sentiment_model
    if _sentiment_model is None:
        _sentiment_model = create_sentiment_bert()
    return _sentiment_model


def sentence_convert_data(data):
    """Tokenize a single sentence into model-ready input arrays.

    Args:
        data: the raw sentence string.

    Returns:
        ``[tokens, masks, segments]`` — three numpy arrays, each of shape
        ``(1, SEQ_LEN)``.
    """
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
    # Padding positions (token id 0) get mask 0, real tokens get mask 1.
    # NOTE(review): this assumes id 0 ([PAD]) never occurs inside real text,
    # which holds for BERT vocabularies — confirm if the tokenizer changes.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    # Single-sentence input: every segment id is 0.
    segment = [0] * SEQ_LEN

    tokens = np.array([token])
    masks = np.array([mask])
    segments = np.array([segment])
    return [tokens, masks, segments]


def movie_evaluation_predict(sentence):
    """Predict sentiment for ``sentence`` and render the verdict in Streamlit."""
    data_x = sentence_convert_data(sentence)
    # Use the lazily-built shared model (original code referenced an
    # undefined global here).
    predict = get_sentiment_model().predict(data_x)
    predict_value = np.ravel(predict)
    predict_answer = np.round(predict_value, 0).item()
    print(predict_value)

    if predict_answer == 0:
        st.write("(부정 확률 : %.2f) 부정적인 영화 평가입니다." % (1.0 - predict_value))
    elif predict_answer == 1:
        st.write("(긍정 확률 : %.2f) 긍정적인 영화 평가입니다." % predict_value)


def setup_driver():
    """Return a headless Chrome WebDriver suitable for server environments."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run without a display
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    return driver


def scrape_content(url):
    """Fetch ``url`` with Selenium and extract the article body and comments.

    Returns:
        dict with ``'content'`` (article text, or a fallback message when no
        ``<article>`` tag is found) and ``'comments'`` (list of comment
        strings). The driver is always quit, even on error.
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Crude wait for dynamically-loaded content.
        # TODO(review): prefer WebDriverWait + expected_conditions.
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Adjust these selectors to match the target site's markup.
        content = soup.find('article')
        comments = soup.find_all('span', class_='u_cbox_contents')

        return {
            'content': content.text if content else "본문을 찾을 수 없습니다.",
            'comments': [comment.text for comment in comments],
        }
    finally:
        driver.quit()


def main():
    """Streamlit entry point: URL input, crawl trigger, result display."""
    url = st.text_input("URL을 입력하세요")

    if st.button("크롤링 시작"):
        if url:
            with st.spinner("크롤링 중..."):
                result = scrape_content(url)
            st.subheader("본문")
            st.write(result['content'])
            st.subheader("댓글")
            for idx, comment in enumerate(result['comments'], 1):
                st.write(f"{idx}. {comment}")
        else:
            st.error("URL을 입력해주세요")

    # Disabled sentiment-input form, kept for reference:
    # test = st.form('test')
    # sentence = test.text_input("Your sentence")
    # submit = test.form_submit_button("Submit")
    # if submit:
    #     movie_evaluation_predict(sentence)
    return 0


if __name__ == "__main__":
    main()