"""Gradio app that classifies the emotion of a poem with pre-trained LSTM models.

Three models are available ("savani", "hartmann", "deepseek"); each was
trained against a different emotion label set.  The user's text is cleaned,
tokenized with the tokenizer saved alongside the chosen model, padded and fed
to that model; the label with the highest score is shown.

NOTE(review): this file was reconstructed from a whitespace-collapsed source.
Statement order and all runtime strings are preserved; the nesting of the UI
components (in particular the two trailing Markdown blocks) should be
confirmed against the intended layout.
"""

import gradio as gr
import tensorflow as tf
import pickle
import unicodedata
import contractions
import re
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
# Imported under an alias because this module defines its own
# ``load_model(model_name)`` further down, which would otherwise shadow it.
from tensorflow.keras.models import load_model as keras_load_model  # type: ignore
from tensorflow.keras.utils import pad_sequences  # type: ignore

# NLTK corpora required by the cleaning pipeline below.
nltk.download('words')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
english_words = set(words.words())


def loadCustomDict(path):
    """Read a vocabulary file and return its entries as a set.

    Each non-blank line is stripped and lower-cased.

    Args:
        path: Path of a text file with one entry per line.

    Returns:
        A set of lower-cased, stripped entries.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        return set(line.strip().lower() for line in file if line.strip())


def normalizeWhitespace(text):
    """Normalize a raw poem string into lower-case, punctuation-free text.

    Steps, in order: NFKC normalization, contraction expansion, tab/CR to
    space, removal of standalone numbers, dashes, underscores and remaining
    punctuation, collapsing of immediately repeated words, whitespace
    collapsing, stripping and lower-casing.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    text = unicodedata.normalize('NFKC', text)
    text = contractions.fix(text)
    text = re.sub(r'[\t\r]+', ' ', text)  # Replace tabs / carriage returns
    text = re.sub(r'\b\d+\b', '', text)  # Remove standalone numbers
    text = re.sub(r'[-‐‑‒–—―]+', '', text)  # Remove hyphens and dashes
    text = re.sub(r'[_﹍﹎_]', '', text)  # Remove underscore variants
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation symbols
    # Collapse a word repeated back-to-back ("the the the" -> "the").
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text


def removeOtherLanguage(text):
    """Drop a trailing translation section and words with non-ASCII characters.

    Everything from the first occurrence of ``' translated'`` onwards is cut
    off, then every word containing a non-ASCII character is removed and
    whitespace is collapsed.

    Args:
        text: The input string.

    Returns:
        The filtered, stripped, lower-cased string.
    """
    phrase = ' translated'
    pos = text.find(phrase)
    if pos != -1:
        text = text[:pos].rstrip()
    text = re.sub(r'\b\w*[^\x00-\x7F]\w*\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text


def removeNonEnglish(text_series, custom_dict):
    """Keep only lemmatized English, non-stop-word tokens in each text.

    Args:
        text_series: pandas Series of strings to clean.
        custom_dict: Collection of entries to delete from the texts before
            the English-word filter is applied.

    Returns:
        A Series aligned to ``text_series.index``; rows in which no token
        survives become the empty string.
    """
    if custom_dict:
        # Longest entries first, ties broken alphabetically: iterating a set
        # directly gives an order that can differ between runs, which made
        # the match nondeterministic when one entry is a prefix of another.
        ordered = sorted(custom_dict, key=lambda entry: (-len(entry), entry))
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in ordered) + r')\b'
        temp_series = text_series.str.replace(pattern, '', case=False, regex=True)
    else:
        # Nothing to remove; an empty alternation would be a no-op anyway.
        temp_series = text_series

    split_words = temp_series.str.split()
    exploded = split_words.explode()  # one row per token, original index kept
    exploded = exploded[exploded.str.lower().isin(english_words)]
    filtered = exploded[~exploded.str.lower().isin(stop_words)]
    lemmatized = filtered.apply(lambda word: lemmatizer.lemmatize(word.lower()))

    # Re-assemble the tokens of each original row into one string.
    cleaned_text_series = lemmatized.groupby(level=0).agg(' '.join)

    # Lemmatization can create new adjacent duplicates; collapse them again.
    pattern2 = r'\b(\w+)(?:\s+\1\b)+'
    ser = cleaned_text_series.reindex(text_series.index, fill_value='')
    text = ser.str.replace(pattern2, r'\1', case=False, regex=True)
    return text


def cleanInference(df):
    """Run the full cleaning pipeline on the ``'poem'`` column of ``df``.

    The custom vocabulary is read from ``custom_vocab.txt`` on every call.

    Args:
        df: DataFrame with a ``'poem'`` column of strings; modified in place.

    Returns:
        The same DataFrame with the cleaned ``'poem'`` column.
    """
    custom_dict = loadCustomDict('custom_vocab.txt')
    df['poem'] = df['poem'].apply(normalizeWhitespace)
    df['poem'] = df['poem'].apply(removeOtherLanguage)
    df['poem'] = removeNonEnglish(df['poem'], custom_dict)
    return df


def kerasTokenizer(text, tokenizer):
    """Convert texts to integer sequences padded/truncated to length 128.

    Args:
        text: Iterable of strings.
        tokenizer: Object exposing ``texts_to_sequences``.

    Returns:
        The result of ``pad_sequences(..., maxlen=128)``.
    """
    text_sequence = tokenizer.texts_to_sequences(text)
    text_padded = pad_sequences(text_sequence, maxlen=128)
    return text_padded


# Emotion labels of each labelling technique.
_LABEL_SETS = {
    'hartmann': ['sadness', 'fear', 'anger', 'joy', 'neutral', 'surprise', 'disgust'],
    'savani': ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise'],
    'deepseek': ['other', 'sadness', 'joy', 'hope', 'love'],
}


def getLabelEncoder(name):
    """Return the index -> label mapping for a labelling technique.

    Labels are numbered in alphabetical order (presumably mirroring the
    encoder used at training time -- confirm against the training code).

    Args:
        name: One of ``'hartmann'``, ``'savani'`` or ``'deepseek'``.

    Returns:
        A dict mapping class index to label, or ``None`` for an unknown name.
    """
    labels = _LABEL_SETS.get(name)
    if labels is None:
        return None
    return dict(enumerate(sorted(labels)))


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code; only use this on
    artifact files shipped with the application, never on untrusted input.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


tokenizer_savani = _load_pickle("tokenizer_savani_0.1_lstm.pkl")
tokenizer_hartman = _load_pickle("tokenizer_hartmann_0.1_lstm.pkl")
tokenizer_deepseek = _load_pickle("tokenizer_deepseek_0.1_lstm.pkl")

model_savani = keras_load_model("best_model_savani_0.1_lstm.keras")
model_hartman = keras_load_model("best_model_hartmann_0.1_lstm.keras")
model_deepseek = keras_load_model("best_model_deepseek_0.1_lstm.keras")

# Model name -> its Keras model and the tokenizer it was trained with.
MODELS = {
    "savani": {
        "model": model_savani,
        "tokenizer": tokenizer_savani,
    },
    "hartmann": {
        "model": model_hartman,
        "tokenizer": tokenizer_hartman,
    },
    "deepseek": {
        "model": model_deepseek,
        "tokenizer": tokenizer_deepseek,
    },
}

# Cache of (tokenizer, model) pairs handed out by ``load_model``.
loaded_models = {}


def load_model(model_name):
    """Return the ``(tokenizer, model)`` pair registered under ``model_name``.

    Args:
        model_name: A key of ``MODELS``.

    Returns:
        Tuple ``(tokenizer, model)``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    if model_name not in loaded_models:
        tokenizer = MODELS[model_name]['tokenizer']
        model = MODELS[model_name]['model']
        loaded_models[model_name] = (tokenizer, model)
    return loaded_models[model_name]


def predict_poem(poem, model_name):
    """Predict the emotion label of a single poem.

    Args:
        poem: The poem text.
        model_name: A key of ``MODELS`` selecting the model and label set.

    Returns:
        The label whose score is highest for the poem.
    """
    tokenizer, model = load_model(model_name)
    poem_df = pd.DataFrame({'poem': [poem]})
    clean_poem_df = cleanInference(poem_df)
    text_keras = kerasTokenizer(clean_poem_df['poem'], tokenizer)
    result = model.predict(text_keras, verbose=0)
    predicted_labels = np.argmax(result, axis=1)
    dic = getLabelEncoder(model_name)
    return dic[predicted_labels[0]]


with gr.Blocks(title="NLP Model Text Classifier") as demo:
    gr.Markdown("## 📜 Poem Emotion Classification")
    gr.Markdown("""
    ### - **Step 1:** Select a labeling technique (model - each has different emotion labels)
    ### - **Step 2:** Enter your poem text
    ### - **Output:** Predicted emotion
    """)

    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="savani",
                interactive=True,
                label="Select Labelling Technique Model"
            )
            text_input = gr.Textbox(
                lines=5,
                placeholder="Enter text here...",
                label="Input Text",
                interactive=True
            )
            submit_btn = gr.Button("Classify", variant="primary")

        with gr.Column():
            output_label = gr.Label(label="Classification Results")
            # NOTE(review): placement of the two Markdown blocks below inside
            # this column is inferred -- confirm against the intended layout.
            gr.Markdown("""
            **Poem References**
            - [Poem Hunter](https://www.poemhunter.com)
            - [Poem Generator](https://www.poem-generator.org.uk)
            - [HelloPoetry](https://hellopoetry.com)
            """)
            gr.Markdown("""
            **Class Available for Each Labelling Model Technique**
            - **Hartmann**: ['sadness', 'fear', 'anger', 'joy', 'neutral', 'surprise', 'disgust']
            - **Savani**: ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise']
            - **Deepseek**: ['other', 'sadness', 'joy', 'hope', 'love']
            """)

    submit_btn.click(
        fn=predict_poem,
        inputs=[text_input, model_selector],
        outputs=[output_label]
    )

demo.launch()