# Non-code page residue captured with this file, kept for provenance:
# hfzdzakii's picture
# Revise markdown
# 3c0e106
import gradio as gr
import tensorflow as tf
import pickle
import unicodedata
import contractions
import re
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model #type:ignore
from tensorflow.keras.utils import pad_sequences # type: ignore
# Fetch the NLTK resources requested by this script, in a fixed order,
# before the corpora below are read.
for _resource in ('words', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(_resource)

# Shared, module-level helpers used by removeNonEnglish().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))   # English stop words to drop
english_words = set(words.words())             # vocabulary used as an allow-list
def loadCustomDict(path, encoding='utf-8'):
    """Load a vocabulary file into a set of lower-cased entries.

    Every non-blank line contributes one entry; surrounding whitespace is
    stripped and the text is lower-cased, so duplicates that differ only in
    case or padding collapse into one entry.

    Args:
        path: Location of the text file (one entry per line).
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        A set with the unique, stripped, lower-cased lines of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(path, 'r', encoding=encoding) as file:
        return {line.strip().lower() for line in file if line.strip()}
def normalizeWhitespace(text):
    """Normalize a raw poem string into lower-cased, punctuation-free text.

    The text is NFKC-normalized, passed through ``contractions.fix``, and
    then run through a fixed sequence of regex substitutions. The order of
    the substitutions matters: punctuation is removed before repeated words
    are collapsed, and whitespace is squeezed last.

    Args:
        text: The input string.

    Returns:
        The cleaned string, stripped and lower-cased.
    """
    text = contractions.fix(unicodedata.normalize('NFKC', text))

    substitutions = (
        (r'[\t\r]+', ' '),                      # tabs / carriage returns -> space
        (r'\b\d+\b', ''),                       # drop standalone numbers
        (r'[-‐‑‒–—―]+', ''),                    # drop hyphen and dash variants
        (r'[_﹍﹎_]', ''),                        # drop underscore variants
        (r'[^\w\s]', ''),                       # drop remaining punctuation/symbols
        (r'\b(\w+)(?:\s+\1\b)+', r'\1'),        # collapse immediately repeated words
        (r'\s+', ' '),                          # squeeze whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Lower-casing happens last, so the repeated-word collapse above is
    # case-sensitive.
    return text.strip().lower()
def removeOtherLanguage(text):
    """Cut off trailing translation text and drop words with non-ASCII characters.

    Everything from the first occurrence of ``' translated'`` onward is
    discarded. Any word that contains at least one non-ASCII character is
    then removed, whitespace runs are squeezed to single spaces, and the
    result is stripped and lower-cased.

    Args:
        text: The input string.

    Returns:
        The cleaned, lower-cased string.
    """
    head, marker, _ = text.partition(' translated')
    if marker:
        # The marker was found: keep only what precedes it.
        text = head.rstrip()

    ascii_only = re.sub(r'\b\w*[^\x00-\x7F]\w*\b', '', text)
    squeezed = re.sub(r'\s+', ' ', ascii_only)
    return squeezed.strip().lower()
def removeNonEnglish(text_series, custom_dict):
    """Keep only lemmatized English, non-stop-word tokens in each text.

    Steps, applied to the whole Series at once:
      1. Remove every whole-word occurrence (case-insensitive) of an entry
         in ``custom_dict``.
      2. Split on whitespace and keep tokens whose lower-cased form is in
         the module-level ``english_words`` and not in ``stop_words``.
      3. Lemmatize the lower-cased tokens with the module-level
         ``lemmatizer`` and join them back per row with single spaces.
      4. Collapse immediately repeated words (case-insensitive).

    Args:
        text_series: pandas Series of strings.
        custom_dict: Iterable of words to strip before filtering.

    Returns:
        A Series indexed like ``text_series``; rows with no surviving token
        become the empty string.
    """
    blocklist_pattern = (
        r'\b(?:' + '|'.join(map(re.escape, custom_dict)) + r')\b'
    )
    without_custom = text_series.str.replace(
        blocklist_pattern, '', case=False, regex=True
    )

    # One token per row; each token keeps its source row's index label.
    tokens_lower = without_custom.str.split().explode().str.lower()
    keep = tokens_lower.isin(english_words) & ~tokens_lower.isin(stop_words)
    lemmas = tokens_lower[keep].apply(lemmatizer.lemmatize)

    # Re-assemble one string per original row, then restore rows that lost
    # every token as empty strings.
    rejoined = lemmas.groupby(level=0).agg(' '.join)
    restored = rejoined.reindex(text_series.index, fill_value='')

    repeated_word = r'\b(\w+)(?:\s+\1\b)+'
    return restored.str.replace(repeated_word, r'\1', case=False, regex=True)
def cleanInference(df):
    """Run the full text-cleaning pipeline on the ``'poem'`` column of ``df``.

    The custom vocabulary is read from ``custom_vocab.txt`` on every call.
    The column is overwritten in place after each stage, so the DataFrame
    passed in is modified.

    Args:
        df: DataFrame with a ``'poem'`` column of strings.

    Returns:
        The same DataFrame object, with ``'poem'`` replaced by cleaned text.
    """
    custom_dict = loadCustomDict('custom_vocab.txt')

    # Per-string stages, applied in this order.
    for stage in (normalizeWhitespace, removeOtherLanguage):
        df['poem'] = df['poem'].apply(stage)

    # Series-level stage: vocabulary filtering and lemmatization.
    df['poem'] = removeNonEnglish(df['poem'], custom_dict)
    return df
def kerasTokenizer(text, tokenizer):
    """Convert texts to integer sequences of fixed length 128.

    Args:
        text: Iterable of strings to encode.
        tokenizer: Object exposing ``texts_to_sequences`` (presumably the
            fitted tokenizer matching the chosen model; confirm with caller).

    Returns:
        The result of ``pad_sequences`` with ``maxlen=128`` and otherwise
        default settings.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text), maxlen=128)
def getLabelEncoder(name):
    """Return the index-to-label mapping for a labelling technique.

    Labels are sorted alphabetically and numbered from 0, so the mapping
    turns a predicted class index into its emotion name.

    Args:
        name: One of ``'hartmann'``, ``'savani'`` or ``'deepseek'``.

    Returns:
        A dict ``{index: label}``, or ``None`` when ``name`` is not one of
        the known techniques.
    """
    label_sets = (
        ('hartmann', ['sadness', 'fear', 'anger', 'joy', 'neutral', 'surprise', 'disgust']),
        ('savani', ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise']),
        ('deepseek', ['other', 'sadness', 'joy', 'hope', 'love']),
    )
    for technique, labels in label_sets:
        if name == technique:
            return dict(enumerate(sorted(labels)))
    return None
# Tokenizers are unpickled from relative paths (resolved against the
# working directory).
# NOTE: pickle.load can execute code embedded in the file; only load
# artifacts from a trusted source.
with open("tokenizer_savani_0.1_lstm.pkl", "rb") as f:
    tokenizer_savani = pickle.load(f)
with open("tokenizer_hartmann_0.1_lstm.pkl", "rb") as g:
    tokenizer_hartman = pickle.load(g)
with open("tokenizer_deepseek_0.1_lstm.pkl", "rb") as h:
    tokenizer_deepseek = pickle.load(h)

# At this point `load_model` is still the Keras loader imported at the top
# of the file: the same-named function defined further down has not been
# executed yet.
model_savani = load_model("best_model_savani_0.1_lstm.keras")
model_hartman = load_model("best_model_hartmann_0.1_lstm.keras")
model_deepseek = load_model("best_model_deepseek_0.1_lstm.keras")

# Registry of labelling technique -> its model and matching tokenizer.
# Insertion order is the order of the choices offered in the UI dropdown.
MODELS = {
    "savani": dict(model=model_savani, tokenizer=tokenizer_savani),
    "hartmann": dict(model=model_hartman, tokenizer=tokenizer_hartman),
    "deepseek": dict(model=model_deepseek, tokenizer=tokenizer_deepseek),
}

# Cache of (tokenizer, model) pairs, filled on first request per technique.
loaded_models = dict()
def load_model(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``.

    On the first request for a name, the pair is built from the ``MODELS``
    registry and stored in ``loaded_models``; later requests return the
    stored tuple.

    Note: defining this function rebinds the module-level name
    ``load_model``, which the import section binds to the Keras loader.
    Any call placed after this definition reaches this function, not Keras.

    Args:
        model_name: Key into ``MODELS``.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        KeyError: If ``model_name`` is not registered in ``MODELS``.
    """
    pair = loaded_models.get(model_name)
    if pair is None:
        entry = MODELS[model_name]
        pair = loaded_models[model_name] = (entry['tokenizer'], entry['model'])
    return pair
def predict_poem(poem, model_name):
    """Classify one poem and return the predicted emotion label.

    The poem is wrapped in a one-row DataFrame, cleaned with
    ``cleanInference``, encoded with the tokenizer of the selected
    technique, and scored by the matching model. The class with the highest
    score is translated to its label via ``getLabelEncoder``.

    Args:
        poem: Raw poem text.
        model_name: Labelling technique; a key of ``MODELS``.

    Returns:
        The predicted label as a string.
    """
    # `load_model` is this file's cache accessor, not the Keras loader.
    tokenizer, model = load_model(model_name)

    cleaned = cleanInference(pd.DataFrame({'poem': [poem]}))
    encoded = kerasTokenizer(cleaned['poem'], tokenizer)
    scores = model.predict(encoded, verbose=0)

    # Only one row was submitted, so take the best class of the first row.
    best_class = np.argmax(scores, axis=1)[0]
    return getLabelEncoder(model_name)[best_class]
# Gradio interface: pick a labelling technique, enter a poem, and show the
# label returned by predict_poem().
# NOTE(review): the nesting below was reconstructed; confirm the layout
# against the deployed app.
with gr.Blocks(title="NLP Model Text Classifier") as demo:
    gr.Markdown("## 📜 Poem Emotion Classification")
    # Usage instructions shown under the title.
    gr.Markdown("""
### - **Step 1:** Select a labeling technique (model - each has different emotion labels)
### - **Step 2:** Enter your poem text
### - **Output:** Predicted emotion
""")
    with gr.Row():
        # Input column: technique selector, poem text box, and submit button.
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="savani",
                interactive=True,
                label="Select Labelling Technique Model"
            )
            text_input = gr.Textbox(
                lines=5,
                placeholder="Enter text here...",
                label="Input Text",
                interactive=True
            )
            submit_btn = gr.Button("Classify", variant="primary")
        # Output column: predicted label.
        with gr.Column():
            output_label = gr.Label(label="Classification Results")
            # NOTE(review): assumed to sit in the output column; they may
            # instead belong below the row. Confirm the intended placement.
            gr.Markdown("""
**Poem References**
- [Poem Hunter](https://www.poemhunter.com)
- [Poem Generator](https://www.poem-generator.org.uk)
- [HelloPoetry](https://hellopoetry.com)
""")
            # Label lists as written in getLabelEncoder() (before sorting).
            gr.Markdown("""
**Class Available for Each Labelling Model Technique**
- **Hartmann**: ['sadness', 'fear', 'anger', 'joy', 'neutral', 'surprise', 'disgust']
- **Savani**: ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise']
- **Deepseek**: ['other', 'sadness', 'joy', 'hope', 'love']
""")
    # Wire the button: (poem text, technique) -> predict_poem -> label widget.
    submit_btn.click(
        fn=predict_poem,
        inputs=[text_input, model_selector],
        outputs=[output_label]
    )
# Runs at module level, with no __main__ guard.
demo.launch()