Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import tensorflow as tf | |
| import pickle | |
| import unicodedata | |
| import contractions | |
| import re | |
| import nltk | |
| import pandas as pd | |
| import numpy as np | |
| from nltk.corpus import stopwords, words | |
| from nltk.stem import WordNetLemmatizer | |
| from tensorflow.keras.models import load_model #type:ignore | |
| from tensorflow.keras.utils import pad_sequences # type: ignore | |
# Fetch the NLTK corpora this module relies on, in a fixed order, every time
# the module is loaded.
for _nltk_resource in ('words', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(_nltk_resource)

# Shared text-processing state used by the cleaning helpers below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # English stop words
english_words = set(words.words())            # entries of the NLTK `words` corpus
def loadCustomDict(path):
    """Load a custom vocabulary file into a set of lowercase entries.

    Each line of the file is one entry. Surrounding whitespace is stripped,
    entries are lowercased, and blank lines are skipped.

    Args:
        path: Path to a text file with one entry per line.

    Returns:
        A set of the stripped, lowercased, non-empty lines.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so entries containing non-ASCII characters
    # are read the same way regardless of the platform's default locale.
    with open(path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return {entry.lower() for entry in stripped_lines if entry}
def normalizeWhitespace(text):
    """Normalize a raw poem string into lowercase, punctuation-free text.

    The text is NFKC-normalized and its contractions are expanded with
    `contractions.fix`; then a fixed sequence of regex substitutions is
    applied in order, and the result is stripped and lowercased.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    text = contractions.fix(unicodedata.normalize('NFKC', text))

    # Order matters: each substitution runs on the output of the previous one.
    substitutions = (
        (r'[\t\r]+', ' '),                # tabs / carriage returns -> one space
        (r'\b\d+\b', ''),                 # remove standalone numbers
        (r'[-‐‑‒–—―]+', ''),              # remove hyphen and dash characters
        (r'[_﹍﹎_]', ''),                 # remove underscore-like characters
        (r'[^\w\s]', ''),                 # remove remaining symbols / punctuation
        # Collapse a word repeated back-to-back; this runs before lowercasing,
        # so the comparison is case-sensitive.
        (r'\b(\w+)(?:\s+\1\b)+', r'\1'),
        (r'\s+', ' '),                    # squeeze whitespace runs to one space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip().lower()
def removeOtherLanguage(text):
    """Cut the text at ' translated' and drop words with non-ASCII characters.

    Everything from the first occurrence of the phrase ' translated' onward
    is discarded. Any word containing at least one non-ASCII character is
    then removed, whitespace runs are collapsed to single spaces, and the
    result is stripped and lowercased.

    Args:
        text: The input string.

    Returns:
        The filtered, lowercased string.
    """
    non_ascii_word = r'\b\w*[^\x00-\x7F]\w*\b'
    whitespace_run = r'\s+'

    head, marker, _tail = text.partition(' translated')
    if marker:
        # The phrase was found: keep only what precedes it.
        text = head.rstrip()

    without_foreign = re.sub(non_ascii_word, '', text)
    return re.sub(whitespace_run, ' ', without_foreign).strip().lower()
def removeNonEnglish(text_series, custom_dict):
    """Keep only lemmatized English, non-stop-word tokens in each text.

    Steps, applied to every element of `text_series`:
      1. Delete whole-word, case-insensitive matches of any `custom_dict`
         entry.
      2. Split on whitespace and keep tokens whose lowercase form is in
         `english_words` and not in `stop_words`.
      3. Lowercase and lemmatize each remaining token.
      4. Join the tokens back per original row; rows with no remaining
         tokens become ''.
      5. Collapse back-to-back repeated words (case-insensitive).

    Args:
        text_series: pandas Series of strings.
        custom_dict: Iterable of words to delete before filtering.

    Returns:
        A pandas Series of cleaned strings with the same index as
        `text_series`.
    """
    escaped_entries = [re.escape(entry) for entry in custom_dict]
    custom_pattern = r'\b(?:' + '|'.join(escaped_entries) + r')\b'
    without_custom = text_series.str.replace(
        custom_pattern, '', case=False, regex=True
    )

    # One token per row, keeping the original row label as the index.
    tokens = without_custom.str.split().explode()
    tokens = tokens[tokens.str.lower().isin(english_words)]
    tokens = tokens[~tokens.str.lower().isin(stop_words)]
    lemmas = tokens.apply(lambda token: lemmatizer.lemmatize(token.lower()))

    # Re-assemble per original row, then restore rows that lost every token.
    rejoined = lemmas.groupby(level=0).agg(' '.join)
    aligned = rejoined.reindex(text_series.index, fill_value='')

    repeated_word = r'\b(\w+)(?:\s+\1\b)+'
    return aligned.str.replace(repeated_word, r'\1', case=False, regex=True)
def cleanInference(df):
    """Run the full cleaning pipeline on the 'poem' column of `df`.

    The custom vocabulary is read from 'custom_vocab.txt', then the column
    is passed through `normalizeWhitespace`, `removeOtherLanguage` and
    `removeNonEnglish`, in that order. The 'poem' column of the given
    DataFrame is overwritten in place.

    Args:
        df: DataFrame with a 'poem' column of strings.

    Returns:
        The same DataFrame object, with 'poem' replaced by the cleaned text.
    """
    vocabulary = loadCustomDict('custom_vocab.txt')

    # Element-wise string cleaners, applied one after the other.
    for cleaner in (normalizeWhitespace, removeOtherLanguage):
        df['poem'] = df['poem'].apply(cleaner)

    # Series-level filter that needs the custom vocabulary.
    df['poem'] = removeNonEnglish(df['poem'], vocabulary)
    return df
def kerasTokenizer(text, tokenizer):
    """Turn texts into padded integer sequences for the model.

    Args:
        text: Collection of strings passed to `tokenizer.texts_to_sequences`.
        tokenizer: Object exposing `texts_to_sequences` (presumably a fitted
            Keras tokenizer — confirm against the pickled artifacts).

    Returns:
        The result of `pad_sequences` on the token sequences with
        `maxlen=128`.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text), maxlen=128)
def getLabelEncoder(name):
    """Return the index -> label mapping for a labelling technique.

    Labels are numbered by their position in alphabetical order.

    Args:
        name: One of 'hartmann', 'savani' or 'deepseek'.

    Returns:
        A dict mapping integer class index to label string, or None when
        `name` is not one of the known techniques.
    """
    label_sets = {
        'hartmann': ['sadness', 'fear', 'anger', 'joy', 'neutral', 'surprise', 'disgust'],
        'savani': ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise'],
        'deepseek': ['other', 'sadness', 'joy', 'hope', 'love'],
    }
    labels = label_sets.get(name)
    if labels is None:
        return None
    return dict(enumerate(sorted(labels)))
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code from the file, so
    # these artifacts must only ever come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Tokenizers, one per labelling technique.
tokenizer_savani = _load_pickle("tokenizer_savani_0.1_lstm.pkl")
tokenizer_hartman = _load_pickle("tokenizer_hartmann_0.1_lstm.pkl")
tokenizer_deepseek = _load_pickle("tokenizer_deepseek_0.1_lstm.pkl")

# Trained models. These calls use the Keras `load_model` imported at the top
# of the file; a function with the same name is defined further down and
# rebinds it, so these lines must stay above that definition.
model_savani = load_model("best_model_savani_0.1_lstm.keras")
model_hartman = load_model("best_model_hartmann_0.1_lstm.keras")
model_deepseek = load_model("best_model_deepseek_0.1_lstm.keras")
# Registry of the available labelling techniques: technique name ->
# {"model": ..., "tokenizer": ...}. Its keys also populate the UI dropdown.
MODELS = {
    technique: {"model": model, "tokenizer": tokenizer}
    for technique, model, tokenizer in (
        ("savani", model_savani, tokenizer_savani),
        ("hartmann", model_hartman, tokenizer_hartman),
        ("deepseek", model_deepseek, tokenizer_deepseek),
    )
}
# Cache of (tokenizer, model) pairs keyed by technique name.
loaded_models = {}


def load_model(model_name):
    """Return the (tokenizer, model) pair registered under `model_name`.

    The pair is taken from `MODELS` the first time a name is requested and
    served from `loaded_models` afterwards.

    Note: this definition rebinds the module-level name `load_model`, which
    was previously the Keras function imported at the top of the file.

    Args:
        model_name: Key of `MODELS`.

    Returns:
        A `(tokenizer, model)` tuple.

    Raises:
        KeyError: If `model_name` is not a key of `MODELS`.
    """
    pair = loaded_models.get(model_name)
    if pair is None:
        entry = MODELS[model_name]
        pair = (entry['tokenizer'], entry['model'])
        loaded_models[model_name] = pair
    return pair
def predict_poem(poem, model_name):
    """Predict the emotion label of a poem with the chosen technique.

    The poem is wrapped in a one-row DataFrame, cleaned with
    `cleanInference`, tokenized and padded with `kerasTokenizer`, and scored
    by the selected model. The index of the highest score is translated to a
    label via `getLabelEncoder`.

    Args:
        poem: The poem text.
        model_name: Key of `MODELS` selecting the tokenizer/model pair.

    Returns:
        The predicted label string.
    """
    tokenizer, model = load_model(model_name)

    cleaned = cleanInference(pd.DataFrame({'poem': [poem]}))
    padded = kerasTokenizer(cleaned['poem'], tokenizer)

    scores = model.predict(padded, verbose=0)
    best_indices = np.argmax(scores, axis=1)

    index_to_label = getLabelEncoder(model_name)
    # Only one poem was submitted, so the first row holds its prediction.
    return index_to_label[best_indices[0]]
# Gradio UI: a dropdown to pick the labelling technique (the keys of MODELS,
# defaulting to "savani"), a textbox for the poem, a "Classify" button, and a
# label showing the result, plus static Markdown help text.
# NOTE(review): indentation was lost in this copy of the file, so the exact
# nesting of the components inside the Row/Column containers cannot be
# determined from here — confirm the layout against the original source.
| with gr.Blocks(title="NLP Model Text Classifier") as demo: | |
| gr.Markdown("## 📜 Poem Emotion Classification") | |
| gr.Markdown(""" | |
| ### - **Step 1:** Select a labeling technique (model - each has different emotion labels) | |
| ### - **Step 2:** Enter your poem text | |
| ### - **Output:** Predicted emotion | |
| """) | |
# Input controls and the output label.
| with gr.Row(): | |
| with gr.Column(): | |
| model_selector = gr.Dropdown( | |
| choices=list(MODELS.keys()), | |
| value="savani", | |
| interactive=True, | |
| label="Select Labelling Technique Model" | |
| ) | |
| text_input = gr.Textbox( | |
| lines=5, | |
| placeholder="Enter text here...", | |
| label="Input Text", | |
| interactive=True | |
| ) | |
| submit_btn = gr.Button("Classify", variant="primary") | |
| with gr.Column(): | |
| output_label = gr.Label(label="Classification Results") | |
| gr.Markdown(""" | |
| **Poem References** | |
| - [Poem Hunter](https://www.poemhunter.com) | |
| - [Poem Generator](https://www.poem-generator.org.uk) | |
| - [HelloPoetry](https://hellopoetry.com) | |
| """) | |
| gr.Markdown(""" | |
| **Class Available for Each Labelling Model Technique** | |
| - **Hartmann**: ['sadness', 'fear', 'anger', 'joy', 'neutral', 'surprise', 'disgust'] | |
| - **Savani**: ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise'] | |
| - **Deepseek**: ['other', 'sadness', 'joy', 'hope', 'love'] | |
| """) | |
# Clicking the button calls predict_poem(text_input value, model_selector
# value) and displays its return value in output_label.
| submit_btn.click( | |
| fn=predict_poem, | |
| inputs=[text_input, model_selector], | |
| outputs=[output_label] | |
| ) | |
# Launch the app when this module is executed.
| demo.launch() |