# Source: Hugging Face Space "YAMITEK/Text_Classification_using_RNN"
# Space status at capture time: Build error
# File size: 7,029 Bytes
# Commit: 4a75943

import streamlit as st
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
from PIL import Image
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence



st.set_page_config(layout="centered")

# Inject page-wide CSS: background image with a translucent white overlay,
# plus readability tweaks for dataframes, headings, select boxes, captions
# and text areas. Disabled declarations are wrapped in /* ... */ because
# "#" is not a CSS comment marker and would leave invalid declarations
# in the stylesheet.
st.markdown("""
    <style>
        .stApp {
            background-image: url("");
            background-size: cover;
            background-position: center;
            background-repeat: no-repeat;
            height: auto;  /* Allows the page to expand for scrolling */
            overflow: auto;  /* Enables scrolling if the page content overflows */
            /* position: relative; */
        }

        /* Adjust opacity of overlay to make content more visible */
        .stApp::before {
            content: "";
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background-color: rgba(255, 255, 255, 0.8);  /* Slightly higher opacity */
            z-index: -1;
        }

        /* Ensure content appears above the overlay */
        .stApp > * {
            position: relative;
            z-index: 2;
        }

        /* Ensure the dataframe is visible */
        .dataframe {
            background-color: rgba(255, 255, 255, 0.9) !important;
            z-index: 3;
        }

        /* Style text elements for better visibility */
        h1, h3, span, div {
            text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.2);
        }

        /* Custom CSS for select box heading */
        div.stSelectbox > label {
            color: #000000 !important;  /* Change to your desired color */
            /* background-color: black !important; */  /* Background color of the dropdown */
            font-size: 24px !important;  /* Change font size */
            font-weight: bold !important;  /* Make text bold */
        }

        /* Custom CSS for image caption */
        .custom-caption {
            color: #000000 !important;  /* Change to your desired color */
            font-size: 24px !important;  /* Optional: Change font size */
            text-align: center;  /* Center-align the caption */
        }

        .stMainBlockContainer {
            background-color: white !important;  /* Background color of the dropdown */
        }

        .stTextArea{
            color: #000000 !important
        }

    </style>
""", unsafe_allow_html=True)


# Custom title styling functions
def colored_title(text, color):
    """Render ``text`` as a level-1 HTML heading in the CSS colour ``color``."""
    heading_html = f"<h1 style='color: {color};'>{text}</h1>"
    # Raw HTML is required for the inline style, hence unsafe_allow_html.
    st.markdown(heading_html, unsafe_allow_html=True)

def colored_subheader(text, color):
    """Render ``text`` as a level-3 HTML heading in the CSS colour ``color``."""
    subheading_html = f"<h3 style='color: {color};'>{text}</h3>"
    # Raw HTML is required for the inline style, hence unsafe_allow_html.
    st.markdown(subheading_html, unsafe_allow_html=True)

def colored_text(text, color):
    """Render ``text`` as an inline HTML span in the CSS colour ``color``."""
    span_html = f"<span style='color: {color};'>{text}</span>"
    # Raw HTML is required for the inline style, hence unsafe_allow_html.
    st.markdown(span_html, unsafe_allow_html=True)


# Hyperparameters handed to RNNModel when load_model() builds the network.
# NOTE(review): presumably these must match the values used when the saved
# weights file was trained — confirm before changing any of them.
embedding_dim = 128                        # size of each token embedding vector
hidden_units = 128                         # size of the RNN hidden state
num_classes = 3                             # number of outputs of the final linear layer

class RNNModel(nn.Module):
    """Single-layer ``nn.RNN`` text classifier.

    Token ids are embedded, fed through one recurrent layer, and the output
    at the final timestep is projected to ``num_classes`` scores which are
    normalised with softmax.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (the RNN input size).
        hidden_units: Size of the RNN hidden state.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # No ``dropout`` argument here: nn.RNN only applies dropout between
        # stacked layers, so on a single-layer RNN it did nothing except
        # raise a UserWarning. Parameter names and shapes are unchanged, so
        # existing state dicts still load.
        self.rnn = nn.RNN(embedding_dim, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Return class probabilities for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` whose rows sum to 1.
        """
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        # Only the last timestep's output is used as the sequence summary.
        last_step = output[:, -1, :]
        logits = self.fc(last_step)
        return F.softmax(logits, dim=1)

@st.cache_resource
def load_model(vocab_size):
    """Build the RNN classifier and load its saved weights onto the CPU.

    Args:
        vocab_size: Size of the embedding table to construct.

    Returns:
        The model switched to eval mode, or ``None`` if loading the weights
        failed (the error is reported to the page via ``st.error``).
    """
    net = RNNModel(vocab_size, embedding_dim, hidden_units, num_classes)
    try:
        weights = torch.load(
            'rnn_classification_model_weights.pth',
            map_location=torch.device('cpu'),
        )
        net.load_state_dict(weights)
        net.eval()
    except Exception as e:
        # Surface the failure in the UI and let the caller skip prediction.
        st.error(f"Error loading model: {str(e)}")
        return None
    return net

@st.cache_data
def load_data():
    """Load the CSV corpus and encode it as a left-padded token-id tensor.

    Reads ``alldata_1_for_kaggle.csv``, builds a frequency-ranked word-level
    vocabulary from the lower-cased texts, encodes every text with a
    ``WordLevel`` tokenizer, and left-pads all sequences with id 0 to the
    length of the longest one.

    Returns:
        tuple: ``(X_pad, texts, class_mapping, vocab_size)`` where

        * ``X_pad`` is an integer tensor of shape ``(num_texts, longest_len)``,
        * ``texts`` is the array of raw text values from the ``text`` column,
        * ``class_mapping`` maps a position index to the label (as ``str``),
          in order of first appearance in the ``labels`` column,
        * ``vocab_size`` is the number of entries in the tokenizer vocabulary.
    """
    df = pd.read_csv("alldata_1_for_kaggle.csv", encoding='latin1')
    df = df.rename(columns={'0': 'labels', 'a': 'text'})
    texts = df['text'].values

    # NOTE(review): unique() returns labels in order of first appearance, not
    # sorted order. Presumably this must match the label encoding used when
    # the model was trained — confirm.
    class_mapping = {
        index: f"{label}" for index, label in enumerate(df['labels'].unique())
    }

    # Count words on a plain lower-cased whitespace split.
    # NOTE(review): encoding below uses the Whitespace pre-tokenizer, which
    # may split differently from str.split(); kept as-is because the ids
    # presumably have to match those seen during training — confirm.
    word_counts = Counter()
    for text in texts:
        word_counts.update(text.lower().split())

    # Ids 0 and 1 are reserved; remaining words are ranked by frequency so
    # the most common words get the lowest ids.
    vocab = {"<pad>": 0, "<unk>": 1}
    vocab.update(
        {word: rank + 2 for rank, (word, _) in enumerate(word_counts.most_common())}
    )

    tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
    tokenizer.pre_tokenizer = Whitespace()

    # Explicit integer dtype: torch.tensor([]) would otherwise be float32 for
    # a text that encodes to no tokens, which could change the dtype of the
    # padded batch.
    sequences = [
        torch.tensor(tokenizer.encode(text.lower()).ids, dtype=torch.long)
        for text in texts
    ]

    # pad_sequence pads on the right, so reverse each sequence, pad, and flip
    # the result back to end up with padding on the left.
    reversed_padded = pad_sequence(
        [seq.flip(0) for seq in sequences], batch_first=True, padding_value=0
    )
    X_train_pad = reversed_padded.flip(1)

    vocab_size = len(tokenizer.get_vocab())

    return X_train_pad, texts, class_mapping, vocab_size


def main():
    """Render the page: choose a dataset row, show its text, classify on demand."""
    colored_title("Text Classification using RNN", "black")

    # Padded id tensor, raw texts, index-to-label mapping and vocabulary size.
    X_test, texts, class_mapping, vocab_size = load_data()

    # Row picker followed by a read-only preview of the chosen text.
    colored_subheader("Select a Row for Prediction:", "black")
    row_choices = range(len(texts))
    selected_index = st.selectbox("Select a row", options=row_choices, index=0)

    colored_text("Selected Text:", "black")
    st.text_area("Text Content:", value=texts[selected_index], height=150, disabled=True)

    # Nothing more to do until the user asks for a prediction.
    if not st.button("Predict"):
        return

    # load_model returns None when the weights could not be loaded.
    model = load_model(vocab_size)
    if model is None:
        return

    with torch.no_grad():
        # Index with a list so the selected row keeps its batch dimension.
        batch = X_test[[selected_index]]
        output = model(batch)
        predicted_class = torch.argmax(output, dim=1).item()

    # Show the label associated with the highest-scoring class index.
    colored_subheader("Prediction Results:", "green")
    colored_text(f"Predicted Class: {class_mapping[predicted_class]}", "green")
    

            
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()