"""Train an LSTM crime-report classifier on article titles and serve it with Gradio.

Reads 'CrimeVsNoCrimeArticles.csv' (expected columns: 'title',
'is_crime_report'), preprocesses each title (lowercase, strip non-letters,
drop English stopwords), trains a small Embedding -> LSTM -> sigmoid binary
classifier on the full dataset, and exposes a text-in/text-out web UI.
"""

import re

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Hyperparameters hoisted out of inline magic numbers so the tokenizer,
# padding, and Embedding layer cannot silently drift out of sync.
VOCAB_SIZE = 10000
MAX_LEN = 10
EMBED_DIM = 16
LSTM_UNITS = 32
EPOCHS = 50

# quiet=True: skip the download progress spam when the corpus is already cached.
nltk.download('stopwords', quiet=True)

stop_word = set(stopwords.words('english'))
word_tokenizer = TreebankWordTokenizer()


def preprocess(text):
    """Return *text* lowercased, stripped of non-letters, and stopword-filtered.

    The result is a single space-joined string ready for Keras tokenization.
    """
    text = text.lower()
    # Keep only a-z and whitespace; digits/punctuation are discarded.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenizer.tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_word)


def build_model():
    """Create and compile the Embedding -> LSTM -> sigmoid binary classifier."""
    model = Sequential([
        Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM),
        LSTM(LSTM_UNITS),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# --- Data loading and training -------------------------------------------
# Kept at module level so the Gradio handler below can close over the fitted
# tokenizer and trained model.
df = pd.read_csv('CrimeVsNoCrimeArticles.csv')
titles = np.array(df['title'].to_list())
labels = np.array(df['is_crime_report'].to_list())

preprocess_doc = [preprocess(doc) for doc in titles]

# NOTE(review): oov_token='' maps every out-of-vocabulary word to an
# empty-string token; the conventional choice is '<OOV>'. Left as-is because
# the model is trained against this exact mapping — confirm intent.
num_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='')
num_tokenizer.fit_on_texts(preprocess_doc)
seq = num_tokenizer.texts_to_sequences(preprocess_doc)
padded_seq = pad_sequences(seq, maxlen=MAX_LEN, padding='post')

model = build_model()
# Trains on the full dataset with no validation split — fine for a demo;
# consider a held-out split before trusting the reported accuracy.
model.fit(padded_seq, labels, epochs=EPOCHS)


def pre_sentiment(user_input):
    """Classify one title; return 'CRIMINAL'/'NOT CRIMINAL' plus the raw score."""
    user = preprocess(user_input)
    seq_input = num_tokenizer.texts_to_sequences([user])
    padded_input = pad_sequences(seq_input, maxlen=MAX_LEN, padding='post')
    prediction = model.predict(padded_input).item()
    result = 'CRIMINAL' if prediction >= 0.5 else 'NOT CRIMINAL'
    return f'{result} - Score: {prediction}'


demo = gr.Interface(
    fn=pre_sentiment,
    inputs='text',
    outputs='text',
    # Corrected label: this is a crime-report classifier, not sentiment analysis.
    title='Crime Report Classifier',
)

if __name__ == '__main__':
    # Guard so importing this module does not start the web server.
    demo.launch()