# app.py — Airline tweet sentiment analysis demo (Gradio).
import gradio as gr
import pandas as pd
import numpy as np
import re
import string
import emoji
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download required NLTK data with error handling.
# NOTE: NLTK >= 3.8.2 loads the 'punkt_tab' resource for word_tokenize
# instead of 'punkt', so fetch both to stay compatible across versions.
_NLTK_RESOURCES = [
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
]
for _resource_path, _package in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # quiet=True suppresses download chatter; a failed download for an
        # unknown package (e.g. punkt_tab on old NLTK) does not raise.
        nltk.download(_package, quiet=True)
# Initialize components shared by the preprocessing pipeline below.
stemmer = PorterStemmer()  # used in preprocess_text to stem tokens
lemmatizer = WordNetLemmatizer()  # initialized but not used in the visible pipeline
stop_words = set(stopwords.words('english'))  # English stopword set for filtering
# EMOJIXT mapping (simplified): emoji character -> sentiment-bearing word.
# Used by preprocessing variant III to keep the emoji signal as plain text.
EMOJIXT_DICT = {
    # faces / emotions
    '😀': 'happy',
    '😂': 'laughing',
    '😊': 'smiling',
    '😍': 'love',
    '😢': 'sad',
    '😠': 'angry',
    '😭': 'crying',
    '😴': 'sleepy',
    '😎': 'cool',
    '🤔': 'thinking',
    '😱': 'shocked',
    '😘': 'kissing',
    '😇': 'angel',
    # gestures / symbols
    '❤️': 'love',
    '👍': 'thumbs_up',
    '👎': 'thumbs_down',
    '🙏': 'praying',
    '🎉': 'celebrating',
    '💯': 'perfect',
    '🔥': 'fire',
    '👏': 'clapping',
    '🙌': 'hooray',
    # travel
    '✈️': 'airplane',
    '💺': 'seat',
}
# Preprocessing functions
def basic_text_preprocessing(text, remove_emojis=False, replace_emojis_method=None):
    """Normalise a raw tweet into clean lowercase text.

    Strips URLs, @mentions, #hashtags and 'rt' markers, applies the
    chosen emoji strategy, then removes punctuation and collapses
    whitespace runs.

    Args:
        text: Raw input (coerced to str).
        remove_emojis: When True, delete emoji characters outright.
        replace_emojis_method: 'builtin' -> emoji.demojize descriptions,
            'custom' -> EMOJIXT_DICT sentiment words, None -> leave emojis.

    Returns:
        The cleaned, single-space-separated string.
    """
    cleaned = str(text).lower()

    # Strip Twitter noise: links, mentions, hashtags, retweet markers.
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'@\w+', '', cleaned)
    cleaned = re.sub(r'#\w+', '', cleaned)
    cleaned = re.sub(r'\brt\b', '', cleaned)

    # Emoji strategy: drop, describe, or map to sentiment words.
    if remove_emojis:
        cleaned = emoji.replace_emoji(cleaned, replace='')
    elif replace_emojis_method == 'builtin':
        cleaned = emoji.demojize(cleaned, delimiters=(" ", " "))
    elif replace_emojis_method == 'custom':
        for symbol, word in EMOJIXT_DICT.items():
            cleaned = cleaned.replace(symbol, f' {word} ')

    # Drop punctuation in one C-level pass, then collapse whitespace.
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(cleaned.split())
def preprocess_text(text, variant='I'):
    """Full preprocessing pipeline: clean, tokenize, de-stopword, stem.

    Args:
        text: Raw input text.
        variant: 'I' removes emojis, 'II' replaces them with built-in
            descriptions, 'III' replaces them with custom sentiment words;
            any other value leaves emojis untouched.

    Returns:
        A single space-joined string of stemmed, stopword-free tokens.
    """
    # Basic cleaning with the variant-specific emoji strategy.
    if variant == 'I':
        processed_text = basic_text_preprocessing(text, remove_emojis=True)
    elif variant == 'II':
        processed_text = basic_text_preprocessing(text, replace_emojis_method='builtin')
    elif variant == 'III':
        processed_text = basic_text_preprocessing(text, replace_emojis_method='custom')
    else:
        processed_text = basic_text_preprocessing(text)

    # Tokenize; fall back to whitespace splitting if NLTK fails (e.g.
    # punkt data missing).  `except Exception` instead of a bare except
    # keeps KeyboardInterrupt / SystemExit propagating as they should.
    try:
        tokens = word_tokenize(processed_text)
    except Exception:
        tokens = processed_text.split()

    # Drop stopwords, then stem what remains.
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)
# Model functions (simplified - in practice you'd load trained models)
def analyze_sentiment_tfidf(text):
    """Rule-based stand-in for the traditional ML (TF-IDF) classifier.

    Counts positive/negative keyword hits in the raw text and derives a
    sentiment label plus a heuristic confidence.  Confidence is reduced
    slightly when the text does not look airline-related.

    Args:
        text: Raw tweet text.

    Returns:
        Tuple of (sentiment, confidence): sentiment is one of
        'positive' / 'negative' / 'neutral'; confidence is in [0.5, 0.95].
    """
    # NOTE: the original also ran preprocess_text(text, variant='I') here
    # but never used the result; that dead call has been removed.
    text_lower = text.lower()

    # Keyword lexicons for the demo heuristic.
    positive_keywords = ['good', 'great', 'excellent', 'love', 'thanks', 'thank', 'awesome',
                         'amazing', 'best', 'perfect', 'happy', 'smooth', 'comfortable']
    negative_keywords = ['bad', 'terrible', 'worst', 'hate', 'awful', 'disappointed',
                         'delayed', 'canceled', 'rude', 'poor', 'problem', 'issue']
    airline_keywords = ['flight', 'airline', 'airport', 'luggage', 'baggage', 'seat',
                        'service', 'crew', 'pilot', 'staff', 'check-in', 'boarding']

    # Count hits (plain substring matching, not word-boundary aware).
    pos_count = sum(1 for word in positive_keywords if word in text_lower)
    neg_count = sum(1 for word in negative_keywords if word in text_lower)
    airline_related = any(word in text_lower for word in airline_keywords)

    # Majority keyword class wins; confidence grows with hits, capped at 0.95.
    if pos_count > neg_count:
        sentiment = "positive"
        confidence = min(0.7 + (pos_count * 0.05), 0.95)
    elif neg_count > pos_count:
        sentiment = "negative"
        confidence = min(0.7 + (neg_count * 0.05), 0.95)
    else:
        sentiment = "neutral"
        confidence = 0.6

    # Off-topic text is penalised, but never below 0.5.
    if not airline_related:
        confidence = max(confidence - 0.1, 0.5)

    return sentiment, confidence
def analyze_sentiment_bert(text):
    """Rule-based stand-in for the BERT classifier (demo only).

    Scores the raw text with weighted lexicons: strong keywords count
    +/-2, moderate keywords and emojis +/-1, then maps the total score
    to a sentiment label.

    Args:
        text: Raw tweet text.

    Returns:
        Tuple of (sentiment, confidence): sentiment is one of
        'positive' / 'negative' / 'neutral'; confidence is in [0.7, 0.98].
    """
    # NOTE: the original also ran preprocess_text(text, variant='II') here
    # but never used the result; that dead call has been removed.
    text_lower = text.lower()

    # Weighted keyword lexicons (substring matching, as in the TF-IDF demo).
    strong_positive = ['love', 'excellent', 'outstanding', 'perfect', 'amazing']
    moderate_positive = ['good', 'great', 'nice', 'pleasant', 'smooth']
    strong_negative = ['hate', 'terrible', 'awful', 'horrible', 'disgusting']
    moderate_negative = ['bad', 'poor', 'disappointed', 'frustrated', 'annoyed']

    # Emoji lexicons are checked against the original (uncased) text.
    positive_emojis = ['😀', '😂', '😊', '😍', '❤️', '👍', '🎉', '👏', '🙌']
    negative_emojis = ['😢', '😠', '😭', '👎', '😡']

    score = 0
    for word in strong_positive:
        if word in text_lower:
            score += 2
    for word in moderate_positive:
        if word in text_lower:
            score += 1
    for word in strong_negative:
        if word in text_lower:
            score -= 2
    for word in moderate_negative:
        if word in text_lower:
            score -= 1
    for emoji_char in positive_emojis:
        if emoji_char in text:
            score += 1
    for emoji_char in negative_emojis:
        if emoji_char in text:
            score -= 1

    # Map score to a label; |score| <= 1 is treated as neutral.
    if score > 1:
        sentiment = "positive"
        confidence = min(0.8 + (score * 0.02), 0.98)
    elif score < -1:
        sentiment = "negative"
        confidence = min(0.8 + (abs(score) * 0.02), 0.98)
    else:
        sentiment = "neutral"
        confidence = 0.7

    return sentiment, confidence
def count_emojis(text):
    """Return how many characters of *text* are emoji (per-codepoint)."""
    return sum(map(emoji.is_emoji, str(text)))
def create_visualizations(text):
    """Build the word-cloud image and emoji count for *text*.

    Generates a word cloud from the variant-I preprocessed text, falling
    back to lightly filtered raw words when fewer than three tokens
    survive preprocessing, and counts emojis in the original text.

    Args:
        text: Raw tweet text.

    Returns:
        Tuple of (wordcloud_file_path, emoji_count).
    """
    processed_text = preprocess_text(text, variant='I')

    # If preprocessing left too little, fall back to the raw words
    # (lowercased, keeping only words longer than 2 characters).
    if len(processed_text.split()) < 3:
        processed_text = ' '.join(
            word for word in text.lower().split() if len(word) > 2
        )

    # Render the cloud; use a placeholder message when nothing remains.
    if processed_text.strip():
        wordcloud = WordCloud(
            width=400,
            height=200,
            background_color='white',
            max_words=50,
            contour_width=1,
            contour_color='steelblue'
        ).generate(processed_text)
    else:
        # Placeholder cloud (typo fixed: was "No enough words ...").
        wordcloud = WordCloud(
            width=400,
            height=200,
            background_color='white'
        ).generate("Not enough words for word cloud")

    # Single save path for both branches (was duplicated).
    wordcloud_file = "wordcloud.png"
    wordcloud.to_file(wordcloud_file)

    return wordcloud_file, count_emojis(text)
# Gradio Interface
def analyze_tweet(tweet_text, analysis_method="Traditional ML (TF-IDF)", preprocessing_variant="I"):
    """Main analysis entry point for the Gradio interface.

    Args:
        tweet_text: Raw tweet text from the input box.
        analysis_method: One of the method dropdown choices.
        preprocessing_variant: 'I', 'II' or 'III'.

    Returns:
        Tuple of (result_markdown, wordcloud_file, emoji_count,
        visualization_markdown).  On error the file is None and the
        first element carries the message.
    """
    if not tweet_text.strip():
        return "Please enter some text to analyze.", None, 0, "Please enter text to see visualizations."

    try:
        # Dispatch on the selected method.  The explicit else fixes the
        # original bug where an unknown method left `sentiment` and
        # `method_used` unbound and surfaced as a confusing NameError.
        if analysis_method == "Traditional ML (TF-IDF)":
            sentiment, confidence = analyze_sentiment_tfidf(tweet_text)
            method_used = "Traditional ML with TF-IDF features"
        elif analysis_method == "BERT (Deep Learning)":
            sentiment, confidence = analyze_sentiment_bert(tweet_text)
            method_used = "BERT Transformer Model"
        else:
            return f"Unknown analysis method: {analysis_method}", None, 0, ""

        wordcloud_file, emoji_count = create_visualizations(tweet_text)

        variant_desc = {
            "I": "Remove emojis",
            "II": "Replace emojis with built-in descriptions",
            "III": "Replace emojis with custom sentiment words"
        }.get(preprocessing_variant, "Standard preprocessing")

        sentiment_emoji = {
            "positive": "😊",
            "negative": "😠",
            "neutral": "😐"
        }.get(sentiment, "🤔")

        # Only append an ellipsis when the tweet is actually truncated
        # (the original added "..." unconditionally).
        preview = tweet_text[:150] + ('...' if len(tweet_text) > 150 else '')

        result_text = f"""
### 📊 Sentiment Analysis Results {sentiment_emoji}
**Tweet:** {preview}
**Sentiment:** **{sentiment.upper()}** {sentiment_emoji}
**Confidence:** {confidence:.1%}
**Method:** {method_used}
**Preprocessing:** {variant_desc}
**Emoji Count:** {emoji_count} {'🎭' if emoji_count > 0 else ''}
**Detailed Analysis:**
- Text appears to convey **{sentiment}** sentiment
- Model confidence level: **{confidence:.1%}**
- Contains **{emoji_count}** emoji(s)
- Processed using variant **{preprocessing_variant}**
"""

        vis_description = f"""
### 📈 Visualizations
1. **Word Cloud** (right): Shows most frequent words after preprocessing
2. **Emoji Analysis:** Found {emoji_count} emoji(s) in the text
3. **Text Length:** {len(tweet_text)} characters, {len(tweet_text.split())} words
4. **Processing Variant:** {variant_desc}
"""
        return result_text, wordcloud_file, emoji_count, vis_description
    except Exception as e:
        return f"Error analyzing text: {str(e)}", None, 0, ""
# Create Gradio interface
# --- Gradio UI definition -------------------------------------------------
# Layout: an input column (textbox, method/variant dropdowns, analyze button)
# beside an examples column; below, a results column beside the emoji-count /
# word-cloud column.  analyze_tweet() is wired to the button click.
with gr.Blocks(title="Airline Sentiment Analysis", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ✈️ Airline Sentiment Analyzer")
    gr.Markdown("Analyze the sentiment of airline-related tweets using ML techniques from the research paper.")
    with gr.Row():
        with gr.Column(scale=2):
            # Main tweet input box.
            tweet_input = gr.Textbox(
                label="Enter your airline tweet",
                placeholder="e.g., '@VirginAmerica had a great flight today! ✈️👍'",
                lines=4
            )
            with gr.Row():
                # Model choice: rule-based TF-IDF demo vs BERT demo.
                method_dropdown = gr.Dropdown(
                    choices=["Traditional ML (TF-IDF)", "BERT (Deep Learning)"],
                    value="Traditional ML (TF-IDF)",
                    label="Analysis Method"
                )
                # Emoji preprocessing variant (explained in markdown below).
                variant_dropdown = gr.Dropdown(
                    choices=["I", "II", "III"],
                    value="I",
                    label="Preprocessing Variant"
                )
            gr.Markdown("**Variant I:** Remove emojis | **Variant II:** Replace with descriptions | **Variant III:** Replace with sentiment words")
            analyze_btn = gr.Button("Analyze Sentiment", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Example Tweets")
            # Clicking an example fills the tweet input box.
            gr.Examples(
                examples=[
                    ["@VirginAmerica What @dhepburn said. 👍"],
                    ["@VirginAmerica plus you've added commercials to the experience... tacky. 👎"],
                    ["@VirginAmerica I didn't today... Must mean I need to take another trip! ✈️"],
                    ["@VirginAmerica it's really aggressive to blast obnoxious entertainment in your guests faces 😠"],
                    ["@VirginAmerica and it's a really big bad thing about it 🔥"],
                    ["Loved the smooth flight and excellent service on my trip yesterday! 😊✈️"],
                    ["Flight delayed for 3 hours with no explanation. Worst airline experience. 😡"]
                ],
                inputs=[tweet_input],
                label="Try these examples"
            )
    with gr.Row():
        with gr.Column(scale=2):
            # Markdown panes for the analysis result and the visualization
            # summary text returned by analyze_tweet().
            result_output = gr.Markdown(label="Analysis Results")
            vis_description = gr.Markdown(label="Visualization Details")
        with gr.Column(scale=1):
            emoji_count = gr.Number(label="Number of Emojis Found")
            wordcloud_output = gr.Image(label="Word Cloud", type="filepath")
    # Set up button click: output order must match the 4-tuple returned
    # by analyze_tweet().
    analyze_btn.click(
        fn=analyze_tweet,
        inputs=[tweet_input, method_dropdown, variant_dropdown],
        outputs=[result_output, wordcloud_output, emoji_count, vis_description]
    )
    gr.Markdown("---")
    # Static "about" section rendered below the interface.
    gr.Markdown("""
### ℹ️ About This Tool
This sentiment analyzer implements techniques from the research paper on airline sentiment analysis:
**Features:**
- Three preprocessing variants for emoji handling
- Traditional ML models (TF-IDF + classifiers)
- Deep learning with BERT
- Word cloud visualization
- Emoji analysis
**Preprocessing Variants:**
1. **Variant I**: Remove all emojis
2. **Variant II**: Replace emojis with built-in descriptions
3. **Variant III**: Replace emojis with custom sentiment words
**Methods Available:**
1. **Traditional ML**: Uses TF-IDF features with classifiers
2. **BERT**: State-of-the-art transformer model
**Note:** This is a demo version showing the preprocessing pipeline. For production, models would be trained on the full airline sentiment dataset.
""")
# Launch the app only when run as a script (not when imported, e.g. by tests
# or by a hosting platform that mounts `demo` itself).
if __name__ == "__main__":
    demo.launch()