"""Gradio app that analyzes YouTube comments for sentiment and sarcasm.

Given a YouTube video URL, the app fetches the video's title/category and up
to 100 top-level comments from the YouTube Data API v3, scores every comment
with VADER (polarity) and a BERT sarcasm classifier, derives a
category-specific sentiment label and renders summary plots.
"""
import logging
import os
import re

import emoji
import gradio as gr
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
import seaborn as sns
import sentence_transformers
import torch
from googleapiclient.discovery import build
from nltk import ne_chunk, pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.tree import Tree
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# NOTE: this import deliberately comes after the nltk one above. Both export a
# class with the same name and the later binding wins, so `sia` below is the
# vaderSentiment implementation (as in the original import order).
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # noqa: F811

logger = logging.getLogger(__name__)

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Initialize the SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Load the Sarcasm Detection model
sarcasm_tokenizer = AutoTokenizer.from_pretrained("jkhan447/sarcasm-detection-Bert-base-uncased")
sarcasm_model = AutoModelForSequenceClassification.from_pretrained("jkhan447/sarcasm-detection-Bert-base-uncased")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sarcasm_model.to(device)

# Load SentenceTransformer model
# NOTE(review): nothing in this file uses the model any more (the only call
# discarded its result); it is still loaded so the module-level name keeps
# existing. Consider removing it to save start-up time and memory.
sentence_transformer_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# SECURITY: an API key was committed to source control. Supply the key through
# the YOUTUBE_API_KEY environment variable; the literal fallback is kept only
# so existing deployments keep working and should be rotated and removed.
api_key = os.environ.get("YOUTUBE_API_KEY") or "AIzaSyDOw_v-T58ATLOmQjF00k5Mjha6VPQ-TAk"

YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
REQUEST_TIMEOUT_SECONDS = 10
SARCASM_BATCH_SIZE = 32
SARCASM_THRESHOLD = 0.5
POLARITY_THRESHOLD = 0.05

# `v=<id>` query form first (original behaviour), then the path-based forms.
_VIDEO_ID_PATTERNS = (
    re.compile(r"v=([a-zA-Z0-9_-]{11})"),
    re.compile(r"(?:youtu\.be/|/shorts/|/embed/|/live/)([a-zA-Z0-9_-]{11})"),
)

# YouTube `categoryId` -> human readable category name.
YOUTUBE_CATEGORIES = {
    "1": "Film & Animation",
    "2": "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "18": "Short Movies",
    "19": "Travel & Events",
    "20": "Gaming",
    "21": "Videoblogging",
    "22": "People & Blogs",
    "23": "Comedy",
    "24": "Entertainment",
    "25": "News & Politics",
    "26": "Howto & Style",
    "27": "Education",
    "28": "Science & Technology",
    "29": "Nonprofits & Activism",
    "30": "Movies",
    "31": "Anime/Animation",
    "32": "Action/Adventure",
    "33": "Classics",
    "34": "Comedy",
    "35": "Documentary",
    "36": "Drama",
    "37": "Family",
    "38": "Foreign",
    "39": "Horror",
    "40": "Sci-Fi/Fantasy",
    "41": "Thriller",
    "42": "Shorts",
    "43": "Shows",
    "44": "Trailers",
}

# category -> (label for positive polarity, label for negative polarity)
_CATEGORY_LABELS = {
    "Comedy": ("Funny/Enjoyable", "Unfunny/Criticism"),
    "Education": ("Helpful/Informative", "Confusing/Criticism"),
    "Music": ("Enjoyed", "Criticism/Disliked"),
    "Entertainment": ("Entertained", "Bored/Criticism"),
}
_DEFAULT_LABELS = ("Positive", "Negative")


def extract_video_id(url):
    """Return the 11-character video id found in *url*, or None.

    Recognises the classic ``...watch?v=<id>`` form as well as ``youtu.be/<id>``,
    ``/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>`` links. Empty or missing
    input yields None instead of raising.
    """
    if not url:
        return None
    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None


def _youtube_get(resource, **params):
    """GET a YouTube Data API v3 resource and return the decoded JSON dict.

    Network failures, undecodable bodies and non-dict payloads are logged and
    reported as an empty dict so callers can treat them as "no items".
    """
    try:
        response = requests.get(
            f"{YOUTUBE_API_BASE}/{resource}",
            params={**params, "key": api_key},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        logger.warning("YouTube API request for %s failed: %s", resource, exc)
        return {}
    if not isinstance(payload, dict):
        return {}
    if "error" in payload:
        # Error payloads (quota exceeded, comments disabled, bad key, ...)
        # carry no "items" key; surface the reason in the logs.
        logger.warning("YouTube API error for %s: %s", resource, payload["error"])
    return payload


def get_video_details(video_id):
    """Return ``(title, category_id)`` for *video_id*, or ``(None, None)``.

    ``(None, None)`` is returned when the API yields no items for the id,
    including when the request itself fails.
    """
    items = _youtube_get("videos", part="snippet", id=video_id).get("items")
    if not items:
        return None, None
    snippet = items[0]["snippet"]
    return snippet["title"], snippet["categoryId"]


def get_comments(video_id):
    """Return the text of up to 100 top-level comments, ordered by relevance.

    An empty list is returned when the API response has no ``items`` (for
    example when comments are disabled or the request fails).
    """
    payload = _youtube_get(
        "commentThreads",
        part="snippet",
        videoId=video_id,
        maxResults=100,
        order="relevance",
    )
    return [
        item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
        for item in payload.get("items", [])
    ]


def sentiment_scores(comment_text):
    """Return the VADER compound polarity score of *comment_text*."""
    return sia.polarity_scores(comment_text)['compound']


def detect_sarcasm_batch(comments):
    """Return one sarcasm probability per comment in *comments*.

    The score is the softmax probability of class index 1 of the sarcasm
    model. An empty batch returns an empty list without invoking the model.
    """
    if not comments:
        return []
    inputs = sarcasm_tokenizer(comments, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = sarcasm_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return probs[:, 1].tolist()


def get_sentiment_label(row):
    """Map a scored comment row to its display label.

    *row* must provide ``polarity``, ``sarcasm_score`` and ``category``.
    Sarcasm takes precedence; otherwise the polarity sign selects a
    category-specific positive/negative label, with "Neutral" in between.
    """
    if row['sarcasm_score'] > SARCASM_THRESHOLD:
        return "Sarcastic"
    positive_label, negative_label = _CATEGORY_LABELS.get(row['category'], _DEFAULT_LABELS)
    polarity = row['polarity']
    if polarity > POLARITY_THRESHOLD:
        return positive_label
    if polarity < -POLARITY_THRESHOLD:
        return negative_label
    return "Neutral"


def extract_keywords(comments_for_video_df):
    """Return a bar-chart figure of the top keywords in the comments.

    Keywords are the (at most 20) most frequent non-stop-word terms; the bar
    length is the term's mean TF-IDF score across all comments. Returns None
    when no usable vocabulary can be built (e.g. only stop words or emoji).
    """
    tfidf = TfidfVectorizer(stop_words='english', max_features=20)
    try:
        tfidf_matrix = tfidf.fit_transform(comments_for_video_df['comment_text'])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary.
        return None

    keyword_importance_df = pd.DataFrame({
        'keyword': tfidf.get_feature_names_out(),
        # Mean TF-IDF per term; the raw `idf_` only measures rarity and would
        # rank the least common words highest.
        'importance': tfidf_matrix.toarray().mean(axis=0),
    }).sort_values('importance', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(y='keyword', x='importance', hue='keyword', data=keyword_importance_df,
                palette='pastel', legend=False, ax=ax)
    ax.set_title('Top Keywords in Comments')
    ax.set_xlabel('TF-IDF Importance')
    ax.set_ylabel('Keyword')
    fig.tight_layout()
    # Drop pyplot's global reference so figures do not pile up in a
    # long-running server; the Figure object itself stays renderable.
    plt.close(fig)
    return fig


def analyze_video_sentiment(video_url):
    """Fetch and score the comments of the video behind *video_url*.

    Returns a 5-tuple ``(full_df, top_10_df, title, category, keyword_plot)``.
    On failure the first element is a one-row DataFrame with an ``Error``
    column and the remaining four elements are None.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return pd.DataFrame({"Error": ["Invalid YouTube URL."]}), None, None, None, None

    video_title, category_id = get_video_details(video_id)
    category = YOUTUBE_CATEGORIES.get(category_id, "Unknown Category")

    comments = get_comments(video_id)
    if not comments:
        return pd.DataFrame({"Error": ["No comments found."]}), None, None, None, None

    comments_for_video_df = pd.DataFrame(comments, columns=["comment_text"])
    comments_for_video_df['polarity'] = comments_for_video_df['comment_text'].apply(sentiment_scores)

    # Score sarcasm in fixed-size batches to bound memory use on the model.
    sarcasm_scores = []
    for start in range(0, len(comments), SARCASM_BATCH_SIZE):
        sarcasm_scores.extend(detect_sarcasm_batch(comments[start:start + SARCASM_BATCH_SIZE]))
    comments_for_video_df['sarcasm_score'] = sarcasm_scores

    # Assign the correct category to each comment
    comments_for_video_df['category'] = category
    comments_for_video_df['Prominent sentiment'] = comments_for_video_df.apply(get_sentiment_label, axis=1)

    keyword_plot = extract_keywords(comments_for_video_df)

    # Analyze all comments but display only the top 10 comments based on relevance
    top_10_comments = comments_for_video_df[['comment_text', 'Prominent sentiment']].head(10)
    return comments_for_video_df, top_10_comments, video_title, category, keyword_plot


def plot_sentiment_distribution(df):
    """Return a bar chart of comment counts per sentiment label.

    Returns None when *df* has no ``Prominent sentiment`` column.
    """
    if 'Prominent sentiment' not in df.columns:
        return None

    sentiment_counts = df['Prominent sentiment'].value_counts().reset_index()
    sentiment_counts.columns = ['Sentiment', 'Comment Count']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Sentiment', y='Comment Count', hue='Sentiment', data=sentiment_counts,
                palette="pastel", legend=False, ax=ax)
    ax.set_title('Number of Comments by Sentiment', fontsize=14)
    ax.set_xlabel('Sentiment', fontsize=12)
    ax.set_ylabel('Number of Comments', fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.close(fig)  # release pyplot's reference; the figure stays renderable
    return fig


def plot_sarcasm_vs_polarity(df):
    """Return a scatter plot of polarity against sarcasm score.

    Points are coloured by ``Prominent sentiment``. Returns None when *df*
    lacks the ``polarity`` or ``sarcasm_score`` column.
    """
    if 'polarity' not in df.columns or 'sarcasm_score' not in df.columns:
        return None

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x='polarity', y='sarcasm_score', hue='Prominent sentiment', data=df,
                    palette="pastel", ax=ax)
    ax.set_title('Polarity vs. Sarcasm Score', fontsize=14)
    ax.set_xlabel('Polarity Score', fontsize=12)
    ax.set_ylabel('Sarcasm Score', fontsize=12)
    fig.tight_layout()
    plt.close(fig)  # release pyplot's reference; the figure stays renderable
    return fig


def gradio_interface(video_url):
    """Gradio callback: analyze *video_url* and return the six UI outputs.

    Output order: top-comments table, sentiment plot, sarcasm plot, keyword
    plot, insights markdown, video-details markdown. On failure the table
    shows the error frame and the insights text carries the reason.
    """
    full_df, top_comments, video_title, category, keyword_plot = analyze_video_sentiment(video_url)

    if not category:
        # `top_comments` is None here; the error message lives in `full_df`.
        message = "No insights available."
        if 'Error' in full_df.columns and len(full_df):
            message = f"{message} {full_df['Error'].iloc[0]}"
        return full_df, None, None, None, message, None

    sentiment_plot = plot_sentiment_distribution(full_df)
    sarcasm_plot = plot_sarcasm_vs_polarity(full_df)
    insights = f"**Title:** {video_title}\n\n**Category:** {category}"
    return top_comments, sentiment_plot, sarcasm_plot, keyword_plot, insights, insights


with gr.Blocks(theme=gr.themes.Monochrome()) as demo:  # Dark theme applied
    gr.Markdown(
        """
        # 🎥 YouTube Sentiment Analysis
        Enter a YouTube video URL below to analyze the comments for sentiment and sarcasm
        """
    )

    with gr.Row():
        video_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter a YouTube video URL here...")
        analyze_button = gr.Button("Analyze", variant="primary", elem_id="analyze-btn")

    video_details = gr.Markdown(label="Video Details", elem_id="video-details-box")

    with gr.Accordion("Top 10 Comments", open=False):
        comment_text = gr.Dataframe(label="Top 10 Comments", interactive=False)

    sentiment_graph = gr.Plot(label="Sentiment Distribution")
    sarcasm_graph = gr.Plot(label="Sarcasm vs Polarity")
    keyword_graph = gr.Plot(label="Top Keywords")
    insights_box = gr.Markdown(label="Insights", elem_id="insights-box")

    analyze_button.click(
        gradio_interface,
        inputs=video_input,
        outputs=[comment_text, sentiment_graph, sarcasm_graph, keyword_graph, insights_box, video_details],
    )

    # Custom CSS for improved styling (currently an empty placeholder)
    gr.HTML(""" """)

demo.launch(debug=True)