Spaces:

Aenuh
/

Youtube_Sentiment_Analysis

Build error

App Files Files Community

Youtube_Sentiment_Analysis / app.py

Aenuh

Update app.py

909cc87 verified over 1 year ago

raw

history blame contribute delete

10.9 kB

	import gradio as gr
	import pandas as pd
	from nltk.sentiment import SentimentIntensityAnalyzer
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch
	import requests
	import re
	import sentence_transformers
	from sentence_transformers import SentenceTransformer
	from sklearn.feature_extraction.text import TfidfVectorizer
	import matplotlib.pyplot as plt
	import seaborn as sns
	import nltk
	from nltk.tokenize import word_tokenize
	from nltk import pos_tag, ne_chunk
	from nltk.tree import Tree
	from googleapiclient.discovery import build
	import emoji
	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



	# Download the NLTK resources requested by this app at start-up.
	for _resource in (
	    'vader_lexicon',
	    'punkt',
	    'averaged_perceptron_tagger',
	    'maxent_ne_chunker',
	    'words',
	):
	    nltk.download(_resource)

	# Shared analyzer used by sentiment_scores().
	# NOTE: SentimentIntensityAnalyzer is imported twice at the top of the file
	# (nltk.sentiment, then vaderSentiment); the later import is the one bound here.
	sia = SentimentIntensityAnalyzer()

	# Choose the compute device up front: CUDA when torch reports it, otherwise CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	# Sarcasm classifier and its tokenizer, both loaded from the same checkpoint.
	_SARCASM_CHECKPOINT = "jkhan447/sarcasm-detection-Bert-base-uncased"
	sarcasm_tokenizer = AutoTokenizer.from_pretrained(_SARCASM_CHECKPOINT)
	sarcasm_model = AutoModelForSequenceClassification.from_pretrained(_SARCASM_CHECKPOINT)
	sarcasm_model.to(device)

	# Sentence-embedding model.
	sentence_transformer_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

	import os

	# YouTube Data API v3 key used by get_video_details() and get_comments().
	# The YOUTUBE_API_KEY environment variable takes precedence so the secret can
	# be supplied by the deployment instead of living in source control.
	# NOTE(security): the literal fallback is kept only so existing deployments
	# keep working; it is already public and should be rotated and removed.
	api_key = os.environ.get("YOUTUBE_API_KEY", "AIzaSyDOw_v-T58ATLOmQjF00k5Mjha6VPQ-TAk")

	def extract_video_id(url):
	    """Return the 11-character YouTube video ID contained in *url*, or None.

	    Recognised forms are the ``v=<id>`` query parameter (as before) plus the
	    ``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>`` and ``/live/<id>``
	    path styles. An empty or ``None`` input yields ``None``.
	    """
	    if not url:
	        return None
	    match = re.search(
	        r"(?:v=|youtu\.be/|/embed/|/shorts/|/live/)([a-zA-Z0-9_-]{11})",
	        url,
	    )
	    return match.group(1) if match else None

	def get_video_details(video_id):
	    """Fetch a video's title and category ID from the YouTube Data API v3.

	    Returns:
	        ``(title, categoryId)`` taken from the first item's ``snippet``, or
	        ``(None, None)`` when the request fails, the body is not JSON, or
	        the response carries no ``items`` (e.g. an API error payload).
	    """
	    try:
	        response = requests.get(
	            "https://www.googleapis.com/youtube/v3/videos",
	            # Let requests build and escape the query string.
	            params={"part": "snippet", "id": video_id, "key": api_key},
	            timeout=10,  # never hang the UI on a stalled connection
	        )
	        payload = response.json()
	    except (requests.RequestException, ValueError):
	        return None, None

	    # Error responses have no "items" key; treat them like "not found".
	    items = payload.get("items") if isinstance(payload, dict) else None
	    if not items:
	        return None, None

	    snippet = items[0]["snippet"]
	    return snippet["title"], snippet["categoryId"]

	def get_comments(video_id):
	    """Return up to 100 top-level comment texts for *video_id*, by relevance.

	    Uses the ``commentThreads`` endpoint of the YouTube Data API v3 and reads
	    ``textOriginal`` from each thread's top-level comment. Returns an empty
	    list when the request fails, the body is not JSON, or the response has
	    no ``items`` (e.g. an API error payload).
	    """
	    try:
	        response = requests.get(
	            "https://www.googleapis.com/youtube/v3/commentThreads",
	            params={
	                "part": "snippet",
	                "videoId": video_id,
	                "key": api_key,
	                "maxResults": 100,
	                "order": "relevance",
	            },
	            timeout=10,  # never hang the UI on a stalled connection
	        )
	        payload = response.json()
	    except (requests.RequestException, ValueError):
	        return []

	    # Error responses have no "items" key; the caller treats [] as "no comments".
	    items = payload.get("items", []) if isinstance(payload, dict) else []
	    return [
	        item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
	        for item in items
	    ]

	def sentiment_scores(comment_text):
	    """Return the 'compound' polarity score that ``sia`` reports for *comment_text*."""
	    return sia.polarity_scores(comment_text)['compound']

	def detect_sarcasm_batch(comments):
	    """Score a batch of comments with the sarcasm classifier.

	    The comments are tokenized together (padded and truncated), run through
	    ``sarcasm_model`` without gradient tracking, and the softmax probability
	    in column 1 of the logits is returned as one float per comment.
	    """
	    encoded = sarcasm_tokenizer(
	        comments,
	        return_tensors="pt",
	        truncation=True,
	        padding=True,
	    ).to(device)

	    with torch.no_grad():
	        logits = sarcasm_model(**encoded).logits

	    class_probs = torch.nn.functional.softmax(logits, dim=-1)
	    # Column 1 is the value this app treats as the sarcasm score.
	    return class_probs[:, 1].tolist()

	def get_sentiment_label(row):
	    """Map one comment row to a human-readable sentiment label.

	    *row* must provide 'polarity', 'sarcasm_score' and 'category'. A sarcasm
	    score above 0.5 always yields "Sarcastic". Otherwise polarity above 0.05
	    gives the category's positive label, below -0.05 its negative label, and
	    anything in between "Neutral". Categories without a dedicated pair fall
	    back to "Positive"/"Negative".
	    """
	    polarity = row['polarity']
	    sarcasm_score = row['sarcasm_score']
	    category = row['category']

	    if sarcasm_score > 0.5:
	        return "Sarcastic"

	    # (positive label, negative label) per video category.
	    labels_by_category = {
	        "Comedy": ("Funny/Enjoyable", "Unfunny/Criticism"),
	        "Education": ("Helpful/Informative", "Confusing/Criticism"),
	        "Music": ("Enjoyed", "Criticism/Disliked"),
	        "Entertainment": ("Entertained", "Bored/Criticism"),
	    }
	    positive_label, negative_label = labels_by_category.get(
	        category, ("Positive", "Negative")
	    )

	    if polarity > 0.05:
	        return positive_label
	    if polarity < -0.05:
	        return negative_label
	    return "Neutral"

	def extract_keywords(comments_for_video_df):
	    """Plot the top TF-IDF vocabulary terms found in the comments.

	    Fits a ``TfidfVectorizer`` (English stop words removed, at most 20
	    features) on the 'comment_text' column and draws a horizontal bar chart
	    of each term against its fitted ``idf_`` weight.

	    Returns:
	        The matplotlib figure, or ``None`` when no vocabulary could be built
	        (for example when every comment consists only of stop words).
	    """
	    # The sentence embeddings previously computed here were never used, so
	    # that costly encoding step has been dropped.
	    tfidf = TfidfVectorizer(stop_words='english', max_features=20)
	    try:
	        tfidf.fit(comments_for_video_df['comment_text'])
	    except ValueError:
	        # scikit-learn raises ValueError for an empty vocabulary; a missing
	        # keyword chart should not abort the rest of the analysis.
	        return None

	    keyword_importance_df = pd.DataFrame({
	        'keyword': tfidf.get_feature_names_out(),
	        'importance': tfidf.idf_,
	    })

	    plt.figure(figsize=(10, 6))
	    # Pass hue alongside palette (with the legend suppressed), matching
	    # plot_sentiment_distribution().
	    sns.barplot(
	        y='keyword',
	        x='importance',
	        hue='keyword',
	        data=keyword_importance_df,
	        palette='pastel',
	        legend=False,
	    )
	    plt.title('Top Keywords in Comments')
	    plt.xlabel('TF-IDF Importance')
	    plt.ylabel('Keyword')
	    plt.tight_layout()

	    return plt.gcf()

	def analyze_video_sentiment(video_url):
	    """Run the full comment analysis for one YouTube URL.

	    Returns a 5-tuple ``(all_comments_df, top_10_df, video_title, category,
	    keyword_plot)``. When the URL has no recognisable video ID, or no
	    comments are returned, the first element is a one-row DataFrame with an
	    "Error" column and the other four elements are ``None``.
	    """
	    video_id = extract_video_id(video_url)
	    if not video_id:
	        return pd.DataFrame({"Error": ["Invalid YouTube URL."]}), None, None, None, None

	    video_title, category_id = get_video_details(video_id)

	    # YouTube category ID -> display name.
	    categories = {
	        "1": "Film & Animation",
	        "2": "Autos & Vehicles",
	        "10": "Music",
	        "15": "Pets & Animals",
	        "17": "Sports",
	        "18": "Short Movies",
	        "19": "Travel & Events",
	        "20": "Gaming",
	        "21": "Videoblogging",
	        "22": "People & Blogs",
	        "23": "Comedy",
	        "24": "Entertainment",
	        "25": "News & Politics",
	        "26": "Howto & Style",
	        "27": "Education",
	        "28": "Science & Technology",
	        "29": "Nonprofits & Activism",
	        "30": "Movies",
	        "31": "Anime/Animation",
	        "32": "Action/Adventure",
	        "33": "Classics",
	        "34": "Comedy",
	        "35": "Documentary",
	        "36": "Drama",
	        "37": "Family",
	        "38": "Foreign",
	        "39": "Horror",
	        "40": "Sci-Fi/Fantasy",
	        "41": "Thriller",
	        "42": "Shorts",
	        "43": "Shows",
	        "44": "Trailers",
	    }
	    category = categories.get(category_id, "Unknown Category")

	    comments = get_comments(video_id)
	    if not comments:
	        return pd.DataFrame({"Error": ["No comments found."]}), None, None, None, None

	    frame = pd.DataFrame(comments, columns=["comment_text"])
	    frame['polarity'] = frame['comment_text'].apply(sentiment_scores)

	    # Score sarcasm in fixed-size batches to bound the size of each model call.
	    batch_size = 32
	    texts = frame['comment_text'].tolist()
	    sarcasm_scores = []
	    for start in range(0, len(texts), batch_size):
	        sarcasm_scores.extend(detect_sarcasm_batch(texts[start:start + batch_size]))

	    frame['sarcasm_score'] = sarcasm_scores
	    # Every comment of the video shares the video's category.
	    frame['category'] = category
	    frame['Prominent sentiment'] = frame.apply(get_sentiment_label, axis=1)

	    keyword_plot = extract_keywords(frame)

	    # All comments are analysed; only the first ten (the API returns them
	    # ordered by relevance) are shown in the table.
	    top_10_comments = frame[['comment_text', 'Prominent sentiment']].head(10)

	    return frame, top_10_comments, video_title, category, keyword_plot

	def plot_sentiment_distribution(df):
	    """Draw a bar chart of comment counts per 'Prominent sentiment' label.

	    Returns the matplotlib figure, or ``None`` when *df* has no
	    'Prominent sentiment' column.
	    """
	    if 'Prominent sentiment' not in df.columns:
	        return None

	    sentiment_counts = df['Prominent sentiment'].value_counts().reset_index()
	    sentiment_counts.columns = ['Sentiment', 'Comment Count']

	    plt.figure(figsize=(10, 6))
	    sns.barplot(
	        x='Sentiment',
	        y='Comment Count',
	        hue='Sentiment',
	        data=sentiment_counts,
	        palette="pastel",
	        legend=False,
	    )
	    plt.title('Number of Comments by Sentiment', fontsize=14)
	    plt.xlabel('Sentiment', fontsize=12)
	    plt.ylabel('Number of Comments', fontsize=12)
	    plt.xticks(rotation=45)
	    plt.tight_layout()

	    return plt.gcf()

	def plot_sarcasm_vs_polarity(df):
	    """Scatter-plot polarity against sarcasm score, coloured by sentiment label.

	    Returns the matplotlib figure, or ``None`` when *df* lacks either the
	    'polarity' or the 'sarcasm_score' column.
	    """
	    if 'polarity' not in df.columns or 'sarcasm_score' not in df.columns:
	        return None

	    plt.figure(figsize=(10, 6))
	    sns.scatterplot(
	        x='polarity',
	        y='sarcasm_score',
	        hue='Prominent sentiment',
	        data=df,
	        palette="pastel",
	    )
	    plt.title('Polarity vs. Sarcasm Score', fontsize=14)
	    plt.xlabel('Polarity Score', fontsize=12)
	    plt.ylabel('Sarcasm Score', fontsize=12)
	    plt.tight_layout()

	    return plt.gcf()

	def gradio_interface(video_url):
	    """Gradio callback: analyse *video_url* and produce the six UI outputs.

	    The returned tuple matches the ``outputs`` list wired to the Analyze
	    button: comments table, sentiment plot, sarcasm plot, keyword plot,
	    insights text, video-details text.

	    On failure (invalid URL or no comments) the table shows the one-row
	    "Error" DataFrame produced by ``analyze_video_sentiment`` and the plots
	    are cleared.
	    """
	    full_df, df, video_title, category, keyword_plot = analyze_video_sentiment(video_url)

	    if not category:
	        # Bug fix: `df` is always None on this path, so returning it dropped
	        # the error message; `full_df` is the frame that carries it.
	        return full_df, None, None, None, "No insights available.", None

	    sentiment_plot = plot_sentiment_distribution(full_df)
	    sarcasm_plot = plot_sarcasm_vs_polarity(full_df)

	    insights = f"Title: {video_title}\n\nCategory: {category}"

	    # The same text feeds both the insights box and the video-details box.
	    return df, sentiment_plot, sarcasm_plot, keyword_plot, insights, insights

	# Gradio UI definition: a URL textbox and Analyze button, a video-details
	# Markdown area, an accordion with the top-10 comments table, three plots,
	# and an insights Markdown area.
	# NOTE(review): indentation is missing in this copy of the file, so which
	# statements sit inside the Row / Accordion / Blocks contexts must be
	# confirmed against the original before running.
	with gr.Blocks(theme=gr.themes.Monochrome()) as demo: # Uses Gradio's built-in Monochrome theme
	# Page heading and short usage hint.
	gr.Markdown(
	"""
	# 🎥 YouTube Sentiment Analysis
	Enter a YouTube video URL below to analyze the comments for sentiment and sarcasm
	"""
	)
	# Input controls.
	with gr.Row():
	video_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter a YouTube video URL here...")
	analyze_button = gr.Button("Analyze", variant="primary", elem_id="analyze-btn")

	# Receives the last value returned by gradio_interface (title/category text).
	video_details = gr.Markdown(label="Video Details", elem_id="video-details-box")

	# Collapsed by default (open=False).
	with gr.Accordion("Top 10 Comments", open=False):
	comment_text = gr.Dataframe(label="Top 10 Comments", interactive=False)

	# Output plots and insights text.
	sentiment_graph = gr.Plot(label="Sentiment Distribution")
	sarcasm_graph = gr.Plot(label="Sarcasm vs Polarity")
	keyword_graph = gr.Plot(label="Top Keywords")
	insights_box = gr.Markdown(label="Insights", elem_id="insights-box")

	# The order of `outputs` must match the tuple returned by gradio_interface.
	analyze_button.click(gradio_interface,
	inputs=video_input,
	outputs=[comment_text, sentiment_graph, sarcasm_graph, keyword_graph, insights_box, video_details])

	# Custom CSS for improved styling; the selectors target the elem_id values
	# assigned to the button and Markdown components above.
	gr.HTML(
	"""
	<style>
	#analyze-btn {
	background-color: #4CAF50; /* Green */
	color: white;
	border: none;
	padding: 10px 24px;
	text-align: center;
	text-decoration: none;
	display: inline-block;
	font-size: 16px;
	border-radius: 8px;
	cursor: pointer;
	}
	#insights-box {
	color: #FFD700;
	font-weight: bold;
	}
	#video-details-box {
	color: #1E90FF;
	font-weight: bold;
	}
	body {
	background-color: #1f1f1f;
	color: #e0e0e0;
	}
	</style>
	"""
	)

	# Start the app with Gradio's debug mode enabled.
	demo.launch(debug=True)