# yt-sentiements / app.py
# clementBE — Update app.py (commit 777fa5a, verified)
import re
import os
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from googleapiclient.discovery import build
from transformers import pipeline
from langdetect import detect
# -----------------------------
# Load sentiment model
# -----------------------------
# Multilingual model that outputs a 1-5 star rating per text; the stars are
# collapsed to Négatif/Neutre/Positif by map_star_to_label further down.
sentiment = pipeline(
"sentiment-analysis",
model="nlptown/bert-base-multilingual-uncased-sentiment"
)
# Hidden YouTube API key from Hugging Face secrets.
# May be None when the secret is not configured; analyze_comments checks for that.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
# -----------------------------
# Helper functions
# -----------------------------
def truncate_text(text, max_len=512):
    """Trim *text* to at most *max_len* whitespace-separated tokens.

    Safety net so very long comments do not crash the BERT pipeline.
    NOTE(review): this counts words, not model subword tokens, so very
    dense text could still exceed the model's 512-token limit — confirm.
    """
    words = text.split()
    return text if len(words) <= max_len else " ".join(words[:max_len])
def extract_video_id(url: str) -> str:
    """Pull the 11-character video id out of a YouTube URL.

    Handles watch (?v=), short youtu.be/ and /embed/ URL forms. When no
    id is found, the input is returned unchanged (assumed to already be
    a bare video id).
    """
    found = re.search(r"(?:v=|youtu\.be/|embed/)([a-zA-Z0-9_-]{11})", url)
    if found:
        return found.group(1)
    return url
def fetch_comments(video_id, max_results=50):
    """Fetch up to *max_results* top-level comments for a YouTube video.

    The commentThreads API caps each page at 100 items, but the UI slider
    allows up to 200, so this paginates with ``pageToken`` until enough
    comments are collected or no further pages exist. (The original only
    fetched the first page, silently truncating requests above 100.)

    Returns a list of dicts with keys: author, text, publishedAt, likeCount.
    """
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    comments_data = []
    page_token = None
    while len(comments_data) < max_results:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            textFormat="plainText",
            maxResults=min(max_results - len(comments_data), 100),
            pageToken=page_token,
        )
        response = request.execute()
        for item in response.get("items", []):
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            comments_data.append({
                "author": snippet.get("authorDisplayName"),
                "text": snippet.get("textDisplay"),
                "publishedAt": snippet.get("publishedAt"),
                "likeCount": snippet.get("likeCount"),
            })
        page_token = response.get("nextPageToken")
        if not page_token:
            break
    # Defensive slice: the API may return a full page even when fewer remain wanted.
    return comments_data[:max_results]
def is_french(text: str) -> bool:
    """Return True when langdetect classifies *text* as French.

    langdetect raises LangDetectException on inputs it cannot classify
    (e.g. empty or emoji-only strings); any detection failure is treated
    as "not French" rather than crashing the whole analysis.
    """
    try:
        return detect(text) == "fr"
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        return False
def map_star_to_label(star: int) -> str:
    """Collapse the model's 1-5 star rating into a French sentiment label."""
    if star >= 4:
        return "Positif"
    if star == 3:
        return "Neutre"
    return "Négatif"
def plot_comments_over_time(df):
    """Scatter-plot each comment by publication day, one row per sentiment.

    Expects *df* to carry 'Publié le' (parseable dates) and 'Sentiment'
    columns; adds a derived 'day' column as a side effect. Returns the
    rendered chart as a PIL Image.
    """
    palette = {"Positif": "green", "Neutre": "orange", "Négatif": "red"}
    plt.figure(figsize=(10, 4))
    df["day"] = pd.to_datetime(df["Publié le"]).dt.date
    for sentiment_label in df["Sentiment"].unique():
        rows = df[df["Sentiment"] == sentiment_label]
        plt.scatter(
            rows["day"],
            [sentiment_label] * len(rows),
            color=palette[sentiment_label],
            label=sentiment_label,
            alpha=0.7,
        )
    plt.xlabel("Date")
    plt.ylabel("Sentiment")
    plt.title("Commentaires par jour avec sentiment")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    buffer = BytesIO()
    plt.savefig(buffer, format="png")
    plt.close()
    buffer.seek(0)
    return Image.open(buffer)
# -----------------------------
# Main function
# -----------------------------
def _load_uploaded_df(uploaded_file):
    """Read an uploaded CSV/XLSX into a DataFrame; returns (df, error_msg)."""
    try:
        if uploaded_file.name.endswith(".xlsx"):
            df = pd.read_excel(uploaded_file.name)
        else:
            df = pd.read_csv(uploaded_file.name)
    except Exception as e:
        return None, f"Erreur lors de la lecture du fichier : {e}"
    if "Commentaire" not in df.columns:
        return None, "Le fichier doit contenir une colonne 'Commentaire'."
    # Fill optional columns with defaults when absent from the upload.
    df["Auteur"] = df.get("Auteur", "Inconnu")
    df["Publié le"] = df.get("Publié le", pd.Timestamp.now())
    return df, None


def _fetch_french_youtube_df(video_url, max_comments):
    """Download comments for *video_url*, keep French ones; returns (df, error_msg)."""
    if not YOUTUBE_API_KEY:
        return None, "⚠️ API key manquante."
    video_id = extract_video_id(video_url)
    comments_data = fetch_comments(video_id, max_results=max_comments)
    if not comments_data:
        return None, "Aucun commentaire trouvé."
    french_comments = [c for c in comments_data if is_french(c["text"])]
    if not french_comments:
        return None, "Aucun commentaire en français détecté."
    df = pd.DataFrame({
        "Auteur": [c["author"] for c in french_comments],
        "Commentaire": [c["text"] for c in french_comments],
        "Publié le": [c["publishedAt"] for c in french_comments],
        "Likes": [c["likeCount"] for c in french_comments],
    })
    return df, None


def analyze_comments(video_url, max_comments, uploaded_file):
    """Run French sentiment analysis on YouTube comments or an uploaded file.

    An uploaded CSV/XLSX (with a 'Commentaire' column) takes precedence over
    the YouTube URL. Returns a 4-tuple
    ``(message, formatted_results, excel_path, plot_image)``; on error the
    message is set and the other three slots are None.
    """
    if uploaded_file is not None:
        df_input, error = _load_uploaded_df(uploaded_file)
    else:
        df_input, error = _fetch_french_youtube_df(video_url, max_comments)
    if error:
        return error, None, None, None

    texts = df_input["Commentaire"].astype(str).tolist()

    # Word-level truncation keeps payloads small; tokenizer-level truncation
    # (truncation=True) guarantees the model's 512 subword-token limit is
    # never exceeded even for very dense text, preventing a crash.
    texts = [truncate_text(t) for t in texts]
    results = sentiment(texts, truncation=True)

    labels = []
    scores = []
    for res in results:
        # Model labels look like "4 stars"; the leading digit is the rating.
        star = int(res["label"].split()[0])
        labels.append(map_star_to_label(star))
        scores.append(round(res["score"], 3))
    df_input["Sentiment"] = labels
    df_input["Score"] = scores

    formatted = [
        f"📝 {row['Commentaire']}\n➡️ {row['Sentiment']} ({row['Score']})"
        for _, row in df_input.iterrows()
    ]

    # Export the annotated table for download and render the timeline plot.
    excel_path = "comments_sentiment.xlsx"
    df_input.to_excel(excel_path, index=False)
    plot_img = plot_comments_over_time(df_input)
    return "", "\n\n".join(formatted), excel_path, plot_img
# -----------------------------
# Gradio UI
# -----------------------------
# Layout: a YouTube URL textbox OR a CSV/XLSX upload (the upload wins when
# both are given), a slider bounding how many comments to fetch, then four
# outputs: status message, per-comment results, Excel export, timeline plot.
with gr.Blocks() as demo:
    gr.Markdown("## 📊 Analyse de sentiments des commentaires YouTube OU d'un fichier CSV/XLSX (FR)")
    with gr.Row():
        video_url = gr.Textbox(label="URL de la vidéo YouTube")
        uploaded_file = gr.File(
            label="Ou téléversez un fichier CSV/XLSX",
            file_types=[".csv", ".xlsx"]
        )
    # Only used for the YouTube path; ignored when a file is uploaded.
    max_comments = gr.Slider(
        label="Nombre de commentaires YouTube à analyser",
        minimum=10,
        maximum=200,
        step=10,
        value=50
    )
    output_msg = gr.Textbox(label="Messages", interactive=False)
    output_results = gr.Textbox(label="Résultats", lines=15, interactive=False)
    output_file = gr.File(label="Télécharger le fichier Excel")
    output_plot = gr.Image(label="Commentaires par jour")
    analyze_btn = gr.Button("Analyser les commentaires")
    # Wire the button to the analysis entry point; output order matches
    # the 4-tuple returned by analyze_comments.
    analyze_btn.click(
        fn=analyze_comments,
        inputs=[video_url, max_comments, uploaded_file],
        outputs=[output_msg, output_results, output_file, output_plot]
    )
demo.launch()