# yt-sentiements / app.py
# clementBE — Update app.py (commit 777fa5a, verified)
import re
import os
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from googleapiclient.discovery import build
from transformers import pipeline
from langdetect import detect
# -----------------------------
# Load sentiment model
# -----------------------------
# Multilingual model that outputs a 1-5 star rating per text; the stars are
# collapsed to Négatif/Neutre/Positif by map_star_to_label further down.
sentiment = pipeline(
"sentiment-analysis",
model="nlptown/bert-base-multilingual-uncased-sentiment"
)
# Hidden YouTube API key from Hugging Face secrets.
# May be None when the secret is not configured; analyze_comments checks for that.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
# -----------------------------
# Helper functions
# -----------------------------
def truncate_text(text, max_len=512):
    """Trim *text* to at most *max_len* whitespace-separated tokens.

    Safety net so very long comments do not crash the BERT pipeline.
    NOTE(review): this counts words, not model subword tokens, so very
    dense text could still exceed the model's 512-token limit — confirm.
    """
    words = text.split()
    return text if len(words) <= max_len else " ".join(words[:max_len])
def extract_video_id(url: str) -> str:
    """Pull the 11-character video id out of a YouTube URL.

    Handles watch (?v=), short youtu.be/ and /embed/ URL forms. When no
    id is found, the input is returned unchanged (assumed to already be
    a bare video id).
    """
    found = re.search(r"(?:v=|youtu\.be/|embed/)([a-zA-Z0-9_-]{11})", url)
    if found:
        return found.group(1)
    return url
def fetch_comments(video_id, max_results=50):
    """Fetch up to *max_results* top-level comments for a YouTube video.

    The commentThreads API caps each page at 100 items, but the UI slider
    allows up to 200, so this paginates with ``pageToken`` until enough
    comments are collected or no further pages exist. (The original only
    fetched the first page, silently truncating requests above 100.)

    Returns a list of dicts with keys: author, text, publishedAt, likeCount.
    """
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    comments_data = []
    page_token = None
    while len(comments_data) < max_results:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            textFormat="plainText",
            maxResults=min(max_results - len(comments_data), 100),
            pageToken=page_token,
        )
        response = request.execute()
        for item in response.get("items", []):
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            comments_data.append({
                "author": snippet.get("authorDisplayName"),
                "text": snippet.get("textDisplay"),
                "publishedAt": snippet.get("publishedAt"),
                "likeCount": snippet.get("likeCount"),
            })
        page_token = response.get("nextPageToken")
        if not page_token:
            break
    # Defensive slice: the API may return a full page even when fewer remain wanted.
    return comments_data[:max_results]
def is_french(text: str) -> bool:
    """Return True when langdetect classifies *text* as French.

    langdetect raises LangDetectException on inputs it cannot classify
    (e.g. empty or emoji-only strings); any detection failure is treated
    as "not French" rather than crashing the whole analysis.
    """
    try:
        return detect(text) == "fr"
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        return False
def map_star_to_label(star: int) -> str:
    """Collapse the model's 1-5 star rating into a French sentiment label."""
    if star >= 4:
        return "Positif"
    if star == 3:
        return "Neutre"
    return "Négatif"
def plot_comments_over_time(df):
    """Scatter-plot each comment by publication day, one row per sentiment.

    Expects *df* to carry 'Publié le' (parseable dates) and 'Sentiment'
    columns; adds a derived 'day' column as a side effect. Returns the
    rendered chart as a PIL Image.
    """
    palette = {"Positif": "green", "Neutre": "orange", "Négatif": "red"}
    plt.figure(figsize=(10, 4))
    df["day"] = pd.to_datetime(df["Publié le"]).dt.date
    for sentiment_label in df["Sentiment"].unique():
        rows = df[df["Sentiment"] == sentiment_label]
        plt.scatter(
            rows["day"],
            [sentiment_label] * len(rows),
            color=palette[sentiment_label],
            label=sentiment_label,
            alpha=0.7,
        )
    plt.xlabel("Date")
    plt.ylabel("Sentiment")
    plt.title("Commentaires par jour avec sentiment")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    buffer = BytesIO()
    plt.savefig(buffer, format="png")
    plt.close()
    buffer.seek(0)
    return Image.open(buffer)
# -----------------------------
# Main function
# -----------------------------
def _load_uploaded_df(uploaded_file):
    """Read an uploaded CSV/XLSX into a DataFrame; returns (df, error_msg)."""
    try:
        if uploaded_file.name.endswith(".xlsx"):
            df = pd.read_excel(uploaded_file.name)
        else:
            df = pd.read_csv(uploaded_file.name)
    except Exception as e:
        return None, f"Erreur lors de la lecture du fichier : {e}"
    if "Commentaire" not in df.columns:
        return None, "Le fichier doit contenir une colonne 'Commentaire'."
    # Fill optional columns with defaults when absent from the upload.
    df["Auteur"] = df.get("Auteur", "Inconnu")
    df["Publié le"] = df.get("Publié le", pd.Timestamp.now())
    return df, None


def _fetch_french_youtube_df(video_url, max_comments):
    """Download comments for *video_url*, keep French ones; returns (df, error_msg)."""
    if not YOUTUBE_API_KEY:
        return None, "⚠️ API key manquante."
    video_id = extract_video_id(video_url)
    comments_data = fetch_comments(video_id, max_results=max_comments)
    if not comments_data:
        return None, "Aucun commentaire trouvé."
    french_comments = [c for c in comments_data if is_french(c["text"])]
    if not french_comments:
        return None, "Aucun commentaire en français détecté."
    df = pd.DataFrame({
        "Auteur": [c["author"] for c in french_comments],
        "Commentaire": [c["text"] for c in french_comments],
        "Publié le": [c["publishedAt"] for c in french_comments],
        "Likes": [c["likeCount"] for c in french_comments],
    })
    return df, None


def analyze_comments(video_url, max_comments, uploaded_file):
    """Run French sentiment analysis on YouTube comments or an uploaded file.

    An uploaded CSV/XLSX (with a 'Commentaire' column) takes precedence over
    the YouTube URL. Returns a 4-tuple
    ``(message, formatted_results, excel_path, plot_image)``; on error the
    message is set and the other three slots are None.
    """
    if uploaded_file is not None:
        df_input, error = _load_uploaded_df(uploaded_file)
    else:
        df_input, error = _fetch_french_youtube_df(video_url, max_comments)
    if error:
        return error, None, None, None

    texts = df_input["Commentaire"].astype(str).tolist()

    # Word-level truncation keeps payloads small; tokenizer-level truncation
    # (truncation=True) guarantees the model's 512 subword-token limit is
    # never exceeded even for very dense text, preventing a crash.
    texts = [truncate_text(t) for t in texts]
    results = sentiment(texts, truncation=True)

    labels = []
    scores = []
    for res in results:
        # Model labels look like "4 stars"; the leading digit is the rating.
        star = int(res["label"].split()[0])
        labels.append(map_star_to_label(star))
        scores.append(round(res["score"], 3))
    df_input["Sentiment"] = labels
    df_input["Score"] = scores

    formatted = [
        f"📝 {row['Commentaire']}\n➡️ {row['Sentiment']} ({row['Score']})"
        for _, row in df_input.iterrows()
    ]

    # Export the annotated table for download and render the timeline plot.
    excel_path = "comments_sentiment.xlsx"
    df_input.to_excel(excel_path, index=False)
    plot_img = plot_comments_over_time(df_input)
    return "", "\n\n".join(formatted), excel_path, plot_img
# -----------------------------
# Gradio UI
# -----------------------------
# Layout: a YouTube URL textbox OR a CSV/XLSX upload (the upload wins when
# both are given), a slider bounding how many comments to fetch, then four
# outputs: status message, per-comment results, Excel export, timeline plot.
with gr.Blocks() as demo:
    gr.Markdown("## 📊 Analyse de sentiments des commentaires YouTube OU d'un fichier CSV/XLSX (FR)")
    with gr.Row():
        video_url = gr.Textbox(label="URL de la vidéo YouTube")
        uploaded_file = gr.File(
            label="Ou téléversez un fichier CSV/XLSX",
            file_types=[".csv", ".xlsx"]
        )
    # Only used for the YouTube path; ignored when a file is uploaded.
    max_comments = gr.Slider(
        label="Nombre de commentaires YouTube à analyser",
        minimum=10,
        maximum=200,
        step=10,
        value=50
    )
    output_msg = gr.Textbox(label="Messages", interactive=False)
    output_results = gr.Textbox(label="Résultats", lines=15, interactive=False)
    output_file = gr.File(label="Télécharger le fichier Excel")
    output_plot = gr.Image(label="Commentaires par jour")
    analyze_btn = gr.Button("Analyser les commentaires")
    # Wire the button to the analysis entry point; output order matches
    # the 4-tuple returned by analyze_comments.
    analyze_btn.click(
        fn=analyze_comments,
        inputs=[video_url, max_comments, uploaded_file],
        outputs=[output_msg, output_results, output_file, output_plot]
    )
demo.launch()