| | import re |
| | import requests |
| | from bs4 import BeautifulSoup |
| | from collections import Counter |
| | import plotly.express as px |
| | from dash import Dash, dcc, html, Input, Output, State |
| | from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline |
| | from googleapiclient.discovery import build |
| | from urllib.parse import urlparse, parse_qs |
| | import pandas as pd |
| | from dotenv import load_dotenv |
| | import os |
| | |
| | |
| | |
# --- Module-level setup (runs once at import; has side effects) ---

# Bangla sentiment model from the Hugging Face hub; downloaded/cached on first run.
model_name = "ahs95/banglabert-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# device=-1 forces CPU inference for the shared sentiment pipeline.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=-1 )
# Load .env before reading the API key; YOUTUBE_API_KEY is None if unset.
load_dotenv()
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")

# Bangla stopword list: one file, whitespace-separated tokens.
# NOTE(review): raises FileNotFoundError at import time if the file is missing.
with open("bangla_stopwords.txt", encoding="utf-8") as f:
    bangla_stopwords = set(f.read().split())
| |
|
| | |
| | |
| | |
def get_news_text(url):
    """Fetch *url* and return the concatenated text of all <p> tags.

    Returns an empty string on any failure, so callers can treat
    "fetch failed" and "no text" uniformly.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()  # treat HTTP errors (404/500) as failures too
        soup = BeautifulSoup(r.content, "html.parser")
        return " ".join(p.get_text() for p in soup.find_all("p"))
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt / SystemExit.
        return ""
| |
|
def get_news_metadata(url):
    """Fetch *url* and return (title, up to 5 image URLs, up to 3 video URLs).

    Falls back to ("No title", [], []) on any failure.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()  # treat HTTP errors as failures too
        soup = BeautifulSoup(r.content, "html.parser")

        title = soup.title.get_text().strip() if soup.title else "No title found"

        # Keep only absolute image URLs (relative src values are skipped).
        images = [
            img.get("src")
            for img in soup.find_all("img")
            if img.get("src") and img.get("src").startswith("http")
        ]

        videos = [
            video.get("src")
            for video in soup.find_all("video")
            if video.get("src")
        ]

        return title, images[:5], videos[:3]
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt / SystemExit.
        return "No title", [], []
| |
|
def clean_text(text):
    """Lower-case *text* after stripping HTML tags and punctuation.

    Word characters, whitespace, and Bangla codepoints (U+0980-U+09FF)
    are kept; every other character becomes a single space.
    """
    without_tags = re.sub(r"<.*?>", "", text)
    normalized = re.sub(r"[^\w\s\u0980-\u09FF]", " ", without_tags)
    return normalized.lower()
| |
|
def text_statistics(text):
    """Return (total_word_count, unique_word_count) of the cleaned text."""
    words = clean_text(text).split()
    total = len(words)
    distinct = len(set(words))
    return total, distinct
| |
|
def top_words_figure(text):
    """Bar chart of the 15 most frequent meaningful words in *text*.

    Words of length <= 2 and Bangla stopwords are excluded; returns a
    placeholder figure when nothing meaningful remains.
    """
    frequencies = Counter(
        token
        for token in clean_text(text).split()
        if len(token) > 2 and token not in bangla_stopwords
    )
    top_entries = frequencies.most_common(15)

    if not top_entries:
        return px.bar(title="No meaningful words found")

    labels, values = zip(*top_entries)
    figure = px.bar(
        x=labels,
        y=values,
        text=values,
        title="Top 15 Meaningful Words (Stopwords Removed)"
    )
    figure.update_traces(textposition="outside")
    return figure
| |
|
def sentiment_figure(text):
    """Return (pie figure, confidence string) for the overall sentiment.

    Empty/whitespace-only input yields a placeholder figure and message.
    """
    if not text.strip():
        return px.pie(title="No text"), "No sentiment detected"

    prediction = nlp(text[:512])[0]  # truncate to the model's max length
    label = prediction["label"]
    confidence = round(prediction["score"] * 100, 2)

    pie = px.pie(
        names=[label],
        values=[confidence],
        title=f"Sentiment: {label} ({confidence}%)"
    )

    return pie, f"π Sentiment Confidence Score: {confidence}%"
def paragraph_sentiment_analysis(url):
    """
    Fetch *url*, extract substantial paragraphs (> 50 chars), and run the
    sentiment pipeline on each.

    Returns a list of dicts {"text", "label", "score"} where score is a
    percentage rounded to 2 decimals; returns [] on any failure.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        r = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.content, "html.parser")

        paragraphs = [
            p.get_text().strip()
            for p in soup.find_all("p")
            if len(p.get_text().strip()) > 50  # skip short/boilerplate fragments
        ]

        results = []
        for para in paragraphs:
            cleaned = clean_text(para)
            if not cleaned.strip():
                continue
            result = nlp(cleaned[:512])[0]  # truncate to model max length
            results.append({
                "text": para,
                "label": result["label"],
                "score": round(result["score"] * 100, 2)
            })

        return results

    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit propagate.
        return []
def sentiment_to_numeric(label, confidence):
    """
    Map a sentiment label plus confidence (0-100) to a signed intensity.

    "very negative" -> -2, "negative" -> -1, "neutral" -> 0,
    "positive" -> +1, "very positive" -> +2, scaled by confidence/100.
    Unknown labels map to 0, matching the original fallback.
    """
    weights = {
        "very negative": -2,
        "negative": -1,
        "neutral": 0,
        "positive": 1,
        "very positive": 2,
    }
    weight = weights.get(label.lower(), 0)
    if weight == 0:
        return 0
    return weight * confidence / 100
def paragraph_sentiment_lineplot(paragraph_sentiments):
    """Line chart of per-paragraph sentiment intensity (range -2..+2).

    Each point is labeled "P<n>" with its sentiment label and confidence;
    returns a placeholder figure when the input list is empty.
    """
    if not paragraph_sentiments:
        return px.line(title="No paragraph sentiment data")

    names = [f"P{idx + 1}" for idx in range(len(paragraph_sentiments))]
    intensities = [
        sentiment_to_numeric(entry["label"], entry["score"])
        for entry in paragraph_sentiments
    ]
    annotations = [
        f"{entry['label']} ({entry['score']}%)"
        for entry in paragraph_sentiments
    ]

    fig = px.line(
        x=names,
        y=intensities,
        markers=True,
        title="π Paragraph-wise Sentiment Transition",
        labels={"x": "Paragraph", "y": "Sentiment Intensity"}
    )
    fig.update_traces(text=annotations, textposition="top center")
    # Fix the y-axis so the -2..+2 intensity scale is always visible,
    # with an emphasized zero line separating positive from negative.
    fig.update_layout(
        yaxis=dict(
            range=[-2.2, 2.2],
            zeroline=True,
            zerolinewidth=2,
            zerolinecolor='gray'
        )
    )
    return fig
def extract_video_id(url):
    """Extract the YouTube video id from *url*, or return None.

    Handles:
      * short links:      https://youtu.be/<id>[/...]
      * watch links:      https://www.youtube.com/watch?v=<id>
      * shorts/embed/live: https://www.youtube.com/{shorts,embed,live}/<id>
    Generalized from the original, which only supported youtu.be and ?v=.
    """
    parsed = urlparse(url)
    if "youtu.be" in parsed.netloc:
        # First path segment is the id; empty path yields None (falsy,
        # matching the original's "" for callers that test truthiness).
        return parsed.path.lstrip("/").split("/")[0] or None
    if "youtube.com" in parsed.netloc:
        for prefix in ("/shorts/", "/embed/", "/live/"):
            if parsed.path.startswith(prefix):
                return parsed.path[len(prefix):].split("/")[0] or None
        return parse_qs(parsed.query).get("v", [None])[0]
    return None
def get_youtube_video_info(video_id):
    """Fetch title/view/like/comment counts for *video_id* via the Data API.

    Raises ValueError when the video does not exist or is not accessible
    (the original raised a bare IndexError on the empty "items" list).
    Missing statistics (e.g. hidden like counts) default to 0.
    """
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

    response = youtube.videos().list(
        part="snippet,statistics",
        id=video_id
    ).execute()

    items = response.get("items", [])
    if not items:
        raise ValueError(f"No YouTube video found for id {video_id!r}")

    snippet = items[0]["snippet"]
    stats = items[0]["statistics"]

    return {
        "title": snippet["title"],
        "views": int(stats.get("viewCount", 0)),
        "likes": int(stats.get("likeCount", 0)),
        "comments": int(stats.get("commentCount", 0))
    }
def get_youtube_comments(video_id, max_comments=100):
    """Return up to *max_comments* top-level comment texts for a video.

    Only the first result page is requested (no pagination), so the API's
    per-page cap also bounds the result size.
    """
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

    response = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        maxResults=max_comments,
        textFormat="plainText"
    ).execute()

    return [
        item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
        for item in response["items"]
    ]
def analyze_comments(comments):
    """
    Run the sentiment pipeline over *comments* and bucket the results.

    Five-way model labels are collapsed into three buckets
    (positive / neutral / negative); unknown labels fall back to neutral.
    Comments shorter than 5 characters (after stripping) are skipped.

    Returns:
        counts: dict with "positive"/"neutral"/"negative" totals
        details: list of per-comment dicts (text, length, sentiment,
                 raw_sentiment, confidence)
    """
    bucket_for = {
        "very negative": "negative",
        "negative": "negative",
        "neutral": "neutral",
        "positive": "positive",
        "very positive": "positive"
    }

    counts = {
        "positive": 0,
        "neutral": 0,
        "negative": 0
    }
    details = []

    for comment in comments:
        if len(comment.strip()) < 5:
            continue  # ignore trivially short comments

        prediction = nlp(comment[:512])[0]  # truncate to model max length
        raw_label = prediction["label"].lower()
        confidence = round(prediction["score"] * 100, 2)
        bucket = bucket_for.get(raw_label, "neutral")

        counts[bucket] += 1
        details.append({
            "text": comment,
            "length": len(comment.split()),
            "sentiment": bucket,
            "raw_sentiment": raw_label,
            "confidence": confidence
        })

    return counts, details
| |
|
| |
|
def comment_sentiment_bar(counter):
    """Bar chart of comment counts per sentiment bucket."""
    sentiments = list(counter.keys())
    totals = list(counter.values())
    return px.bar(
        x=sentiments,
        y=totals,
        title="Comment Sentiment Distribution",
        labels={"x": "Sentiment", "y": "Count"}
    )
def comment_tone_pie(counter):
    """Pie chart of the sentiment-bucket proportions."""
    tones = list(counter.keys())
    totals = list(counter.values())
    return px.pie(
        names=tones,
        values=totals,
        title="Comment Tone Analysis"
    )
def comment_length_vs_sentiment(comment_results):
    """Scatter plot: comment length (words) vs sentiment, sized by confidence.

    Returns a placeholder figure when there are no analyzed comments.
    """
    if not comment_results:
        return px.scatter(title="No comment data")

    df = pd.DataFrame(comment_results)
    # Numeric y-axis position per bucket; tick labels added below.
    df["sentiment_value"] = df["sentiment"].map(
        {"negative": -1, "neutral": 0, "positive": 1}
    )

    fig = px.scatter(
        df,
        x="length",
        y="sentiment_value",
        color="sentiment",
        size="confidence",
        title="Comment Length vs Sentiment",
        labels={
            "length": "Comment Length (words)",
            "sentiment_value": "Sentiment Scale"
        }
    )
    fig.update_yaxes(
        tickvals=[-1, 0, 1],
        ticktext=["Negative", "Neutral", "Positive"]
    )
    return fig
| | |
| | |
| | |
# suppress_callback_exceptions is required because the "para-btn" /
# "paragraph-analysis-container" callback targets are created dynamically
# inside the "news" branch of process_link, after initial layout validation.
app = Dash(__name__, suppress_callback_exceptions=True)

# Static page shell: heading, link-type dropdown, URL input, Process button,
# plus an empty container ("dynamic-dashboard") that process_link fills in.
app.layout = html.Div([
    html.H1("Content Analysis Dashboard", style={
        'textAlign': 'center',
        'color': "#0D092CFD",
        'marginBottom': '30px'
    }),

    html.Div([
        html.Label("Select Link Type:"),
        dcc.Dropdown(
            id="link-type",
            options=[
                {"label": "π° News Link", "value": "news"},
                {"label": "π Facebook Post", "value": "facebook"},
                {"label": "βΆοΈ YouTube Link", "value": "youtube"},
            ],
            value="news",  # default selection
            style={
                'width': '100%',
                'backgroundColor': "#F2EDFE44",
                'color': '#000000',
                'marginTop': '10px'
            }
        ),
        html.Br(),
        html.Label("Enter Link:"),
        dcc.Input(id="input-link", type="text", style={
            'width': '70%',
            'padding': '5px',
            'borderRadius': '5px',
            'border': '1px solid #ccc',
            'marginLeft': '10px',
            'marginRight': '10px'
        }),

        # Triggers the process_link callback via its n_clicks Input.
        html.Button("Process", id="process-btn", n_clicks=0,
                    style={
                        'backgroundColor': '#0f3460',
                        'color': '#ffffff',
                        'border': 'none',
                        'padding': '10px 20px',
                        'borderRadius': '5px',
                        'cursor': 'pointer'
                    })
    ]),

    html.Hr(style={'borderColor': '#e0e0e0'}),
    # Populated by process_link with the per-link-type dashboard.
    html.Div(id="dynamic-dashboard")
])
| |
|
| | |
| | |
| | |
# Main callback: fires when the Process button is clicked; the link text and
# selected link type are read as State (they do not trigger the callback).
@app.callback(
    Output("dynamic-dashboard", "children"),
    Input("process-btn", "n_clicks"),
    State("input-link", "value"),
    State("link-type", "value")
)
def process_link(n_clicks, link, link_type):
    """Build the dashboard body for the submitted link.

    Branches on the selected type:
      * "news"     - scrape the article, show stats, top-word chart, sentiment
      * "facebook" - runs sentiment on the raw link string itself
      * "youtube"  - fetch video metadata + comments and chart comment sentiment
    Returns a Dash component tree; implicitly returns None (empty output)
    for any unrecognized link_type.
    """
    if n_clicks == 0 or not link:
        return html.P("Please enter a link and click Process.")

    if link_type == "news":
        # NOTE(review): the article URL is fetched twice (once for text,
        # once for metadata); `videos` is currently unused in the layout.
        text = get_news_text(link)
        title, images, videos = get_news_metadata(link)
        total_words, unique_words = text_statistics(text)
        fig_sent, confidence = sentiment_figure(text)

        return html.Div([

            html.H2(title),

            # Basic text-statistics card.
            html.Div([
                html.P(f"π News Length (characters): {len(text)}"),
                html.P(f"π Total Words: {total_words}"),
                html.P(f"π Unique Words: {unique_words}")
            ], style={
                'border': '1px solid #ddd',
                'padding': '10px',
                'marginBottom': '20px',
                'backgroundColor': "#7C46FA0F",
            }),

            # Up to five absolute-URL images scraped from the article page.
            html.H4("Attachments"),
            html.Div([
                html.Img(src=img, style={'width': '200px', 'margin': '5px'})
                for img in images
            ]),

            html.Hr(),

            html.H4("Full News Content"),
            html.Div(
                text,
                style={
                    'whiteSpace': 'pre-wrap',
                    'maxHeight': '400px',
                    'overflowY': 'scroll',
                    'border': '1px solid #ccc',
                    'padding': '10px'
                }
            ),

            # Button + container wired to the load_paragraph_analysis
            # callback (allowed because suppress_callback_exceptions=True).
            html.Button(
                "π Show Paragraph-wise Analysis",
                id="para-btn",
                n_clicks=0,
                style={
                    'backgroundColor': '#6a0dad',
                    'color': '#ffffff',
                    'border': 'none',
                    'padding': '10px 15px',
                    'borderRadius': '5px',
                    'cursor': 'pointer',
                    'marginBottom': '20px'
                }
            ),

            html.Div(id="paragraph-analysis-container"),

            html.Hr(),

            dcc.Graph(figure=top_words_figure(text)),

            html.H4("Sentiment Analysis"),
            dcc.Graph(figure=fig_sent),
            html.P(confidence, style={'fontWeight': 'bold', 'color': 'green'})
        ])

    elif link_type == "facebook":
        # NOTE(review): this analyzes the URL string itself, not the post's
        # content — presumably a placeholder; confirm intended behavior.
        fig_sent, confidence = sentiment_figure(link)
        return html.Div([
            html.H3("Facebook Post Analysis"),
            dcc.Graph(figure=fig_sent),
            html.P(confidence)
        ])

    elif link_type == "youtube":
        video_id = extract_video_id(link)
        if not video_id:
            return html.P("Invalid YouTube link")

        # Network calls: video metadata + first page of comments (<= 100).
        info = get_youtube_video_info(video_id)
        comments = get_youtube_comments(video_id)

        sentiment_counts, comment_results = analyze_comments(comments)

        fig_bar = comment_sentiment_bar(sentiment_counts)
        fig_pie = comment_tone_pie(sentiment_counts)
        fig_scatter = comment_length_vs_sentiment(comment_results)

        return html.Div([

            html.H2(info["title"]),

            # Video statistics card (dislike counts are no longer public).
            html.Div([
                html.P(f"π Views: {info['views']}"),
                html.P(f"π Likes: {info['likes']}"),
                html.P("π Dislikes: Not Public"),
                html.P(f"π¬ Total Comments: {info['comments']}")
            ], style={
                'border': '1px solid #ddd',
                'padding': '10px',
                'marginBottom': '20px',
                'backgroundColor': "#7C46FA0F",
            }),

            html.Hr(),

            html.H3("π Comment Sentiment Analysis"),
            dcc.Graph(figure=fig_bar),

            html.H3("π Comment Tone Analysis"),
            dcc.Graph(figure=fig_pie),

            html.H3("π Comment Length vs Sentiment"),
            dcc.Graph(figure=fig_scatter)
        ])
# Secondary callback: triggered by the dynamically-created "para-btn";
# prevent_initial_call stops it firing before the button exists.
@app.callback(
    Output("paragraph-analysis-container", "children"),
    Input("para-btn", "n_clicks"),
    State("input-link", "value"),
    prevent_initial_call=True
)
def load_paragraph_analysis(n_clicks, link):
    """Re-fetch the article and render per-paragraph sentiment on demand."""
    if not link:
        return html.P("No link provided.")

    paragraph_sentiments = paragraph_sentiment_analysis(link)
    if not paragraph_sentiments:
        return html.P("No paragraph sentiment data found.")

    cards = []
    for index, entry in enumerate(paragraph_sentiments, start=1):
        cards.append(html.Div([
            html.P(f"π§Ύ Paragraph {index}", style={'fontWeight': 'bold'}),
            html.P(entry["text"]),
            html.P(
                f"Sentiment: {entry['label']} | {entry['score']}%",
                style={'fontWeight': 'bold'}
            ),
            html.Hr()
        ], style={
            'padding': '10px',
            'border': '1px solid #ddd',
            'marginBottom': '10px'
        }))

    return html.Div([
        html.H3("π Paragraph-wise Sentiment Analysis"),
        html.Div(cards),
        dcc.Graph(
            figure=paragraph_sentiment_lineplot(paragraph_sentiments)
        )
    ])
| |
|
| |
|
| |
|
| |
|
| | |
| | |
| | |
if __name__ == "__main__":
    # Hosting platforms (e.g. Hugging Face Spaces) inject PORT; default 7860.
    listen_port = int(os.getenv("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port)