Sajjadistic committed on
Commit
f4b0a68
·
verified ·
1 Parent(s): c2b5d94

Update analysis_core.py

Browse files
Files changed (1) hide show
  1. analysis_core.py +66 -59
analysis_core.py CHANGED
@@ -1,7 +1,7 @@
1
  from __future__ import annotations
2
 
3
  import re
4
- from collections import Counter
5
  from typing import Any, Dict, List, Tuple
6
 
7
  import numpy as np
@@ -45,7 +45,7 @@ MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE
45
  MODEL.eval()
46
 
47
 
48
- # -------------------- Lexicons --------------------
49
  persian_positive = {
50
  "خوب","عالی","عالیه","باحال","قشنگ","زیبا","خوشحال","شاد","مرسی","ممنون","ممنونم",
51
  "دمت","دمتگرم","دمت گرم","ایول","آفرین","دوستت","دوستت_دارم","دوستتدارم","عزیزم",
@@ -159,16 +159,17 @@ def score_en_vader(text: str) -> float:
159
 
160
 
161
  @torch.inference_mode()
162
- def score_fa_bert(text: str) -> float:
163
- if not text:
164
- return 0.0
165
- inputs = TOKENIZER(text, return_tensors="pt", truncation=True, max_length=256).to(DEVICE)
166
- out = MODEL(**inputs)
167
- probs = torch.softmax(out.logits, dim=-1).squeeze(0)
168
- if probs.numel() >= 2:
169
- score = float(probs[1] - probs[0]) # assume 0=neg, 1=pos
170
- return float(max(-1.0, min(1.0, score)))
171
- return 0.0
 
172
 
173
 
174
  def persian_lexicon_score(text: str) -> float:
@@ -193,12 +194,16 @@ def persian_lexicon_score(text: str) -> float:
193
  return float((pos - neg) / max(1, (pos + neg)))
194
 
195
 
196
- def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 200) -> Tuple[pd.DataFrame, int]:
 
 
 
 
197
  if df.empty:
198
  return df, 0
199
 
200
  df = df.copy()
201
- df["sentiment_final"] = np.nan
202
 
203
  pers_mask = df["text"].astype(str).apply(is_persian)
204
  pers_idx = df.index[pers_mask].tolist()
@@ -212,7 +217,9 @@ def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 200) -> Tuple[p
212
 
213
  bert_idx = pers_idx[: max(0, int(max_bert_persian))]
214
  if bert_idx:
215
- df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
 
 
216
 
217
  df["sentiment_final"] = df["sentiment_final"].fillna(0.0)
218
  return df, len(bert_idx)
@@ -243,9 +250,9 @@ def extract_lex_words_from_text(text: str, polarity: str, min_words: int = 4) ->
243
 
244
  out: List[str] = []
245
  for t in tokens:
246
- if polarity == "pos" and t in persian_positive:
247
  out.append(t)
248
- if polarity == "neg" and t in persian_negative:
249
  out.append(t)
250
 
251
  if len(out) < min_words:
@@ -278,7 +285,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str):
278
  "low_word_main": None,
279
  }
280
 
281
- fig = plt.figure(figsize=(18, 7))
282
  plt.title(_shape_fa(f"Emotion Trajectory in Chat: {chat_name}"))
283
  plt.xlabel("Time (weeks)")
284
  plt.ylabel("Average sentiment score")
@@ -292,18 +299,16 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str):
292
  x = ws.index
293
  y = ws.values.astype(float)
294
 
295
- q_lo = float(np.quantile(y, 0.05))
296
- q_hi = float(np.quantile(y, 0.95))
297
- if q_hi - q_lo < 0.15:
298
- q_lo = min(q_lo, 0.0) - 0.1
299
- q_hi = max(q_hi, 0.0) + 0.1
300
- pad = 0.08 * (q_hi - q_lo)
301
- y_min_plot = q_lo - pad
302
- y_max_plot = q_hi + pad
303
 
304
  plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
305
  plt.ylim(y_min_plot, y_max_plot)
306
- plt.margins(x=0.05, y=0.2)
307
 
308
  peak_week = ws.idxmax()
309
  low_week = ws.idxmin()
@@ -335,62 +340,63 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str):
335
  plt.annotate(
336
  _shape_fa("، ".join(peak_words) + f" (peak={peak_y:.3f})"),
337
  xy=(peak_week, peak_y_plot),
338
- xytext=(peak_week, y_max_plot + 0.08 * (y_max_plot - y_min_plot)),
339
  arrowprops=dict(arrowstyle="->"),
340
  ha="center",
341
- fontsize=9,
342
  )
343
 
344
  plt.scatter([low_week], [low_y_plot])
345
  plt.annotate(
346
  _shape_fa("، ".join(low_words) + f" (low={low_y:.3f})"),
347
  xy=(low_week, low_y_plot),
348
- xytext=(low_week, y_min_plot - 0.08 * (y_max_plot - y_min_plot)),
349
  arrowprops=dict(arrowstyle="->"),
350
  ha="center",
351
- fontsize=9,
352
  )
353
 
354
  plt.tight_layout()
355
  return fig, info
356
 
357
 
358
- # -------------------- Top lex words with score --------------------
359
- def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
360
- tokens_all: List[str] = []
361
- for t in df["text"].astype(str).tolist():
362
- tokens_all.extend([w for w in custom_tokenize(t) if len(w) > 1 and w not in stopwords_all])
 
 
 
363
 
364
- if not tokens_all:
365
- return [], []
366
-
367
- total = len(tokens_all)
368
- cnt = Counter(tokens_all)
369
 
370
- pos_items = []
371
- for w in persian_positive:
372
- c = cnt.get(w, 0)
373
- if c > 0:
374
- pos_items.append((w, c / total))
375
 
376
- neg_items = []
377
- for w in persian_negative:
378
- c = cnt.get(w, 0)
379
- if c > 0:
380
- neg_items.append((w, -c / total))
 
381
 
382
- pos_items.sort(key=lambda x: x[1], reverse=True)
383
- neg_items.sort(key=lambda x: x[1]) # more negative first
384
 
385
- pos_top = [{"word": w, "score": float(s)} for w, s in pos_items[:top_n]]
386
- neg_top = [{"word": w, "score": float(s)} for w, s in neg_items[:top_n]]
387
  return pos_top, neg_top
388
 
389
 
390
  # -------------------- Main entry --------------------
391
  def analyze_chat(
392
  chat: Dict[str, Any],
393
- max_bert_persian: int = 200
394
  ) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
395
 
396
  df = build_df(chat)
@@ -410,12 +416,12 @@ def analyze_chat(
410
  "top5_positive_lex": [],
411
  "top5_negative_lex": [],
412
  }
413
- fig = plt.figure(figsize=(18, 7))
414
  plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
415
  plt.tight_layout()
416
  return empty, fig, [], []
417
 
418
- df, used = compute_sentiments(df, max_bert_persian=max_bert_persian)
419
 
420
  ws = weekly_series(df)
421
  weekly_records = [{"week_end": idx.isoformat(), "avg_sentiment": float(val)} for idx, val in ws.items()]
@@ -423,7 +429,8 @@ def analyze_chat(
423
 
424
  fig, info = make_weekly_plot(df, name)
425
 
426
- pos_top, neg_top = top_lex_words(df, top_n=5)
 
427
 
428
  result = {
429
  "chat_name": name,
 
1
  from __future__ import annotations
2
 
3
  import re
4
+ from collections import Counter, defaultdict
5
  from typing import Any, Dict, List, Tuple
6
 
7
  import numpy as np
 
45
  MODEL.eval()
46
 
47
 
48
+ # -------------------- Lexicons (small helper lists; not the only source anymore) --------------------
49
  persian_positive = {
50
  "خوب","عالی","عالیه","باحال","قشنگ","زیبا","خوشحال","شاد","مرسی","ممنون","ممنونم",
51
  "دمت","دمتگرم","دمت گرم","ایول","آفرین","دوستت","دوستت_دارم","دوستتدارم","عزیزم",
 
159
 
160
 
161
  @torch.inference_mode()
162
+ def score_fa_bert_batch(texts: List[str], batch_size: int = 16) -> List[float]:
163
+ scores: List[float] = []
164
+ for i in range(0, len(texts), batch_size):
165
+ chunk = texts[i:i + batch_size]
166
+ inputs = TOKENIZER(chunk, return_tensors="pt", truncation=True, padding=True, max_length=256).to(DEVICE)
167
+ out = MODEL(**inputs)
168
+ probs = torch.softmax(out.logits, dim=-1)
169
+ diff = (probs[:, 1] - probs[:, 0]).detach().cpu().numpy().astype(float).tolist()
170
+ diff = [float(max(-1.0, min(1.0, d))) for d in diff]
171
+ scores.extend(diff)
172
+ return scores
173
 
174
 
175
  def persian_lexicon_score(text: str) -> float:
 
194
  return float((pos - neg) / max(1, (pos + neg)))
195
 
196
 
197
+ def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 500, bert_batch_size: int = 16) -> Tuple[pd.DataFrame, int]:
198
+ """
199
+ English messages: VADER
200
+ Persian messages: BERT on first N Persian messages (batched for speed), lexicon for the rest
201
+ """
202
  if df.empty:
203
  return df, 0
204
 
205
  df = df.copy()
206
+ df["sentiment_final"] = 0.0
207
 
208
  pers_mask = df["text"].astype(str).apply(is_persian)
209
  pers_idx = df.index[pers_mask].tolist()
 
217
 
218
  bert_idx = pers_idx[: max(0, int(max_bert_persian))]
219
  if bert_idx:
220
+ texts = df.loc[bert_idx, "text"].astype(str).tolist()
221
+ scores = score_fa_bert_batch(texts, batch_size=int(bert_batch_size))
222
+ df.loc[bert_idx, "sentiment_final"] = scores
223
 
224
  df["sentiment_final"] = df["sentiment_final"].fillna(0.0)
225
  return df, len(bert_idx)
 
250
 
251
  out: List[str] = []
252
  for t in tokens:
253
+ if polarity == "pos" and (t in persian_positive):
254
  out.append(t)
255
+ if polarity == "neg" and (t in persian_negative):
256
  out.append(t)
257
 
258
  if len(out) < min_words:
 
285
  "low_word_main": None,
286
  }
287
 
288
+ fig = plt.figure(figsize=(22, 8))
289
  plt.title(_shape_fa(f"Emotion Trajectory in Chat: {chat_name}"))
290
  plt.xlabel("Time (weeks)")
291
  plt.ylabel("Average sentiment score")
 
299
  x = ws.index
300
  y = ws.values.astype(float)
301
 
302
+ # IMPORTANT: show true peaks (use full min/max range)
303
+ y_min = float(np.min(y))
304
+ y_max = float(np.max(y))
305
+ pad = 0.08 * max(1e-9, (y_max - y_min))
306
+ y_min_plot = y_min - pad
307
+ y_max_plot = y_max + pad
 
 
308
 
309
  plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
310
  plt.ylim(y_min_plot, y_max_plot)
311
+ plt.margins(x=0.03, y=0.15)
312
 
313
  peak_week = ws.idxmax()
314
  low_week = ws.idxmin()
 
340
  plt.annotate(
341
  _shape_fa("، ".join(peak_words) + f" (peak={peak_y:.3f})"),
342
  xy=(peak_week, peak_y_plot),
343
+ xytext=(peak_week, y_max_plot + 0.06 * (y_max_plot - y_min_plot)),
344
  arrowprops=dict(arrowstyle="->"),
345
  ha="center",
346
+ fontsize=10,
347
  )
348
 
349
  plt.scatter([low_week], [low_y_plot])
350
  plt.annotate(
351
  _shape_fa("، ".join(low_words) + f" (low={low_y:.3f})"),
352
  xy=(low_week, low_y_plot),
353
+ xytext=(low_week, y_min_plot - 0.06 * (y_max_plot - y_min_plot)),
354
  arrowprops=dict(arrowstyle="->"),
355
  ha="center",
356
+ fontsize=10,
357
  )
358
 
359
  plt.tight_layout()
360
  return fig, info
361
 
362
 
363
+ # -------------------- Weighted top words (fixes "no negative words") --------------------
364
+ def top_words_weighted_by_sentiment(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
365
+ """
366
+ Extracts top positive/negative words by weighting tokens with message sentiment magnitude.
367
+ This does NOT depend on a tiny predefined negative list, so negative words will surface whenever the chat actually contains negativity.
368
+ """
369
+ pos_w = defaultdict(float)
370
+ neg_w = defaultdict(float)
371
 
372
+ for text, s in zip(df["text"].astype(str).tolist(), df["sentiment_final"].astype(float).tolist()):
373
+ tokens = [t for t in custom_tokenize(text) if len(t) > 1 and t not in stopwords_all]
374
+ if not tokens:
375
+ continue
 
376
 
377
+ mag = float(abs(s))
378
+ if mag < 1e-9:
379
+ continue
 
 
380
 
381
+ if s > 0:
382
+ for t in tokens:
383
+ pos_w[t] += mag
384
+ elif s < 0:
385
+ for t in tokens:
386
+ neg_w[t] += mag
387
 
388
+ pos_items = sorted(pos_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
389
+ neg_items = sorted(neg_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
390
 
391
+ pos_top = [{"word": w, "score": float(v)} for w, v in pos_items]
392
+ neg_top = [{"word": w, "score": float(-v)} for w, v in neg_items] # negative sign to show negativity
393
  return pos_top, neg_top
394
 
395
 
396
  # -------------------- Main entry --------------------
397
  def analyze_chat(
398
  chat: Dict[str, Any],
399
+ max_bert_persian: int = 500
400
  ) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
401
 
402
  df = build_df(chat)
 
416
  "top5_positive_lex": [],
417
  "top5_negative_lex": [],
418
  }
419
+ fig = plt.figure(figsize=(22, 8))
420
  plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
421
  plt.tight_layout()
422
  return empty, fig, [], []
423
 
424
+ df, used = compute_sentiments(df, max_bert_persian=max_bert_persian, bert_batch_size=16)
425
 
426
  ws = weekly_series(df)
427
  weekly_records = [{"week_end": idx.isoformat(), "avg_sentiment": float(val)} for idx, val in ws.items()]
 
429
 
430
  fig, info = make_weekly_plot(df, name)
431
 
432
+ # IMPORTANT: this fixes "no negative words" even when negatives exist
433
+ pos_top, neg_top = top_words_weighted_by_sentiment(df, top_n=5)
434
 
435
  result = {
436
  "chat_name": name,