Update analysis_core.py

analysis_core.py CHANGED (+302 −123)
@@ -36,7 +36,7 @@ _ensure_nltk()
 SIA = SentimentIntensityAnalyzer()
 
 
-# =====================
+# ===================== Persian sentiment model =====================
 MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -45,7 +45,7 @@ MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE
 MODEL.eval()
 
 
-# =====================
+# ===================== Small lex helpers (not used for final top words scale anymore) =====================
 persian_positive = {
     "خوب","عالی","عالیه","باحال","قشنگ","زیبا","خوشحال","شاد","مرسی","ممنون","ممنونم",
     "دمت","دمتگرم","دمت گرم","ایول","آفرین","دوستت","دوستت_دارم","دوستتدارم","عزیزم",
@@ -56,10 +56,13 @@ persian_negative = {
     "حالم_بده","حالمبده","😡","😠","😞","😔","😭","💔"
 }
 
+persian_positive_phrases = {"دوستت دارم", "خیلی دوستت دارم", "دمت گرم", "آفرین"}
+persian_negative_phrases = {"حالم بده", "خیلی بده", "اعصابم خورد"}
+
 persian_stopwords = {
     "و","در","به","از","که","این","اون","آن","من","تو","ما","شما","او","هم","یا","اما","اگر",
-    "برای","با","روی","تا","نه","را","همه","چی","چیه","چرا","کجا","کی","چه","یه","یک","بود",
-    "
+    "برای","با","روی","تا","نه","را","همه","چی","چیه","چرا","کجا","کی","چه","یه","یک","بود","هست",
+    "می","میشه","کرد","کردم","کن","کردی","کردن","دارم","داری","داره","داشت","شد","شده"
 }
 english_stopwords = {
     "the","a","an","and","or","but","if","then","else","to","of","in","on","at","for","with",
@@ -69,33 +72,6 @@ english_stopwords = {
 stopwords_all = persian_stopwords.union(english_stopwords)
 
 
-# ===================== Utils =====================
-_FA_RE = re.compile(r"[\u0600-\u06FF]")
-
-
-def _shape_fa(s: str) -> str:
-    try:
-        return get_display(arabic_reshaper.reshape(str(s)))
-    except Exception:
-        return str(s)
-
-
-def is_persian(text: str) -> bool:
-    return bool(_FA_RE.search(text or ""))
-
-
-def custom_tokenize(text: str) -> List[str]:
-    text = re.sub(r"http\S+|www\.\S+", " ", str(text))
-    text = text.replace("\u200c", " ")
-    tokens = re.findall(r"[\w\u0600-\u06FF]+", text)
-    tokens = [
-        t.replace("دوستتدارم", "دوستت_دارم")
-        .replace("حالمبده", "حالم_بده")
-        for t in tokens
-    ]
-    return tokens
-
-
 # ===================== Telegram parsing =====================
 def extract_text(msg_text: Any) -> str:
     if msg_text is None:
@@ -103,175 +79,378 @@ def extract_text(msg_text: Any) -> str:
     if isinstance(msg_text, str):
         return msg_text
     if isinstance(msg_text, list):
-        ...  # old list flattening; body truncated in the capture
+        parts: List[str] = []
+        for part in msg_text:
+            if isinstance(part, str):
+                parts.append(part)
+            elif isinstance(part, dict):
+                t = part.get("text")
+                if isinstance(t, str):
+                    parts.append(t)
+        return "".join(parts)
     return ""
 
 
 def extract_chats(data: Dict[str, Any]) -> List[Dict[str, Any]]:
-    if "chats" in data and "list" in data["chats"]:
-        ...  # old body truncated in the capture
+    if isinstance(data, dict) and "chats" in data and isinstance(data["chats"], dict) and "list" in data["chats"]:
+        lst = data["chats"]["list"]
+        if isinstance(lst, list):
+            return lst
+    if isinstance(data, dict) and "messages" in data and isinstance(data["messages"], list):
         return [data]
-    raise ValueError("
+    raise ValueError("JSON format not recognized. Need Telegram export result.json with chats.list.")
 
 
 def get_chat_name(chat: Dict[str, Any], fallback: str) -> str:
-    ...  # old body truncated in the capture
+    name = chat.get("name") or chat.get("title")
+    if isinstance(name, str) and name.strip():
+        return name.strip()
+    return fallback
 
 
 def build_df(chat: Dict[str, Any]) -> pd.DataFrame:
-    rows = []
+    rows: List[Dict[str, Any]] = []
     for msg in chat.get("messages", []):
-        ...  # old per-message text/date extraction truncated in the capture
+        if not isinstance(msg, dict):
+            continue
+
+        text = extract_text(msg.get("text", "")).strip()
         if not text:
             continue
+
+        date_raw = msg.get("date")
+        if not isinstance(date_raw, str) or not date_raw:
             continue
+
+        rows.append({"date_raw": date_raw, "text": text})
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        return df
+
+    df["date"] = pd.to_datetime(df["date_raw"], errors="coerce", utc=True).dt.tz_convert(None)
+    df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
+    return df
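For orientation, a minimal sketch of the export shape this parser accepts (values invented):

    chat = {
        "name": "Family",
        "messages": [
            {"date": "2024-01-05T18:22:31", "text": "دمت گرم"},
            {"date": "2024-01-06T09:10:00", "text": ["thanks ", {"type": "mention", "text": "@ali"}]},
        ],
    }
    df = build_df(chat)  # columns: date_raw, text, date (tz-naive, sorted oldest-first)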
 
 
+# ===================== Tokenize + Persian detect =====================
+_FA_RE = re.compile(r"[\u0600-\u06FF]")
+
+
+def is_persian(text: str) -> bool:
+    return bool(_FA_RE.search(text or ""))
 
 
+def custom_tokenize(text: str) -> List[str]:
+    text = re.sub(r"http\S+|www\.\S+", " ", str(text))
+    text = text.replace("\u200c", " ")
+    tokens = re.findall(r"[\w\u0600-\u06FF]+", text)
+    tokens = [t.replace("دوستتدارم", "دوستت_دارم").replace("حالمبده", "حالم_بده") for t in tokens]
+    return tokens
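A quick sketch (illustrative): URLs are stripped before tokenizing, and the two glued forms are normalized back to their underscored lexicon keys:

    custom_tokenize("دوستتدارم http://t.me/x")  # -> ["دوستت_دارم"]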
 
 
+# ===================== Sentiment scoring =====================
-def ...  # old `score_en` def line truncated in the capture
+def score_en_vader(text: str) -> float:
     return float(SIA.polarity_scores(text)["compound"])
 
 
 @torch.inference_mode()
-def ...  # old batch scorer truncated in the capture (only `for i in range(0, len(texts),`, `diff =`, `return` survive)
+def score_fa_bert_batch(texts: List[str], batch_size: int = 16) -> List[float]:
+    scores: List[float] = []
+    for i in range(0, len(texts), batch_size):
+        chunk = texts[i:i + batch_size]
+        inputs = TOKENIZER(chunk, return_tensors="pt", truncation=True, padding=True, max_length=256).to(DEVICE)
+        out = MODEL(**inputs)
+        probs = torch.softmax(out.logits, dim=-1)
+        diff = (probs[:, 1] - probs[:, 0]).detach().cpu().numpy().astype(float).tolist()
+        diff = [float(max(-1.0, min(1.0, d))) for d in diff]  # keep sentiment in [-1, 1]
+        scores.extend(diff)
+    return scores
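The two-class head is read with index 1 as positive and index 0 as negative, so probs[:, 1] - probs[:, 0] already lies in [-1, 1] and the clamp is only a guard. A sketch, assuming the HuggingFace weights are available (output values illustrative):

    score_fa_bert_batch(["دمت گرم", "حالم بده"])  # -> e.g. [0.97, -0.94]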
 
 
+def persian_lexicon_score(text: str) -> float:
+    tokens = custom_tokenize(text)
+    pos = 0
+    neg = 0
+
+    for t in tokens:
+        if t in persian_positive:
+            pos += 1
+        elif t in persian_negative:
+            neg += 1
+
+    norm = str(text).replace("\u200c", " ")
+    for ph in persian_positive_phrases:
+        if ph in norm:
+            pos += 2
+    for ph in persian_negative_phrases:
+        if ph in norm:
+            neg += 2
+
+    return float((pos - neg) / max(1, (pos + neg)))
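A worked example (sketch; assumes no other lexicon entries fire): token hits count once, phrase hits count twice, and the result is the normalized difference.

    persian_lexicon_score("دمت گرم عزیزم")      # all hits positive -> (pos - 0) / pos = 1.0
    persian_lexicon_score("مرسی ولی حالم بده")  # pos=1, neg=2 -> (1 - 2) / 3 ≈ -0.33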
 
 
+def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 500, bert_batch_size: int = 16) -> Tuple[pd.DataFrame, int]:
+    if df.empty:
+        return df, 0
+
     df = df.copy()
     df["sentiment_final"] = 0.0
 
-    ...  # old routing (only `].apply(score_en)`, `].apply(lambda t: 0.0)` survive the capture)
+    pers_mask = df["text"].astype(str).apply(is_persian)
+    pers_idx = df.index[pers_mask].tolist()
+    en_idx = df.index[~pers_mask].tolist()
+
+    if en_idx:
+        df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
+
+    if pers_idx:
+        df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
+
+    bert_idx = pers_idx[: max(0, int(max_bert_persian))]
+    if bert_idx:
+        texts = df.loc[bert_idx, "text"].astype(str).tolist()
+        scores = score_fa_bert_batch(texts, batch_size=int(bert_batch_size))
+        df.loc[bert_idx, "sentiment_final"] = scores
+
+    df["sentiment_final"] = df["sentiment_final"].fillna(0.0)
+    df["sentiment_final"] = df["sentiment_final"].clip(-1.0, 1.0)  # safety
+    return df, len(bert_idx)
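The routing, summarized: VADER scores non-Persian rows, the lexicon gives every Persian row a fallback score, then BERT overwrites the first max_bert_persian Persian rows. An illustrative call:

    df, used = compute_sentiments(df, max_bert_persian=200)
    # `used` == number of Persian messages actually scored by BERT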
 
 
-# ===================== Weekly =====================
+# ===================== Weekly aggregation + extremes =====================
 def weekly_series(df: pd.DataFrame) -> pd.Series:
-    return ...  # old one-liner truncated in the capture
+    return (
+        df.set_index("date")
+        .resample("W")["sentiment_final"]
+        .mean()
+        .dropna()
+    )
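For intuition, a minimal sketch of the weekly bucketing (pandas labels right-closed weekly bins by their Sunday end date):

    import pandas as pd

    s = pd.Series(
        [0.5, -0.5, 1.0],
        index=pd.to_datetime(["2024-01-01", "2024-01-03", "2024-01-09"]),
    )
    print(s.resample("W").mean())
    # 2024-01-07    0.0  <- mean of the two messages in the week ending Sun Jan 7
    # 2024-01-14    1.0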
 
 
+def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str):
+    start = week_end - pd.Timedelta(days=7)
+    sub = df[(df["date"] > start) & (df["date"] <= week_end)]
+    if sub.empty:
+        return None
+    if mode == "max":
+        return sub.loc[sub["sentiment_final"].idxmax()]
+    return sub.loc[sub["sentiment_final"].idxmin()]
+
+
+def extract_lex_words_from_text(text: str, polarity: str, min_words: int = 4) -> List[str]:
+    tokens = [t for t in custom_tokenize(text) if len(t) > 1 and t not in stopwords_all]
+
+    out: List[str] = []
+    for t in tokens:
+        if polarity == "pos" and (t in persian_positive):
+            out.append(t)
+        if polarity == "neg" and (t in persian_negative):
+            out.append(t)
+
+    if len(out) < min_words:
+        for t in tokens:
+            if t not in out:
+                out.append(t)
+            if len(out) >= min_words:
+                break
+
+    return out[:max(min_words, 4)]
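The half-open window (week_end - 7 days, week_end] mirrors the right-closed bins that resample("W") emits, so the extreme message genuinely belongs to the plotted week. A quick check (illustrative):

    import pandas as pd

    week_end = pd.Timestamp("2024-01-07")    # a Sunday label from resample("W")
    start = week_end - pd.Timedelta(days=7)  # 2023-12-31
    assert start < pd.Timestamp("2024-01-01") <= week_end        # Monday is inside
    assert not (start < pd.Timestamp("2023-12-31") <= week_end)  # previous Sunday is not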
 
 
+# ===================== Plot helpers =====================
+def _shape_fa(s: str) -> str:
+    try:
+        return get_display(arabic_reshaper.reshape(str(s)))
+    except Exception:
+        return str(s)
 
 
 def make_weekly_plot(df: pd.DataFrame, chat_name: str):
     ws = weekly_series(df)
 
+    info: Dict[str, Any] = {
+        "peak_week_end": None,
+        "low_week_end": None,
+        "peak_words": [],
+        "low_words": [],
+        "peak_word_main": None,
+        "low_word_main": None,
+    }
+
+    # big plot so Persian annotations are readable
     fig = plt.figure(figsize=(22, 8))
     plt.title(_shape_fa(f"Emotion Trajectory in Chat: {chat_name}"))
     plt.xlabel("Time (weeks)")
     plt.ylabel("Average sentiment score")
-    plt.axhline(0, linestyle="--")
     plt.grid(True)
+    plt.axhline(0, linestyle="--")
 
     if ws.empty:
-        ...  # old empty-series branch truncated in the capture
+        plt.tight_layout()
+        return fig, info
 
     x = ws.index
     y = ws.values.astype(float)
 
+    # show true peaks: full min/max range
+    y_min = float(np.min(y))
+    y_max = float(np.max(y))
+    pad = 0.08 * max(1e-9, (y_max - y_min))
+    y_min_plot = y_min - pad
+    y_max_plot = y_max + pad
+
-    plt.plot(x, y, marker="o", color="red", linewidth=2)
+    plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
+    plt.ylim(y_min_plot, y_max_plot)
+    plt.margins(x=0.03, y=0.15)
+
+    peak_week = ws.idxmax()
+    low_week = ws.idxmin()
+
+    info["peak_week_end"] = peak_week.isoformat()
+    info["low_week_end"] = low_week.isoformat()
+
+    peak_msg = message_in_week(df, peak_week, "max")
+    low_msg = message_in_week(df, low_week, "min")
+
+    peak_words = extract_lex_words_from_text(str(peak_msg["text"]) if peak_msg is not None else "", "pos", 4)
+    low_words = extract_lex_words_from_text(str(low_msg["text"]) if low_msg is not None else "", "neg", 4)
+
+    info["peak_words"] = peak_words
+    info["low_words"] = low_words
+    info["peak_word_main"] = peak_words[0] if peak_words else None
+    info["low_word_main"] = low_words[0] if low_words else None
+
+    peak_y = float(ws.loc[peak_week])
+    low_y = float(ws.loc[low_week])
+
+    def clamp(val: float) -> float:
+        return float(min(max(val, y_min_plot), y_max_plot))
+
+    peak_y_plot = clamp(peak_y)
+    low_y_plot = clamp(low_y)
+
+    plt.scatter([peak_week], [peak_y_plot])
+    plt.annotate(
+        _shape_fa("، ".join(peak_words) + f" (peak={peak_y:.3f})"),
+        xy=(peak_week, peak_y_plot),
+        xytext=(peak_week, y_max_plot + 0.06 * (y_max_plot - y_min_plot)),
+        arrowprops=dict(arrowstyle="->"),
+        ha="center",
+        fontsize=10,
+    )
+
+    plt.scatter([low_week], [low_y_plot])
+    plt.annotate(
+        _shape_fa("، ".join(low_words) + f" (low={low_y:.3f})"),
+        xy=(low_week, low_y_plot),
+        xytext=(low_week, y_min_plot - 0.06 * (y_max_plot - y_min_plot)),
+        arrowprops=dict(arrowstyle="->"),
+        ha="center",
+        fontsize=10,
+    )
 
     plt.tight_layout()
-    return fig, {}
+    return fig, info
 
 
-# ===================== TOP WORDS (NORMALIZED) =====================
-def top_words_weighted_by_sentiment(
-    df: pd.DataFrame,
-    top_n: int = 5
-) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+# ===================== Top words (normalized to [-1, +1]) =====================
+def top_words_weighted_by_sentiment(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """
+    Uses message sentiment to weight tokens, then normalizes scores into [-1, +1].
+    Positive table: 0..+1
+    Negative table: -1..0
+    """
     pos_w = defaultdict(float)
     neg_w = defaultdict(float)
 
-    for text, s in zip(df["text"], df["sentiment_final"]):
-        tokens = [
-            t for t in custom_tokenize(text)
-            if len(t) > 1 and t not in stopwords_all
-        ]
+    for text, s in zip(df["text"].astype(str).tolist(), df["sentiment_final"].astype(float).tolist()):
+        tokens = [t for t in custom_tokenize(text) if len(t) > 1 and t not in stopwords_all]
         if not tokens or abs(s) < 1e-9:
             continue
 
         if s > 0:
             for t in tokens:
-                pos_w[t] += s
+                pos_w[t] += float(s)
         elif s < 0:
             for t in tokens:
-                neg_w[t] += abs(s)
+                neg_w[t] += float(abs(s))
 
-    max_pos = max(pos_w.values(), default=
-    max_neg = max(neg_w.values(), default=
+    max_pos = max(pos_w.values(), default=0.0)
+    max_neg = max(neg_w.values(), default=0.0)
 
-    ...  # old list construction truncated in the capture
-        for w, v in sorted(pos_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
-    ]
+    pos_items = sorted(pos_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
+    neg_items = sorted(neg_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
 
+    if max_pos <= 1e-12:
+        pos_top = []
+    else:
+        pos_top = [{"word": w, "score": float(v / max_pos)} for w, v in pos_items]
+        for d in pos_top:
+            d["score"] = float(max(0.0, min(1.0, d["score"])))
+
+    if max_neg <= 1e-12:
+        neg_top = []
+    else:
+        neg_top = [{"word": w, "score": float(-(v / max_neg))} for w, v in neg_items]
+        for d in neg_top:
+            d["score"] = float(min(0.0, max(-1.0, d["score"])))
 
     return pos_top, neg_top
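A worked example of the normalization (sketch): the heaviest word anchors each table at ±1.0.

    # If pos_w came out as {"عالی": 2.4, "مرسی": 1.2}, then max_pos == 2.4 and:
    pos_top, neg_top = top_words_weighted_by_sentiment(df, top_n=5)
    # pos_top == [{"word": "عالی", "score": 1.0}, {"word": "مرسی", "score": 0.5}]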
 
 
-# ===================== Main =====================
+# ===================== Main entry =====================
-def analyze_chat(
-    ...  # old parameters truncated in the capture
+def analyze_chat(
+    chat: Dict[str, Any],
+    max_bert_persian: int = 500
+) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
+
     df = build_df(chat)
-    name = get_chat_name(chat, "
+    name = get_chat_name(chat, "Selected chat")
 
     if df.empty:
+        empty = {
+            "chat_name": name,
+            "message_count": 0,
+            "bert_used_on_persian_messages": 0,
+            "overall_avg_sentiment": 0.0,
+            "peak_word_main": None,
+            "low_word_main": None,
+            "peak_words": [],
+            "low_words": [],
+            "weekly": [],
+            "top5_positive_lex": [],
+            "top5_negative_lex": [],
+        }
         fig = plt.figure(figsize=(22, 8))
+        plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
+        plt.tight_layout()
+        return empty, fig, [], []
 
-    df, used = compute_sentiments(df,
+    df, used = compute_sentiments(df, max_bert_persian=max_bert_persian, bert_batch_size=16)
 
+    ws = weekly_series(df)
+    weekly_records = [{"week_end": idx.isoformat(), "avg_sentiment": float(val)} for idx, val in ws.items()]
+    overall = float(df["sentiment_final"].mean())
+
-    fig, _ = make_weekly_plot(df, name)
+    fig, info = make_weekly_plot(df, name)
+
+    pos_top, neg_top = top_words_weighted_by_sentiment(df, top_n=5)
 
     result = {
         "chat_name": name,
-        "message_count": len(df),
-        "bert_used_on_persian_messages": used,
-        "overall_avg_sentiment":
+        "message_count": int(len(df)),
+        "bert_used_on_persian_messages": int(used),
+        "overall_avg_sentiment": overall,
+        "weekly": weekly_records,
+
+        "peak_week_end": info["peak_week_end"],
+        "low_week_end": info["low_week_end"],
+        "peak_word_main": info["peak_word_main"],
+        "low_word_main": info["low_word_main"],
+        "peak_words": info["peak_words"],
+        "low_words": info["low_words"],
+
         "top5_positive_lex": pos_top,
         "top5_negative_lex": neg_top,
     }
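A minimal end-to-end sketch, assuming the module is importable as analysis_core and that analyze_chat finishes by returning the four-tuple its annotation promises (the captured diff cuts off at the result dict):

    import json
    from analysis_core import extract_chats, analyze_chat

    with open("result.json", encoding="utf-8") as f:
        data = json.load(f)

    chat = extract_chats(data)[0]
    result, fig, pos_top, neg_top = analyze_chat(chat, max_bert_persian=500)
    fig.savefig("weekly_sentiment.png", dpi=150)
    print(result["overall_avg_sentiment"], result["peak_word_main"])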