Sajjadistic committed on
Commit
8b31d5e
·
verified ·
1 Parent(s): 94c5181

Update analysis_core.py

Browse files
Files changed (1) hide show
  1. analysis_core.py +130 -204
analysis_core.py CHANGED
@@ -1,7 +1,7 @@
1
  from __future__ import annotations
2
 
3
  import re
4
- from typing import Any, Dict, List, Optional, Tuple
5
 
6
  import numpy as np
7
  import pandas as pd
@@ -30,64 +30,18 @@ def _ensure_nltk() -> None:
30
  except LookupError:
31
  nltk.download("vader_lexicon", quiet=True)
32
 
33
- try:
34
- nltk.data.find("tokenizers/punkt_tab")
35
- except LookupError:
36
- try:
37
- nltk.download("punkt_tab", quiet=True)
38
- except Exception:
39
- pass
40
-
41
 
42
  _ensure_nltk()
43
- sia = SentimentIntensityAnalyzer()
44
 
45
 
46
- # -------------------- Transformer (ParsBERT DeepSentiPers) --------------------
47
  MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"
48
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
 
50
  TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
51
- BERT_MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
52
- BERT_MODEL.eval()
53
-
54
-
55
- # -------------------- Lexicons (from your notebook idea) --------------------
56
- persian_positive = {
57
- "خوب","خیلی خوب","عالی","عالیه","خفن","باحال","نایس","قشنگ","زیبا","خوشگل",
58
- "عاشق","عاشقتم","دوستت","دوستت_دارم","دوستتدارم","مرسی","ممنون","ممنونم",
59
- "دمت گرم","دمت‌گرم","شاد","خوشحال","خوشحالم","آرومم","آرامشم","خوشبختم","راضیم",
60
- "بی‌نظیر","فوق‌العاده","توپ","محشر","شگفت‌انگیز","ایول","قربونت","قربانت","عزیزمی","عزیزم",
61
- "❤️","💖","💗","💙","💚","💛","💜","💕"
62
- }
63
-
64
- persian_negative = {
65
- "بد","خیلی بد","بدم","افتضاح","مزخرف","چرند","حالم_بده","حالمبده","ناراحت","غمگین",
66
- "اعصابم","اعصاب","کلافه","خسته","داغون","نفرت","متنفرم","لعنت","لعنتی","مسخره",
67
- "😡","😠","😞","😔","😭","💔"
68
- }
69
-
70
- persian_positive_phrases = {
71
- "خیلی دوستت دارم", "دوستت دارم", "دمت گرم", "آفرین", "دمت", "به‌به", "به به"
72
- }
73
-
74
- persian_negative_phrases = {
75
- "حالم بده", "خیلی بده", "اعصابم خورد", "حوصله ندارم"
76
- }
77
-
78
- persian_stopwords = {
79
- "و","در","به","از","که","این","اون","آن","من","تو","ما","شما","او","هیچ","هم","یا","اما","اگر",
80
- "برای","با","روی","تا","نه","را","همه","چی","چیه","چرا","کجا","کی","چه","یه","یک","بود","هست",
81
- "می","میخوام","میشه","کرد","کردم","کن","کردی","کردن","دارم","داری","داره","داشت","شد","شده"
82
- }
83
-
84
- english_stopwords = {
85
- "the","a","an","and","or","but","if","then","else","to","of","in","on","at","for","with",
86
- "is","am","are","was","were","be","been","being","i","you","he","she","we","they",
87
- "this","that","these","those","it","as","by","from","not","no","yes","do","does","did"
88
- }
89
-
90
- stopwords_all = persian_stopwords.union(english_stopwords)
91
 
92
 
93
  # -------------------- Telegram parsing --------------------
@@ -101,8 +55,10 @@ def extract_text(msg_text: Any) -> str:
101
  for part in msg_text:
102
  if isinstance(part, str):
103
  parts.append(part)
104
- elif isinstance(part, dict) and "text" in part and isinstance(part["text"], str):
105
- parts.append(part["text"])
 
 
106
  return "".join(parts)
107
  return ""
108
 
@@ -114,7 +70,7 @@ def extract_chats(data: Dict[str, Any]) -> List[Dict[str, Any]]:
114
  return lst
115
  if isinstance(data, dict) and "messages" in data and isinstance(data["messages"], list):
116
  return [data]
117
- raise ValueError("JSON format not recognized. expected Telegram export with data['chats']['list'].")
118
 
119
 
120
  def get_chat_name(chat: Dict[str, Any], fallback: str) -> str:
@@ -124,28 +80,27 @@ def get_chat_name(chat: Dict[str, Any], fallback: str) -> str:
124
  return fallback
125
 
126
 
127
- def build_df(selected_chat: Dict[str, Any]) -> pd.DataFrame:
128
- records: List[Dict[str, Any]] = []
129
- for msg in selected_chat.get("messages", []):
130
  if not isinstance(msg, dict):
131
  continue
 
132
  text = extract_text(msg.get("text", "")).strip()
133
  if not text:
134
  continue
135
- date_str = msg.get("date")
136
- if not isinstance(date_str, str) or not date_str:
 
137
  continue
 
138
  sender = msg.get("from") or msg.get("actor") or "Unknown"
139
- records.append(
140
- {
141
- "id": msg.get("id"),
142
- "date_raw": date_str,
143
- "sender": sender,
144
- "text": text,
145
- }
146
- )
147
-
148
- df = pd.DataFrame(records)
149
  if df.empty:
150
  return df
151
 
@@ -154,178 +109,149 @@ def build_df(selected_chat: Dict[str, Any]) -> pd.DataFrame:
154
  return df
155
 
156
 
157
- # -------------------- Tokenization + language detection --------------------
158
- def contains_persian(text: str) -> bool:
159
- return any("\u0600" <= ch <= "\u06FF" for ch in str(text))
160
-
161
-
162
- def custom_tokenize(text: str) -> List[str]:
163
- text = re.sub(r"http\S+|www\.\S+", " ", str(text))
164
- text = text.replace("\u200c", " ")
165
- tokens = re.findall(r"[\w\u0600-\u06FF]+", text)
166
- tokens = [t.replace("دوستتدارم", "دوستت_دارم").replace("حالمبده", "حالم_بده") for t in tokens]
167
- return tokens
168
-
169
 
170
- def persian_lexicon_score(text: str) -> float:
171
- text = str(text)
172
- tokens = custom_tokenize(text)
173
 
174
- pos = 0
175
- neg = 0
176
 
177
- for t in tokens:
178
- if t in persian_positive:
179
- pos += 1
180
- elif t in persian_negative:
181
- neg += 1
182
 
183
- norm_text = text.replace("\u200c", " ")
184
- for phrase in persian_positive_phrases:
185
- if phrase in norm_text:
186
- pos += 2
187
- for phrase in persian_negative_phrases:
188
- if phrase in norm_text:
189
- neg += 2
190
-
191
- score = (pos - neg) / max(1, (pos + neg))
192
- return float(max(-1.0, min(1.0, score)))
193
 
194
 
195
  @torch.inference_mode()
196
- def persian_sent_bert(text: str) -> float:
 
 
197
  inputs = TOKENIZER(text, return_tensors="pt", truncation=True, max_length=256).to(DEVICE)
198
- out = BERT_MODEL(**inputs)
199
  probs = torch.softmax(out.logits, dim=-1).squeeze(0)
200
-
201
  if probs.numel() >= 2:
202
- score = float(probs[1] - probs[0]) # 0=neg, 1=pos
203
  return float(max(-1.0, min(1.0, score)))
204
  return 0.0
205
 
206
 
207
- def persian_sentiment_hybrid(text: str) -> float:
208
- # hybrid: transformer + lexicon (keeps your notebook spirit)
209
- trf = persian_sent_bert(text)
210
- lex = persian_lexicon_score(text)
211
- return float(0.7 * trf + 0.3 * lex)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
 
 
 
 
213
 
214
- def label_sentiment(score: float, th: float = 0.1) -> str:
215
- if score > th:
216
- return "positive"
217
- if score < -th:
218
- return "negative"
219
- return "neutral"
220
 
221
 
222
- # -------------------- Persian shaping for plot --------------------
223
- def shape_text(text: str) -> str:
224
- text = str(text)
225
  try:
226
- reshaped = arabic_reshaper.reshape(text)
227
- return get_display(reshaped)
228
  except Exception:
229
- return text
230
 
231
 
232
- # -------------------- Plot helpers (weekly + annotate peak/low) --------------------
233
- def _weekly_df(df: pd.DataFrame) -> pd.DataFrame:
234
- df_time = (
 
 
 
 
 
 
 
 
 
235
  df.set_index("date")
236
  .resample("W")["sentiment_final"]
237
  .mean()
238
- .to_frame("avg_sentiment")
239
  .dropna()
240
  )
241
- return df_time
242
-
243
-
244
- def _extreme_message(df: pd.DataFrame, week_end: pd.Timestamp, mode: str = "max"):
245
- start = week_end - pd.Timedelta(days=7)
246
- sub = df[(df["date"] > start) & (df["date"] <= week_end)]
247
- if sub.empty:
248
- return None
249
- if mode == "max":
250
- return sub.loc[sub["sentiment_final"].idxmax()]
251
- return sub.loc[sub["sentiment_final"].idxmin()]
252
-
253
-
254
- def _extract_words_from_message(text: str, polarity: str = "pos", min_words: int = 4) -> List[str]:
255
- tokens = custom_tokenize(text)
256
- tokens = [t for t in tokens if len(t) > 1 and t not in stopwords_all]
257
-
258
- words: List[str] = []
259
-
260
- for t in tokens:
261
- if polarity == "pos" and t in persian_positive:
262
- words.append(t)
263
- elif polarity == "neg" and t in persian_negative:
264
- words.append(t)
265
-
266
- if len(words) < min_words:
267
- for t in tokens:
268
- if t not in words:
269
- words.append(t)
270
- if len(words) >= min_words:
271
- break
272
-
273
- return words[:max(min_words, 4)]
274
-
275
 
276
- def make_weekly_plot(df: pd.DataFrame, chat_name: str):
277
- df_time = _weekly_df(df)
278
- fig = plt.figure(figsize=(12, 5))
279
 
280
- plt.plot(df_time.index, df_time["avg_sentiment"], color="red")
281
  plt.axhline(0, linestyle="--")
282
- plt.title(shape_text(f"Emotion Trajectory in Chat: {chat_name}"))
283
- plt.xlabel("Time (weeks)")
284
- plt.ylabel("Average sentiment score")
285
- plt.grid(True)
286
 
287
- if not df_time.empty:
288
- peak_week = df_time["avg_sentiment"].idxmax()
289
- low_week = df_time["avg_sentiment"].idxmin()
290
-
291
- peak_msg = _extreme_message(df, peak_week, "max")
292
- low_msg = _extreme_message(df, low_week, "min")
293
-
294
- peak_words = _extract_words_from_message(peak_msg["text"], "pos", 4) if peak_msg is not None else []
295
- low_words = _extract_words_from_message(low_msg["text"], "neg", 4) if low_msg is not None else []
296
-
297
- plt.scatter([peak_week], [df_time.loc[peak_week, "avg_sentiment"]])
298
- plt.annotate(
299
- shape_text("، ".join(peak_words)),
300
- xy=(peak_week, df_time.loc[peak_week, "avg_sentiment"]),
301
- xytext=(peak_week, df_time["avg_sentiment"].max() + 0.05),
302
- arrowprops=dict(arrowstyle="->"),
303
- ha="center",
304
- fontsize=10,
305
- )
306
-
307
- plt.scatter([low_week], [df_time.loc[low_week, "avg_sentiment"]])
308
- plt.annotate(
309
- shape_text("، ".join(low_words)),
310
- xy=(low_week, df_time.loc[low_week, "avg_sentiment"]),
311
- xytext=(low_week, df_time["avg_sentiment"].min() - 0.05),
312
- arrowprops=dict(arrowstyle="->"),
313
- ha="center",
314
- fontsize=10,
315
- )
316
 
317
  plt.tight_layout()
318
  return fig
319
 
320
 
321
- # -------------------- Main analysis (matches your notebook behavior) --------------------
322
- def analyze_selected_chat(chat: Dict[str, Any], n_trf: int = 100) -> Tuple[Dict[str, Any], Any]:
323
  df = build_df(chat)
324
- chat_name = get_chat_name(chat, "Selected chat")
325
 
326
  if df.empty:
327
- empty = {
328
- "chat_name": chat_name,
329
  "message_count": 0,
330
- "transformer_used_on_persian_messages": 0,
331
- "over
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import re
4
+ from typing import Any, Dict, List, Tuple
5
 
6
  import numpy as np
7
  import pandas as pd
 
30
  except LookupError:
31
  nltk.download("vader_lexicon", quiet=True)
32
 
 
 
 
 
 
 
 
 
33
 
34
  _ensure_nltk()
35
+ SIA = SentimentIntensityAnalyzer()
36
 
37
 
38
+ # -------------------- Model (Persian sentiment) --------------------
39
  MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"
40
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
41
 
42
  TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
43
+ MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
44
+ MODEL.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  # -------------------- Telegram parsing --------------------
 
55
  for part in msg_text:
56
  if isinstance(part, str):
57
  parts.append(part)
58
+ elif isinstance(part, dict):
59
+ t = part.get("text")
60
+ if isinstance(t, str):
61
+ parts.append(t)
62
  return "".join(parts)
63
  return ""
64
 
 
70
  return lst
71
  if isinstance(data, dict) and "messages" in data and isinstance(data["messages"], list):
72
  return [data]
73
+ raise ValueError("JSON format not recognized. Need Telegram export result.json with chats.list.")
74
 
75
 
76
  def get_chat_name(chat: Dict[str, Any], fallback: str) -> str:
 
80
  return fallback
81
 
82
 
83
+ def build_df(chat: Dict[str, Any]) -> pd.DataFrame:
84
+ rows: List[Dict[str, Any]] = []
85
+ for msg in chat.get("messages", []):
86
  if not isinstance(msg, dict):
87
  continue
88
+
89
  text = extract_text(msg.get("text", "")).strip()
90
  if not text:
91
  continue
92
+
93
+ date_raw = msg.get("date")
94
+ if not isinstance(date_raw, str) or not date_raw:
95
  continue
96
+
97
  sender = msg.get("from") or msg.get("actor") or "Unknown"
98
+ if not isinstance(sender, str):
99
+ sender = str(sender)
100
+
101
+ rows.append({"date_raw": date_raw, "sender": sender, "text": text})
102
+
103
+ df = pd.DataFrame(rows)
 
 
 
 
104
  if df.empty:
105
  return df
106
 
 
109
  return df
110
 
111
 
112
+ # -------------------- Sentiment scoring --------------------
113
+ _FA_RE = re.compile(r"[\u0600-\u06FF]")
 
 
 
 
 
 
 
 
 
 
114
 
 
 
 
115
 
116
+ def is_persian(text: str) -> bool:
117
+ return bool(_FA_RE.search(text or ""))
118
 
 
 
 
 
 
119
 
120
+ def score_en_vader(text: str) -> float:
121
+ return float(SIA.polarity_scores(text)["compound"])
 
 
 
 
 
 
 
 
122
 
123
 
124
  @torch.inference_mode()
125
+ def score_fa_bert(text: str) -> float:
126
+ if not text:
127
+ return 0.0
128
  inputs = TOKENIZER(text, return_tensors="pt", truncation=True, max_length=256).to(DEVICE)
129
+ out = MODEL(**inputs)
130
  probs = torch.softmax(out.logits, dim=-1).squeeze(0)
 
131
  if probs.numel() >= 2:
132
+ score = float(probs[1] - probs[0]) # assume 0=neg, 1=pos
133
  return float(max(-1.0, min(1.0, score)))
134
  return 0.0
135
 
136
 
137
+ def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 200) -> Tuple[pd.DataFrame, int]:
138
+ """
139
+ - english: vader only
140
+ - persian: bert, but only first max_bert_persian persian messages (speed)
141
+ - for remaining persian messages: use vader (fallback) to avoid NaNs
142
+ """
143
+ if df.empty:
144
+ return df, 0
145
+
146
+ df = df.copy()
147
+ df["sentiment_final"] = np.nan
148
+
149
+ pers_mask = df["text"].astype(str).apply(is_persian)
150
+ pers_idx = df.index[pers_mask].tolist()
151
+ en_idx = df.index[~pers_mask].tolist()
152
+
153
+ # english
154
+ if en_idx:
155
+ df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
156
+
157
+ # persian - bert on first N
158
+ bert_idx = pers_idx[: max(0, int(max_bert_persian))]
159
+ if bert_idx:
160
+ df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
161
 
162
+ # persian fallback - vader
163
+ rest_idx = [i for i in pers_idx if i not in set(bert_idx)]
164
+ if rest_idx:
165
+ df.loc[rest_idx, "sentiment_final"] = df.loc[rest_idx, "text"].astype(str).apply(score_en_vader)
166
 
167
+ df["sentiment_final"] = df["sentiment_final"].fillna(0.0)
168
+ return df, len(bert_idx)
 
 
 
 
169
 
170
 
171
+ # -------------------- Plotting --------------------
172
+ def _shape_fa(s: str) -> str:
 
173
  try:
174
+ return get_display(arabic_reshaper.reshape(str(s)))
 
175
  except Exception:
176
+ return str(s)
177
 
178
 
179
+ def make_weekly_plot(df: pd.DataFrame, chat_name: str):
180
+ fig = plt.figure(figsize=(12, 5))
181
+ plt.title(_shape_fa(f"Emotion Trajectory in Chat: {chat_name}"))
182
+ plt.xlabel("Time (weeks)")
183
+ plt.ylabel("Average sentiment score")
184
+ plt.grid(True)
185
+
186
+ if df.empty:
187
+ plt.tight_layout()
188
+ return fig
189
+
190
+ weekly = (
191
  df.set_index("date")
192
  .resample("W")["sentiment_final"]
193
  .mean()
 
194
  .dropna()
195
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
+ if weekly.empty:
198
+ plt.tight_layout()
199
+ return fig
200
 
201
+ plt.plot(weekly.index, weekly.values, color="red")
202
  plt.axhline(0, linestyle="--")
 
 
 
 
203
 
204
+ # mark max/min weeks
205
+ wmax = weekly.idxmax()
206
+ wmin = weekly.idxmin()
207
+ plt.scatter([wmax], [float(weekly.loc[wmax])])
208
+ plt.scatter([wmin], [float(weekly.loc[wmin])])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  plt.tight_layout()
211
  return fig
212
 
213
 
214
+ # -------------------- Main API for UI --------------------
215
+ def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dict[str, Any], Any]:
216
  df = build_df(chat)
217
+ name = get_chat_name(chat, "Selected chat")
218
 
219
  if df.empty:
220
+ result = {
221
+ "chat_name": name,
222
  "message_count": 0,
223
+ "bert_used_on_persian_messages": 0,
224
+ "overall_avg_sentiment": 0.0,
225
+ "weekly": [],
226
+ }
227
+ fig = make_weekly_plot(df, name)
228
+ return result, fig
229
+
230
+ df, used = compute_sentiments(df, max_bert_persian=max_bert_persian)
231
+
232
+ overall = float(df["sentiment_final"].mean())
233
+
234
+ weekly = (
235
+ df.set_index("date")
236
+ .resample("W")["sentiment_final"]
237
+ .mean()
238
+ .dropna()
239
+ .reset_index()
240
+ .rename(columns={"date": "week_end", "sentiment_final": "avg_sentiment"})
241
+ )
242
+
243
+ weekly_records = [
244
+ {"week_end": r["week_end"].isoformat(), "avg_sentiment": float(r["avg_sentiment"])}
245
+ for _, r in weekly.iterrows()
246
+ ]
247
+
248
+ result = {
249
+ "chat_name": name,
250
+ "message_count": int(len(df)),
251
+ "bert_used_on_persian_messages": int(used),
252
+ "overall_avg_sentiment": overall,
253
+ "weekly": weekly_records,
254
+ }
255
+
256
+ fig = make_weekly_plot(df, name)
257
+ return result, fig