Spaces:

Sajjadistic
/

telegram-sentiment-analysis

Sleeping

App Files Community

Sajjadistic committed on Dec 13, 2025

Commit

a273b11

verified ·

1 Parent(s): 306b7e2

Update analysis_core.py

Browse files

Files changed (1) hide show

analysis_core.py +20 -20

analysis_core.py CHANGED Viewed

@@ -212,11 +212,11 @@ def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 200) -> Tuple[p
     if en_idx:
         df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
-    # Persian: lexicon for all (fast baseline)
     if pers_idx:
         df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
-    # Persian: overwrite first N with BERT (better quality)
     bert_idx = pers_idx[: max(0, int(max_bert_persian))]
     if bert_idx:
         df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
@@ -235,7 +235,7 @@ def weekly_series(df: pd.DataFrame) -> pd.Series:
     )
-def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str) -> pd.Series | None:
     start = week_end - pd.Timedelta(days=7)
     sub = df[(df["date"] > start) & (df["date"] <= week_end)]
     if sub.empty:
@@ -255,7 +255,6 @@ def extract_lex_words_from_text(text: str, polarity: str, min_words: int = 4) ->
         if polarity == "neg" and t in persian_negative:
             out.append(t)
-    # if no lex hits, fall back to first tokens so annotation is never empty
     if len(out) < min_words:
         for t in tokens:
             if t not in out:
@@ -274,13 +273,7 @@ def _shape_fa(s: str) -> str:
         return str(s)
-def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, Any]]:
-    """
-    returns (fig, info_for_text_outputs)
-    info contains:
-      peak_week_end, low_week_end, peak_words, low_words,
-      peak_word_main, low_word_main
-    """
     ws = weekly_series(df)
     info: Dict[str, Any] = {
         "peak_week_end": None,
@@ -305,7 +298,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
     x = ws.index
     y = ws.values.astype(float)
-    # robust y-limits so one extreme point doesn't flatten everything
     q_lo = float(np.quantile(y, 0.05))
     q_hi = float(np.quantile(y, 0.95))
     if q_hi - q_lo < 0.15:
@@ -317,7 +310,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
     plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
-    # find peak/low week
     peak_week = ws.idxmax()
     low_week = ws.idxmin()
@@ -335,14 +328,11 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
     info["peak_word_main"] = peak_words[0] if peak_words else None
     info["low_word_main"] = low_words[0] if low_words else None
-    # annotate peak
     peak_y = float(ws.loc[peak_week])
     low_y = float(ws.loc[low_week])
-    # set y-limits (robust)
     plt.ylim(y_min_plot, y_max_plot)
-    # if peak/low is outside robust bounds, annotate at border with true value
     def clamp(val: float) -> float:
         return float(min(max(val, y_min_plot), y_max_plot))
@@ -376,7 +366,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
 # -------------------- Top lex words with score --------------------
 def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
     """
-    score definition (simple + consistent):
     positive word score = count(word) / total_tokens
     negative word score = - count(word) / total_tokens
     """
@@ -411,7 +401,11 @@ def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]
 # -------------------- Main entry --------------------
-def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
     df = build_df(chat)
     name = get_chat_name(chat, "Selected chat")
@@ -426,6 +420,8 @@ def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dic
             "peak_words": [],
             "low_words": [],
             "weekly": [],
         }
         fig = plt.figure(figsize=(12, 5))
         plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
@@ -449,12 +445,16 @@ def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dic
         "bert_used_on_persian_messages": int(used),
         "overall_avg_sentiment": overall,
         "weekly": weekly_records,
         "peak_week_end": info["peak_week_end"],
         "low_week_end": info["low_week_end"],
-        "peak_words": info["peak_words"],
-        "low_words": info["low_words"],
         "peak_word_main": info["peak_word_main"],
         "low_word_main": info["low_word_main"],
         "top5_positive_lex": pos_top,
         "top5_negative_lex": neg_top,
     }

     if en_idx:
         df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
+    # Persian: lexicon baseline for all
     if pers_idx:
         df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
+    # Persian: overwrite first N with BERT
     bert_idx = pers_idx[: max(0, int(max_bert_persian))]
     if bert_idx:
         df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
     )
+def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str):
     start = week_end - pd.Timedelta(days=7)
     sub = df[(df["date"] > start) & (df["date"] <= week_end)]
     if sub.empty:
         if polarity == "neg" and t in persian_negative:
             out.append(t)
     if len(out) < min_words:
         for t in tokens:
             if t not in out:
         return str(s)
+def make_weekly_plot(df: pd.DataFrame, chat_name: str):
     ws = weekly_series(df)
     info: Dict[str, Any] = {
         "peak_week_end": None,
     x = ws.index
     y = ws.values.astype(float)
+    # robust y-limits: use 5-95% so an extreme outlier doesn't flatten the plot
     q_lo = float(np.quantile(y, 0.05))
     q_hi = float(np.quantile(y, 0.95))
     if q_hi - q_lo < 0.15:
     plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
+    # peak/low weeks
     peak_week = ws.idxmax()
     low_week = ws.idxmin()
     info["peak_word_main"] = peak_words[0] if peak_words else None
     info["low_word_main"] = low_words[0] if low_words else None
     peak_y = float(ws.loc[peak_week])
     low_y = float(ws.loc[low_week])
     plt.ylim(y_min_plot, y_max_plot)
     def clamp(val: float) -> float:
         return float(min(max(val, y_min_plot), y_max_plot))
 # -------------------- Top lex words with score --------------------
 def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
     """
+    score definition:
     positive word score = count(word) / total_tokens
     negative word score = - count(word) / total_tokens
     """
 # -------------------- Main entry --------------------
+def analyze_chat(
+    chat: Dict[str, Any],
+    max_bert_persian: int = 200
+) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
     df = build_df(chat)
     name = get_chat_name(chat, "Selected chat")
             "peak_words": [],
             "low_words": [],
             "weekly": [],
+            "top5_positive_lex": [],
+            "top5_negative_lex": [],
         }
         fig = plt.figure(figsize=(12, 5))
         plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
         "bert_used_on_persian_messages": int(used),
         "overall_avg_sentiment": overall,
         "weekly": weekly_records,
+        # requirement (2)
         "peak_week_end": info["peak_week_end"],
         "low_week_end": info["low_week_end"],
         "peak_word_main": info["peak_word_main"],
         "low_word_main": info["low_word_main"],
+        "peak_words": info["peak_words"],
+        "low_words": info["low_words"],
+        # requirement (4)
         "top5_positive_lex": pos_top,
         "top5_negative_lex": neg_top,
     }