Sajjadistic committed on
Commit
52e6ffe
·
verified ·
1 Parent(s): 456ed3b

Update analysis_core.py

Browse files
Files changed (1) hide show
  1. analysis_core.py +302 -123
analysis_core.py CHANGED
@@ -36,7 +36,7 @@ _ensure_nltk()
36
  SIA = SentimentIntensityAnalyzer()
37
 
38
 
39
- # ===================== BERT (Persian) =====================
40
  MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"
41
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
 
@@ -45,7 +45,7 @@ MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE
45
  MODEL.eval()
46
 
47
 
48
- # ===================== Lexicons =====================
49
  persian_positive = {
50
  "خوب","عالی","عالیه","باحال","قشنگ","زیبا","خوشحال","شاد","مرسی","ممنون","ممنونم",
51
  "دمت","دمتگرم","دمت گرم","ایول","آفرین","دوستت","دوستت_دارم","دوستتدارم","عزیزم",
@@ -56,10 +56,13 @@ persian_negative = {
56
  "حالم_بده","حالمبده","😡","😠","😞","😔","😭","💔"
57
  }
58
 
 
 
 
59
  persian_stopwords = {
60
  "و","در","به","از","که","این","اون","آن","من","تو","ما","شما","او","هم","یا","اما","اگر",
61
- "برای","با","روی","تا","نه","را","همه","چی","چیه","چرا","کجا","کی","چه","یه","یک","بود",
62
- "هست","می","میشه","کرد","کردم","کن","کردی","کردن","دارم","داری","داره","داشت","شد","شده"
63
  }
64
  english_stopwords = {
65
  "the","a","an","and","or","but","if","then","else","to","of","in","on","at","for","with",
@@ -69,33 +72,6 @@ english_stopwords = {
69
  stopwords_all = persian_stopwords.union(english_stopwords)
70
 
71
 
72
- # ===================== Utils =====================
73
- _FA_RE = re.compile(r"[\u0600-\u06FF]")
74
-
75
-
76
- def _shape_fa(s: str) -> str:
77
- try:
78
- return get_display(arabic_reshaper.reshape(str(s)))
79
- except Exception:
80
- return str(s)
81
-
82
-
83
- def is_persian(text: str) -> bool:
84
- return bool(_FA_RE.search(text or ""))
85
-
86
-
87
- def custom_tokenize(text: str) -> List[str]:
88
- text = re.sub(r"http\S+|www\.\S+", " ", str(text))
89
- text = text.replace("\u200c", " ")
90
- tokens = re.findall(r"[\w\u0600-\u06FF]+", text)
91
- tokens = [
92
- t.replace("دوستتدارم", "دوستت_دارم")
93
- .replace("حالمبده", "حالم_بده")
94
- for t in tokens
95
- ]
96
- return tokens
97
-
98
-
99
  # ===================== Telegram parsing =====================
100
  def extract_text(msg_text: Any) -> str:
101
  if msg_text is None:
@@ -103,175 +79,378 @@ def extract_text(msg_text: Any) -> str:
103
  if isinstance(msg_text, str):
104
  return msg_text
105
  if isinstance(msg_text, list):
106
- out = []
107
- for p in msg_text:
108
- if isinstance(p, str):
109
- out.append(p)
110
- elif isinstance(p, dict) and isinstance(p.get("text"), str):
111
- out.append(p["text"])
112
- return "".join(out)
 
 
113
  return ""
114
 
115
 
116
  def extract_chats(data: Dict[str, Any]) -> List[Dict[str, Any]]:
117
- if "chats" in data and "list" in data["chats"]:
118
- return data["chats"]["list"]
119
- if "messages" in data:
 
 
120
  return [data]
121
- raise ValueError("Unsupported Telegram JSON format")
122
 
123
 
124
  def get_chat_name(chat: Dict[str, Any], fallback: str) -> str:
125
- return chat.get("name") or chat.get("title") or fallback
 
 
 
126
 
127
 
128
  def build_df(chat: Dict[str, Any]) -> pd.DataFrame:
129
- rows = []
130
  for msg in chat.get("messages", []):
131
- text = extract_text(msg.get("text"))
 
 
 
132
  if not text:
133
  continue
134
- date = msg.get("date")
135
- if not date:
 
136
  continue
137
- rows.append({
138
- "date": pd.to_datetime(date, utc=True).tz_convert(None),
139
- "text": text
140
- })
141
- return pd.DataFrame(rows).sort_values("date").reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
 
144
- # ===================== Sentiment =====================
145
- def score_en(text: str) -> float:
 
 
 
 
 
 
 
 
146
  return float(SIA.polarity_scores(text)["compound"])
147
 
148
 
149
  @torch.inference_mode()
150
- def score_fa_bert(texts: List[str], batch: int = 16) -> List[float]:
151
- out = []
152
- for i in range(0, len(texts), batch):
153
- t = texts[i:i+batch]
154
- inp = TOKENIZER(t, return_tensors="pt", truncation=True,
155
- padding=True, max_length=256).to(DEVICE)
156
- logits = MODEL(**inp).logits
157
- probs = torch.softmax(logits, dim=-1)
158
- diff = (probs[:,1] - probs[:,0]).cpu().numpy()
159
- out.extend(diff.tolist())
160
- return out
161
-
162
-
163
- def compute_sentiments(df: pd.DataFrame, max_bert: int = 500):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  df = df.copy()
165
  df["sentiment_final"] = 0.0
166
 
167
- fa_mask = df["text"].apply(is_persian)
 
 
168
 
169
- df.loc[~fa_mask, "sentiment_final"] = df.loc[
170
- ~fa_mask, "text"
171
- ].apply(score_en)
172
 
173
- df.loc[fa_mask, "sentiment_final"] = df.loc[
174
- fa_mask, "text"
175
- ].apply(lambda t: 0.0)
176
 
177
- idx = df.index[fa_mask][:max_bert]
178
- if len(idx):
179
- scores = score_fa_bert(df.loc[idx, "text"].tolist())
180
- df.loc[idx, "sentiment_final"] = scores
 
181
 
182
- return df, len(idx)
 
 
183
 
184
 
185
- # ===================== Weekly =====================
186
  def weekly_series(df: pd.DataFrame) -> pd.Series:
187
- return df.set_index("date").resample("W")["sentiment_final"].mean().dropna()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
 
190
  def make_weekly_plot(df: pd.DataFrame, chat_name: str):
191
  ws = weekly_series(df)
192
 
 
 
 
 
 
 
 
 
 
 
193
  fig = plt.figure(figsize=(22, 8))
194
  plt.title(_shape_fa(f"Emotion Trajectory in Chat: {chat_name}"))
195
  plt.xlabel("Time (weeks)")
196
  plt.ylabel("Average sentiment score")
197
- plt.axhline(0, linestyle="--")
198
  plt.grid(True)
 
199
 
200
  if ws.empty:
201
- return fig, {}
 
202
 
203
  x = ws.index
204
  y = ws.values.astype(float)
205
 
206
- ymin, ymax = float(y.min()), float(y.max())
207
- pad = 0.1 * max(1e-6, ymax - ymin)
208
- plt.ylim(ymin - pad, ymax + pad)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- plt.plot(x, y, marker="o", color="red", linewidth=2)
211
  plt.tight_layout()
 
212
 
213
- return fig, {}
214
-
215
-
216
- # ===================== TOP WORDS (NORMALIZED) =====================
217
- def top_words_weighted_by_sentiment(
218
- df: pd.DataFrame,
219
- top_n: int = 5
220
- ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
221
 
 
 
 
 
 
 
 
222
  pos_w = defaultdict(float)
223
  neg_w = defaultdict(float)
224
 
225
- for text, s in zip(df["text"], df["sentiment_final"]):
226
- tokens = [
227
- t for t in custom_tokenize(text)
228
- if len(t) > 1 and t not in stopwords_all
229
- ]
230
  if not tokens or abs(s) < 1e-9:
231
  continue
232
 
233
  if s > 0:
234
  for t in tokens:
235
- pos_w[t] += s
236
  elif s < 0:
237
  for t in tokens:
238
- neg_w[t] += abs(s)
239
 
240
- max_pos = max(pos_w.values(), default=1.0)
241
- max_neg = max(neg_w.values(), default=1.0)
242
 
243
- pos_top = [
244
- {"word": w, "score": v / max_pos}
245
- for w, v in sorted(pos_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
246
- ]
247
 
248
- neg_top = [
249
- {"word": w, "score": -v / max_neg}
250
- for w, v in sorted(neg_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
251
- ]
 
 
 
 
 
 
 
 
 
252
 
253
  return pos_top, neg_top
254
 
255
 
256
- # ===================== Main =====================
257
- def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 500):
 
 
 
 
258
  df = build_df(chat)
259
- name = get_chat_name(chat, "Chat")
260
 
261
  if df.empty:
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  fig = plt.figure(figsize=(22, 8))
263
- return {}, fig, [], []
 
 
264
 
265
- df, used = compute_sentiments(df, max_bert=max_bert_persian)
266
- fig, _ = make_weekly_plot(df, name)
267
 
268
- pos_top, neg_top = top_words_weighted_by_sentiment(df)
 
 
 
 
 
 
269
 
270
  result = {
271
  "chat_name": name,
272
- "message_count": len(df),
273
- "bert_used_on_persian_messages": used,
274
- "overall_avg_sentiment": float(df["sentiment_final"].mean()),
 
 
 
 
 
 
 
 
 
275
  "top5_positive_lex": pos_top,
276
  "top5_negative_lex": neg_top,
277
  }
 
36
  SIA = SentimentIntensityAnalyzer()
37
 
38
 
39
+ # ===================== Persian sentiment model =====================
40
  MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"
41
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
 
 
45
  MODEL.eval()
46
 
47
 
48
+ # ===================== Small lex helpers (not used for final top words scale anymore) =====================
49
  persian_positive = {
50
  "خوب","عالی","عالیه","باحال","قشنگ","زیبا","خوشحال","شاد","مرسی","ممنون","ممنونم",
51
  "دمت","دمتگرم","دمت گرم","ایول","آفرین","دوستت","دوستت_دارم","دوستتدارم","عزیزم",
 
56
  "حالم_بده","حالمبده","😡","😠","😞","😔","😭","💔"
57
  }
58
 
59
+ persian_positive_phrases = {"دوستت دارم", "خیلی دوستت دارم", "دمت گرم", "آفرین"}
60
+ persian_negative_phrases = {"حالم بده", "خیلی بده", "اعصابم خورد"}
61
+
62
  persian_stopwords = {
63
  "و","در","به","از","که","این","اون","آن","من","تو","ما","شما","او","هم","یا","اما","اگر",
64
+ "برای","با","روی","تا","نه","را","همه","چی","چیه","چرا","کجا","کی","چه","یه","یک","بود","هست",
65
+ "می","میشه","کرد","کردم","کن","کردی","کردن","دارم","داری","داره","داشت","شد","شده"
66
  }
67
  english_stopwords = {
68
  "the","a","an","and","or","but","if","then","else","to","of","in","on","at","for","with",
 
72
  stopwords_all = persian_stopwords.union(english_stopwords)
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
# ===================== Telegram parsing =====================
def extract_text(msg_text: Any) -> str:
    """Flatten a Telegram message ``text`` field into a plain string.

    Telegram exports store message text either as a plain string or as a
    list mixing strings and entity dicts (``{"type": ..., "text": ...}``).
    Anything else (including None) collapses to "".
    """
    if msg_text is None:
        return ""
    if isinstance(msg_text, str):
        return msg_text
    if isinstance(msg_text, list):
        pieces: List[str] = []
        for item in msg_text:
            if isinstance(item, str):
                pieces.append(item)
            elif isinstance(item, dict) and isinstance(item.get("text"), str):
                pieces.append(item["text"])
        return "".join(pieces)
    return ""
92
 
93
 
94
def extract_chats(data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the list of chat objects from a Telegram export payload.

    Accepts either a full export (``{"chats": {"list": [...]}}``) or a
    single chat object carrying its own ``messages`` list.

    Raises:
        ValueError: when neither shape is recognized.
    """
    if isinstance(data, dict):
        chats = data.get("chats")
        if isinstance(chats, dict) and "list" in chats:
            chat_list = chats["list"]
            if isinstance(chat_list, list):
                return chat_list
        if isinstance(data.get("messages"), list):
            return [data]
    raise ValueError("JSON format not recognized. Need Telegram export result.json with chats.list.")
102
 
103
 
104
def get_chat_name(chat: Dict[str, Any], fallback: str) -> str:
    """Pick a display name for a chat: ``name`` first, then ``title``, else *fallback*."""
    raw = chat.get("name") or chat.get("title")
    if not isinstance(raw, str):
        return fallback
    cleaned = raw.strip()
    # a whitespace-only name is treated as missing
    return cleaned if cleaned else fallback
109
 
110
 
111
def build_df(chat: Dict[str, Any]) -> pd.DataFrame:
    """Build a chronologically sorted DataFrame of messages for one chat.

    Keeps only dict messages with non-empty text and a string date.
    Returns columns ``date_raw``, ``text`` and parsed naive-UTC ``date``;
    rows whose date fails to parse are dropped.
    """
    collected: List[Dict[str, Any]] = []
    for message in chat.get("messages", []):
        if not isinstance(message, dict):
            continue

        body = extract_text(message.get("text", "")).strip()
        if not body:
            continue

        raw_date = message.get("date")
        if not isinstance(raw_date, str) or not raw_date:
            continue

        collected.append({"date_raw": raw_date, "text": body})

    frame = pd.DataFrame(collected)
    if frame.empty:
        return frame

    # parse once, vectorized; coerce bad dates to NaT then drop them
    frame["date"] = pd.to_datetime(frame["date_raw"], errors="coerce", utc=True).dt.tz_convert(None)
    return frame.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
135
+
136
# ===================== Tokenize + Persian detect =====================
# Any character in the Arabic Unicode block (U+0600..U+06FF) counts as Persian.
_FA_RE = re.compile(r"[\u0600-\u06FF]")


def is_persian(text: str) -> bool:
    """True when *text* contains at least one Arabic-block character; None-safe."""
    return _FA_RE.search(text or "") is not None
142
 
143
 
144
def custom_tokenize(text: str) -> List[str]:
    """Tokenize mixed Persian/English text.

    Strips URLs, converts zero-width non-joiners to spaces, splits on
    word/Arabic-block characters, and rewrites two known glued compounds
    into their underscore forms so they match the lexicons.
    """
    cleaned = re.sub(r"http\S+|www\.\S+", " ", str(text))
    cleaned = cleaned.replace("\u200c", " ")
    merged: List[str] = []
    for tok in re.findall(r"[\w\u0600-\u06FF]+", cleaned):
        tok = tok.replace("دوستتدارم", "دوستت_دارم")
        tok = tok.replace("حالمبده", "حالم_بده")
        merged.append(tok)
    return merged
150
+
151
+
152
# ===================== Sentiment scoring =====================
def score_en_vader(text: str) -> float:
    """VADER compound polarity in [-1, 1] for an English message."""
    compound = SIA.polarity_scores(text)["compound"]
    return float(compound)
155
 
156
 
157
  @torch.inference_mode()
158
+ def score_fa_bert_batch(texts: List[str], batch_size: int = 16) -> List[float]:
159
+ scores: List[float] = []
160
+ for i in range(0, len(texts), batch_size):
161
+ chunk = texts[i:i + batch_size]
162
+ inputs = TOKENIZER(chunk, return_tensors="pt", truncation=True, padding=True, max_length=256).to(DEVICE)
163
+ out = MODEL(**inputs)
164
+ probs = torch.softmax(out.logits, dim=-1)
165
+ diff = (probs[:, 1] - probs[:, 0]).detach().cpu().numpy().astype(float).tolist()
166
+ diff = [float(max(-1.0, min(1.0, d))) for d in diff] # keep sentiment in [-1, 1]
167
+ scores.extend(diff)
168
+ return scores
169
+
170
+
171
def persian_lexicon_score(text: str) -> float:
    """Lexicon-based Persian sentiment in [-1, 1].

    Counts positive/negative lexicon token hits, adds double-weight hits
    for multi-word phrases found in the raw (ZWNJ-flattened) text, and
    returns ``(pos - neg) / (pos + neg)`` (0.0 when there are no hits).
    """
    pos_hits = 0
    neg_hits = 0
    for token in custom_tokenize(text):
        if token in persian_positive:
            pos_hits += 1
        elif token in persian_negative:
            neg_hits += 1

    # phrase matching works on the flattened original text, not tokens
    flat = str(text).replace("\u200c", " ")
    for phrase in persian_positive_phrases:
        if phrase in flat:
            pos_hits += 2
    for phrase in persian_negative_phrases:
        if phrase in flat:
            neg_hits += 2

    return float((pos_hits - neg_hits) / max(1, (pos_hits + neg_hits)))
191
+
192
+
193
def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 500, bert_batch_size: int = 16) -> Tuple[pd.DataFrame, int]:
    """Attach a ``sentiment_final`` column to *df* and return (df, bert_count).

    English rows are scored with VADER; Persian rows get a lexicon score,
    and the first *max_bert_persian* Persian rows are then re-scored with
    the BERT model (which overrides the lexicon value). Scores are clipped
    to [-1, 1]; NaNs become 0.0.
    """
    if df.empty:
        return df, 0

    df = df.copy()
    df["sentiment_final"] = 0.0

    is_fa = df["text"].astype(str).apply(is_persian)
    fa_rows = df.index[is_fa].tolist()
    en_rows = df.index[~is_fa].tolist()

    if en_rows:
        df.loc[en_rows, "sentiment_final"] = df.loc[en_rows, "text"].astype(str).apply(score_en_vader)

    if fa_rows:
        df.loc[fa_rows, "sentiment_final"] = df.loc[fa_rows, "text"].astype(str).apply(persian_lexicon_score)

    # BERT is expensive: cap how many Persian rows it re-scores
    bert_rows = fa_rows[: max(0, int(max_bert_persian))]
    if bert_rows:
        bert_texts = df.loc[bert_rows, "text"].astype(str).tolist()
        df.loc[bert_rows, "sentiment_final"] = score_fa_bert_batch(bert_texts, batch_size=int(bert_batch_size))

    df["sentiment_final"] = df["sentiment_final"].fillna(0.0).clip(-1.0, 1.0)  # safety
    return df, len(bert_rows)
219
 
220
 
221
# ===================== Weekly aggregation + extremes =====================
def weekly_series(df: pd.DataFrame) -> pd.Series:
    """Mean sentiment per calendar week; weeks with no messages are dropped."""
    indexed = df.set_index("date")
    weekly_mean = indexed["sentiment_final"].resample("W").mean()
    return weekly_mean.dropna()
229
+
230
+
231
def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str):
    """Return the row with the extreme sentiment inside the 7 days ending at *week_end*.

    ``mode == "max"`` picks the most positive message; anything else picks
    the most negative. Returns None when the window holds no messages.
    """
    window_start = week_end - pd.Timedelta(days=7)
    in_window = df[(df["date"] > window_start) & (df["date"] <= week_end)]
    if in_window.empty:
        return None
    pick = in_window["sentiment_final"].idxmax() if mode == "max" else in_window["sentiment_final"].idxmin()
    return in_window.loc[pick]
239
+
240
+
241
def extract_lex_words_from_text(text: str, polarity: str, min_words: int = 4) -> List[str]:
    """Pull representative words of the requested polarity from *text*.

    First collects tokens present in the matching Persian lexicon
    (``polarity`` "pos" or "neg"); if fewer than *min_words* are found,
    pads with remaining non-stopword tokens. Returns at most
    ``max(min_words, 4)`` words.
    """
    candidates = [tok for tok in custom_tokenize(text) if len(tok) > 1 and tok not in stopwords_all]

    picked: List[str] = []
    for tok in candidates:
        if polarity == "pos" and tok in persian_positive:
            picked.append(tok)
        if polarity == "neg" and tok in persian_negative:
            picked.append(tok)

    # pad with any leftover tokens until the minimum is reached
    if len(picked) < min_words:
        for tok in candidates:
            if tok not in picked:
                picked.append(tok)
                if len(picked) >= min_words:
                    break

    return picked[:max(min_words, 4)]
259
+
260
+
261
# ===================== Plot helpers =====================
def _shape_fa(s: str) -> str:
    """Reshape and bidi-reorder Persian text so matplotlib renders it correctly.

    Falls back to plain ``str(s)`` if reshaping fails for any reason.
    """
    try:
        reshaped = arabic_reshaper.reshape(str(s))
        return get_display(reshaped)
    except Exception:
        return str(s)
267
 
268
 
269
def make_weekly_plot(df: pd.DataFrame, chat_name: str):
    """Plot the weekly average sentiment for a chat and annotate its extremes.

    Returns a tuple ``(fig, info)`` where *fig* is the matplotlib Figure and
    *info* is a dict with the peak/low week end timestamps (ISO strings) and
    representative words pulled from the most extreme message of each week.
    When there is no weekly data, returns the empty figure and the default
    (all-None/empty) info dict.

    NOTE(review): uses the global pyplot state machine (plt.*), so it is not
    safe to call concurrently from multiple threads.
    """
    ws = weekly_series(df)

    # Default payload; filled in only when weekly data exists.
    info: Dict[str, Any] = {
        "peak_week_end": None,
        "low_week_end": None,
        "peak_words": [],
        "low_words": [],
        "peak_word_main": None,
        "low_word_main": None,
    }

    # big plot so Persian annotations are readable
    fig = plt.figure(figsize=(22, 8))
    plt.title(_shape_fa(f"Emotion Trajectory in Chat: {chat_name}"))
    plt.xlabel("Time (weeks)")
    plt.ylabel("Average sentiment score")
    plt.grid(True)
    plt.axhline(0, linestyle="--")  # neutral-sentiment reference line

    if ws.empty:
        plt.tight_layout()
        return fig, info

    x = ws.index
    y = ws.values.astype(float)

    # show true peaks: full min/max range (padded so markers aren't clipped)
    y_min = float(np.min(y))
    y_max = float(np.max(y))
    pad = 0.08 * max(1e-9, (y_max - y_min))  # 1e-9 floor guards a flat series
    y_min_plot = y_min - pad
    y_max_plot = y_max + pad

    plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
    plt.ylim(y_min_plot, y_max_plot)
    plt.margins(x=0.03, y=0.15)

    # Week-end timestamps of the best and worst weekly averages.
    peak_week = ws.idxmax()
    low_week = ws.idxmin()

    info["peak_week_end"] = peak_week.isoformat()
    info["low_week_end"] = low_week.isoformat()

    # Most extreme single message inside each extreme week (may be None).
    peak_msg = message_in_week(df, peak_week, "max")
    low_msg = message_in_week(df, low_week, "min")

    peak_words = extract_lex_words_from_text(str(peak_msg["text"]) if peak_msg is not None else "", "pos", 4)
    low_words = extract_lex_words_from_text(str(low_msg["text"]) if low_msg is not None else "", "neg", 4)

    info["peak_words"] = peak_words
    info["low_words"] = low_words
    info["peak_word_main"] = peak_words[0] if peak_words else None
    info["low_word_main"] = low_words[0] if low_words else None

    peak_y = float(ws.loc[peak_week])
    low_y = float(ws.loc[low_week])

    # Keep annotation anchors inside the padded y-range.
    def clamp(val: float) -> float:
        return float(min(max(val, y_min_plot), y_max_plot))

    peak_y_plot = clamp(peak_y)
    low_y_plot = clamp(low_y)

    # Annotate the peak week above the axis top, the low week below the bottom.
    plt.scatter([peak_week], [peak_y_plot])
    plt.annotate(
        _shape_fa("، ".join(peak_words) + f" (peak={peak_y:.3f})"),
        xy=(peak_week, peak_y_plot),
        xytext=(peak_week, y_max_plot + 0.06 * (y_max_plot - y_min_plot)),
        arrowprops=dict(arrowstyle="->"),
        ha="center",
        fontsize=10,
    )

    plt.scatter([low_week], [low_y_plot])
    plt.annotate(
        _shape_fa("، ".join(low_words) + f" (low={low_y:.3f})"),
        xy=(low_week, low_y_plot),
        xytext=(low_week, y_min_plot - 0.06 * (y_max_plot - y_min_plot)),
        arrowprops=dict(arrowstyle="->"),
        ha="center",
        fontsize=10,
    )

    plt.tight_layout()
    return fig, info
355
 
 
 
 
 
 
 
 
 
356
 
357
# ===================== Top words (normalized to [-1, +1]) =====================
def top_words_weighted_by_sentiment(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Weight tokens by their message's sentiment and normalize into [-1, +1].

    Positive table scores fall in 0..+1, negative table scores in -1..0.
    Returns ``(pos_top, neg_top)`` lists of ``{"word", "score"}`` dicts,
    each at most *top_n* long; either may be empty when no weight accrued.
    """
    pos_weight: Dict[str, float] = defaultdict(float)
    neg_weight: Dict[str, float] = defaultdict(float)

    for text, score in zip(df["text"].astype(str).tolist(), df["sentiment_final"].astype(float).tolist()):
        words = [w for w in custom_tokenize(text) if len(w) > 1 and w not in stopwords_all]
        # skip empty token lists and effectively-neutral messages
        if not words or abs(score) < 1e-9:
            continue

        if score > 0:
            for w in words:
                pos_weight[w] += float(score)
        elif score < 0:
            for w in words:
                neg_weight[w] += float(abs(score))

    pos_peak = max(pos_weight.values(), default=0.0)
    neg_peak = max(neg_weight.values(), default=0.0)

    ranked_pos = sorted(pos_weight.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
    ranked_neg = sorted(neg_weight.items(), key=lambda kv: kv[1], reverse=True)[:top_n]

    pos_top: List[Dict[str, Any]] = []
    if pos_peak > 1e-12:
        # divide by the peak weight, then clamp to 0..+1
        pos_top = [
            {"word": w, "score": float(max(0.0, min(1.0, v / pos_peak)))}
            for w, v in ranked_pos
        ]

    neg_top: List[Dict[str, Any]] = []
    if neg_peak > 1e-12:
        # negate the normalized weight, then clamp to -1..0
        neg_top = [
            {"word": w, "score": float(min(0.0, max(-1.0, -(v / neg_peak))))}
            for w, v in ranked_neg
        ]

    return pos_top, neg_top
400
 
401
 
402
+ # ===================== Main entry =====================
403
+ def analyze_chat(
404
+ chat: Dict[str, Any],
405
+ max_bert_persian: int = 500
406
+ ) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
407
+
408
  df = build_df(chat)
409
+ name = get_chat_name(chat, "Selected chat")
410
 
411
  if df.empty:
412
+ empty = {
413
+ "chat_name": name,
414
+ "message_count": 0,
415
+ "bert_used_on_persian_messages": 0,
416
+ "overall_avg_sentiment": 0.0,
417
+ "peak_word_main": None,
418
+ "low_word_main": None,
419
+ "peak_words": [],
420
+ "low_words": [],
421
+ "weekly": [],
422
+ "top5_positive_lex": [],
423
+ "top5_negative_lex": [],
424
+ }
425
  fig = plt.figure(figsize=(22, 8))
426
+ plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
427
+ plt.tight_layout()
428
+ return empty, fig, [], []
429
 
430
+ df, used = compute_sentiments(df, max_bert_persian=max_bert_persian, bert_batch_size=16)
 
431
 
432
+ ws = weekly_series(df)
433
+ weekly_records = [{"week_end": idx.isoformat(), "avg_sentiment": float(val)} for idx, val in ws.items()]
434
+ overall = float(df["sentiment_final"].mean())
435
+
436
+ fig, info = make_weekly_plot(df, name)
437
+
438
+ pos_top, neg_top = top_words_weighted_by_sentiment(df, top_n=5)
439
 
440
  result = {
441
  "chat_name": name,
442
+ "message_count": int(len(df)),
443
+ "bert_used_on_persian_messages": int(used),
444
+ "overall_avg_sentiment": overall,
445
+ "weekly": weekly_records,
446
+
447
+ "peak_week_end": info["peak_week_end"],
448
+ "low_week_end": info["low_week_end"],
449
+ "peak_word_main": info["peak_word_main"],
450
+ "low_word_main": info["low_word_main"],
451
+ "peak_words": info["peak_words"],
452
+ "low_words": info["low_words"],
453
+
454
  "top5_positive_lex": pos_top,
455
  "top5_negative_lex": neg_top,
456
  }