Sajjadistic committed on
Commit
a273b11
·
verified ·
1 Parent(s): 306b7e2

Update analysis_core.py

Browse files
Files changed (1) hide show
  1. analysis_core.py +20 -20
analysis_core.py CHANGED
@@ -212,11 +212,11 @@ def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 200) -> Tuple[p
212
  if en_idx:
213
  df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
214
 
215
- # Persian: lexicon for all (fast baseline)
216
  if pers_idx:
217
  df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
218
 
219
- # Persian: overwrite first N with BERT (better quality)
220
  bert_idx = pers_idx[: max(0, int(max_bert_persian))]
221
  if bert_idx:
222
  df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
@@ -235,7 +235,7 @@ def weekly_series(df: pd.DataFrame) -> pd.Series:
235
  )
236
 
237
 
238
- def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str) -> pd.Series | None:
239
  start = week_end - pd.Timedelta(days=7)
240
  sub = df[(df["date"] > start) & (df["date"] <= week_end)]
241
  if sub.empty:
@@ -255,7 +255,6 @@ def extract_lex_words_from_text(text: str, polarity: str, min_words: int = 4) ->
255
  if polarity == "neg" and t in persian_negative:
256
  out.append(t)
257
 
258
- # if no lex hits, fall back to first tokens so annotation is never empty
259
  if len(out) < min_words:
260
  for t in tokens:
261
  if t not in out:
@@ -274,13 +273,7 @@ def _shape_fa(s: str) -> str:
274
  return str(s)
275
 
276
 
277
- def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, Any]]:
278
- """
279
- returns (fig, info_for_text_outputs)
280
- info contains:
281
- peak_week_end, low_week_end, peak_words, low_words,
282
- peak_word_main, low_word_main
283
- """
284
  ws = weekly_series(df)
285
  info: Dict[str, Any] = {
286
  "peak_week_end": None,
@@ -305,7 +298,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
305
  x = ws.index
306
  y = ws.values.astype(float)
307
 
308
- # robust y-limits so one extreme point doesn't flatten everything
309
  q_lo = float(np.quantile(y, 0.05))
310
  q_hi = float(np.quantile(y, 0.95))
311
  if q_hi - q_lo < 0.15:
@@ -317,7 +310,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
317
 
318
  plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
319
 
320
- # find peak/low week
321
  peak_week = ws.idxmax()
322
  low_week = ws.idxmin()
323
 
@@ -335,14 +328,11 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
335
  info["peak_word_main"] = peak_words[0] if peak_words else None
336
  info["low_word_main"] = low_words[0] if low_words else None
337
 
338
- # annotate peak
339
  peak_y = float(ws.loc[peak_week])
340
  low_y = float(ws.loc[low_week])
341
 
342
- # set y-limits (robust)
343
  plt.ylim(y_min_plot, y_max_plot)
344
 
345
- # if peak/low is outside robust bounds, annotate at border with true value
346
  def clamp(val: float) -> float:
347
  return float(min(max(val, y_min_plot), y_max_plot))
348
 
@@ -376,7 +366,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
376
  # -------------------- Top lex words with score --------------------
377
  def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
378
  """
379
- score definition (simple + consistent):
380
  positive word score = count(word) / total_tokens
381
  negative word score = - count(word) / total_tokens
382
  """
@@ -411,7 +401,11 @@ def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]
411
 
412
 
413
  # -------------------- Main entry --------------------
414
- def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
 
 
 
 
415
  df = build_df(chat)
416
  name = get_chat_name(chat, "Selected chat")
417
 
@@ -426,6 +420,8 @@ def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dic
426
  "peak_words": [],
427
  "low_words": [],
428
  "weekly": [],
 
 
429
  }
430
  fig = plt.figure(figsize=(12, 5))
431
  plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
@@ -449,12 +445,16 @@ def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dic
449
  "bert_used_on_persian_messages": int(used),
450
  "overall_avg_sentiment": overall,
451
  "weekly": weekly_records,
 
 
452
  "peak_week_end": info["peak_week_end"],
453
  "low_week_end": info["low_week_end"],
454
- "peak_words": info["peak_words"],
455
- "low_words": info["low_words"],
456
  "peak_word_main": info["peak_word_main"],
457
  "low_word_main": info["low_word_main"],
 
 
 
 
458
  "top5_positive_lex": pos_top,
459
  "top5_negative_lex": neg_top,
460
  }
 
212
  if en_idx:
213
  df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
214
 
215
+ # Persian: lexicon baseline for all
216
  if pers_idx:
217
  df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
218
 
219
+ # Persian: overwrite first N with BERT
220
  bert_idx = pers_idx[: max(0, int(max_bert_persian))]
221
  if bert_idx:
222
  df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
 
235
  )
236
 
237
 
238
+ def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str):
239
  start = week_end - pd.Timedelta(days=7)
240
  sub = df[(df["date"] > start) & (df["date"] <= week_end)]
241
  if sub.empty:
 
255
  if polarity == "neg" and t in persian_negative:
256
  out.append(t)
257
 
 
258
  if len(out) < min_words:
259
  for t in tokens:
260
  if t not in out:
 
273
  return str(s)
274
 
275
 
276
+ def make_weekly_plot(df: pd.DataFrame, chat_name: str):
 
 
 
 
 
 
277
  ws = weekly_series(df)
278
  info: Dict[str, Any] = {
279
  "peak_week_end": None,
 
298
  x = ws.index
299
  y = ws.values.astype(float)
300
 
301
+ # robust y-limits: use 5-95% so an extreme outlier doesn't flatten the plot
302
  q_lo = float(np.quantile(y, 0.05))
303
  q_hi = float(np.quantile(y, 0.95))
304
  if q_hi - q_lo < 0.15:
 
310
 
311
  plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
312
 
313
+ # peak/low weeks
314
  peak_week = ws.idxmax()
315
  low_week = ws.idxmin()
316
 
 
328
  info["peak_word_main"] = peak_words[0] if peak_words else None
329
  info["low_word_main"] = low_words[0] if low_words else None
330
 
 
331
  peak_y = float(ws.loc[peak_week])
332
  low_y = float(ws.loc[low_week])
333
 
 
334
  plt.ylim(y_min_plot, y_max_plot)
335
 
 
336
  def clamp(val: float) -> float:
337
  return float(min(max(val, y_min_plot), y_max_plot))
338
 
 
366
  # -------------------- Top lex words with score --------------------
367
  def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
368
  """
369
+ score definition:
370
  positive word score = count(word) / total_tokens
371
  negative word score = - count(word) / total_tokens
372
  """
 
401
 
402
 
403
  # -------------------- Main entry --------------------
404
+ def analyze_chat(
405
+ chat: Dict[str, Any],
406
+ max_bert_persian: int = 200
407
+ ) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
408
+
409
  df = build_df(chat)
410
  name = get_chat_name(chat, "Selected chat")
411
 
 
420
  "peak_words": [],
421
  "low_words": [],
422
  "weekly": [],
423
+ "top5_positive_lex": [],
424
+ "top5_negative_lex": [],
425
  }
426
  fig = plt.figure(figsize=(12, 5))
427
  plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
 
445
  "bert_used_on_persian_messages": int(used),
446
  "overall_avg_sentiment": overall,
447
  "weekly": weekly_records,
448
+
449
+ # requirement (2)
450
  "peak_week_end": info["peak_week_end"],
451
  "low_week_end": info["low_week_end"],
 
 
452
  "peak_word_main": info["peak_word_main"],
453
  "low_word_main": info["low_word_main"],
454
+ "peak_words": info["peak_words"],
455
+ "low_words": info["low_words"],
456
+
457
+ # requirement (4)
458
  "top5_positive_lex": pos_top,
459
  "top5_negative_lex": neg_top,
460
  }