Update analysis_core.py
Browse files
- analysis_core.py +20 -20
analysis_core.py
CHANGED
|
@@ -212,11 +212,11 @@ def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 200) -> Tuple[p
|
|
| 212 |
if en_idx:
|
| 213 |
df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
|
| 214 |
|
| 215 |
-
# Persian: lexicon for all
|
| 216 |
if pers_idx:
|
| 217 |
df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
|
| 218 |
|
| 219 |
-
# Persian: overwrite first N with BERT
|
| 220 |
bert_idx = pers_idx[: max(0, int(max_bert_persian))]
|
| 221 |
if bert_idx:
|
| 222 |
df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
|
|
@@ -235,7 +235,7 @@ def weekly_series(df: pd.DataFrame) -> pd.Series:
|
|
| 235 |
)
|
| 236 |
|
| 237 |
|
| 238 |
-
def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str)
|
| 239 |
start = week_end - pd.Timedelta(days=7)
|
| 240 |
sub = df[(df["date"] > start) & (df["date"] <= week_end)]
|
| 241 |
if sub.empty:
|
|
@@ -255,7 +255,6 @@ def extract_lex_words_from_text(text: str, polarity: str, min_words: int = 4) ->
|
|
| 255 |
if polarity == "neg" and t in persian_negative:
|
| 256 |
out.append(t)
|
| 257 |
|
| 258 |
-
# if no lex hits, fall back to first tokens so annotation is never empty
|
| 259 |
if len(out) < min_words:
|
| 260 |
for t in tokens:
|
| 261 |
if t not in out:
|
|
@@ -274,13 +273,7 @@ def _shape_fa(s: str) -> str:
|
|
| 274 |
return str(s)
|
| 275 |
|
| 276 |
|
| 277 |
-
def make_weekly_plot(df: pd.DataFrame, chat_name: str)
|
| 278 |
-
"""
|
| 279 |
-
returns (fig, info_for_text_outputs)
|
| 280 |
-
info contains:
|
| 281 |
-
peak_week_end, low_week_end, peak_words, low_words,
|
| 282 |
-
peak_word_main, low_word_main
|
| 283 |
-
"""
|
| 284 |
ws = weekly_series(df)
|
| 285 |
info: Dict[str, Any] = {
|
| 286 |
"peak_week_end": None,
|
|
@@ -305,7 +298,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
|
|
| 305 |
x = ws.index
|
| 306 |
y = ws.values.astype(float)
|
| 307 |
|
| 308 |
-
# robust y-limits so
|
| 309 |
q_lo = float(np.quantile(y, 0.05))
|
| 310 |
q_hi = float(np.quantile(y, 0.95))
|
| 311 |
if q_hi - q_lo < 0.15:
|
|
@@ -317,7 +310,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
|
|
| 317 |
|
| 318 |
plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
|
| 319 |
|
| 320 |
-
#
|
| 321 |
peak_week = ws.idxmax()
|
| 322 |
low_week = ws.idxmin()
|
| 323 |
|
|
@@ -335,14 +328,11 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
|
|
| 335 |
info["peak_word_main"] = peak_words[0] if peak_words else None
|
| 336 |
info["low_word_main"] = low_words[0] if low_words else None
|
| 337 |
|
| 338 |
-
# annotate peak
|
| 339 |
peak_y = float(ws.loc[peak_week])
|
| 340 |
low_y = float(ws.loc[low_week])
|
| 341 |
|
| 342 |
-
# set y-limits (robust)
|
| 343 |
plt.ylim(y_min_plot, y_max_plot)
|
| 344 |
|
| 345 |
-
# if peak/low is outside robust bounds, annotate at border with true value
|
| 346 |
def clamp(val: float) -> float:
|
| 347 |
return float(min(max(val, y_min_plot), y_max_plot))
|
| 348 |
|
|
@@ -376,7 +366,7 @@ def make_weekly_plot(df: pd.DataFrame, chat_name: str) -> Tuple[Any, Dict[str, A
|
|
| 376 |
# -------------------- Top lex words with score --------------------
|
| 377 |
def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
| 378 |
"""
|
| 379 |
-
score definition
|
| 380 |
positive word score = count(word) / total_tokens
|
| 381 |
negative word score = - count(word) / total_tokens
|
| 382 |
"""
|
|
@@ -411,7 +401,11 @@ def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]
|
|
| 411 |
|
| 412 |
|
| 413 |
# -------------------- Main entry --------------------
|
| 414 |
-
def analyze_chat(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
df = build_df(chat)
|
| 416 |
name = get_chat_name(chat, "Selected chat")
|
| 417 |
|
|
@@ -426,6 +420,8 @@ def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dic
|
|
| 426 |
"peak_words": [],
|
| 427 |
"low_words": [],
|
| 428 |
"weekly": [],
|
|
|
|
|
|
|
| 429 |
}
|
| 430 |
fig = plt.figure(figsize=(12, 5))
|
| 431 |
plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
|
|
@@ -449,12 +445,16 @@ def analyze_chat(chat: Dict[str, Any], max_bert_persian: int = 200) -> Tuple[Dic
|
|
| 449 |
"bert_used_on_persian_messages": int(used),
|
| 450 |
"overall_avg_sentiment": overall,
|
| 451 |
"weekly": weekly_records,
|
|
|
|
|
|
|
| 452 |
"peak_week_end": info["peak_week_end"],
|
| 453 |
"low_week_end": info["low_week_end"],
|
| 454 |
-
"peak_words": info["peak_words"],
|
| 455 |
-
"low_words": info["low_words"],
|
| 456 |
"peak_word_main": info["peak_word_main"],
|
| 457 |
"low_word_main": info["low_word_main"],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
"top5_positive_lex": pos_top,
|
| 459 |
"top5_negative_lex": neg_top,
|
| 460 |
}
|
|
|
|
| 212 |
if en_idx:
|
| 213 |
df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
|
| 214 |
|
| 215 |
+
# Persian: lexicon baseline for all
|
| 216 |
if pers_idx:
|
| 217 |
df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
|
| 218 |
|
| 219 |
+
# Persian: overwrite first N with BERT
|
| 220 |
bert_idx = pers_idx[: max(0, int(max_bert_persian))]
|
| 221 |
if bert_idx:
|
| 222 |
df.loc[bert_idx, "sentiment_final"] = df.loc[bert_idx, "text"].astype(str).apply(score_fa_bert)
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
|
| 238 |
+
def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str):
|
| 239 |
start = week_end - pd.Timedelta(days=7)
|
| 240 |
sub = df[(df["date"] > start) & (df["date"] <= week_end)]
|
| 241 |
if sub.empty:
|
|
|
|
| 255 |
if polarity == "neg" and t in persian_negative:
|
| 256 |
out.append(t)
|
| 257 |
|
|
|
|
| 258 |
if len(out) < min_words:
|
| 259 |
for t in tokens:
|
| 260 |
if t not in out:
|
|
|
|
| 273 |
return str(s)
|
| 274 |
|
| 275 |
|
| 276 |
+
def make_weekly_plot(df: pd.DataFrame, chat_name: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
ws = weekly_series(df)
|
| 278 |
info: Dict[str, Any] = {
|
| 279 |
"peak_week_end": None,
|
|
|
|
| 298 |
x = ws.index
|
| 299 |
y = ws.values.astype(float)
|
| 300 |
|
| 301 |
+
# robust y-limits: use 5-95% so an extreme outlier doesn't flatten the plot
|
| 302 |
q_lo = float(np.quantile(y, 0.05))
|
| 303 |
q_hi = float(np.quantile(y, 0.95))
|
| 304 |
if q_hi - q_lo < 0.15:
|
|
|
|
| 310 |
|
| 311 |
plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
|
| 312 |
|
| 313 |
+
# peak/low weeks
|
| 314 |
peak_week = ws.idxmax()
|
| 315 |
low_week = ws.idxmin()
|
| 316 |
|
|
|
|
| 328 |
info["peak_word_main"] = peak_words[0] if peak_words else None
|
| 329 |
info["low_word_main"] = low_words[0] if low_words else None
|
| 330 |
|
|
|
|
| 331 |
peak_y = float(ws.loc[peak_week])
|
| 332 |
low_y = float(ws.loc[low_week])
|
| 333 |
|
|
|
|
| 334 |
plt.ylim(y_min_plot, y_max_plot)
|
| 335 |
|
|
|
|
| 336 |
def clamp(val: float) -> float:
|
| 337 |
return float(min(max(val, y_min_plot), y_max_plot))
|
| 338 |
|
|
|
|
| 366 |
# -------------------- Top lex words with score --------------------
|
| 367 |
def top_lex_words(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
| 368 |
"""
|
| 369 |
+
score definition:
|
| 370 |
positive word score = count(word) / total_tokens
|
| 371 |
negative word score = - count(word) / total_tokens
|
| 372 |
"""
|
|
|
|
| 401 |
|
| 402 |
|
| 403 |
# -------------------- Main entry --------------------
|
| 404 |
+
def analyze_chat(
|
| 405 |
+
chat: Dict[str, Any],
|
| 406 |
+
max_bert_persian: int = 200
|
| 407 |
+
) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
|
| 408 |
+
|
| 409 |
df = build_df(chat)
|
| 410 |
name = get_chat_name(chat, "Selected chat")
|
| 411 |
|
|
|
|
| 420 |
"peak_words": [],
|
| 421 |
"low_words": [],
|
| 422 |
"weekly": [],
|
| 423 |
+
"top5_positive_lex": [],
|
| 424 |
+
"top5_negative_lex": [],
|
| 425 |
}
|
| 426 |
fig = plt.figure(figsize=(12, 5))
|
| 427 |
plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
|
|
|
|
| 445 |
"bert_used_on_persian_messages": int(used),
|
| 446 |
"overall_avg_sentiment": overall,
|
| 447 |
"weekly": weekly_records,
|
| 448 |
+
|
| 449 |
+
# requirement (2)
|
| 450 |
"peak_week_end": info["peak_week_end"],
|
| 451 |
"low_week_end": info["low_week_end"],
|
|
|
|
|
|
|
| 452 |
"peak_word_main": info["peak_word_main"],
|
| 453 |
"low_word_main": info["low_word_main"],
|
| 454 |
+
"peak_words": info["peak_words"],
|
| 455 |
+
"low_words": info["low_words"],
|
| 456 |
+
|
| 457 |
+
# requirement (4)
|
| 458 |
"top5_positive_lex": pos_top,
|
| 459 |
"top5_negative_lex": neg_top,
|
| 460 |
}
|