Update analysis_core.py

analysis_core.py CHANGED (+302 −123)
@@ -36,7 +36,7 @@ _ensure_nltk()
 SIA = SentimentIntensityAnalyzer()
 
 
-# =====================
+# ===================== Persian sentiment model =====================
 MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -45,7 +45,7 @@ MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE
 MODEL.eval()
 
 
-# =====================
+# ===================== Small lex helpers (not used for final top words scale anymore) =====================
 persian_positive = {
     "خوب","عالی","عالیه","باحال","قشنگ","زیبا","خوشحال","شاد","مرسی","ممنون","ممنونم",
     "دمت","دمتگرم","دمت گرم","ایول","آفرین","دوستت","دوستت_دارم","دوستتدارم","عزیزم",
@@ -56,10 +56,13 @@ persian_negative = {
     "حالم_بده","حالمبده","😡","😠","😞","😔","😭","💔"
 }
 
+persian_positive_phrases = {"دوستت دارم", "خیلی دوستت دارم", "دمت گرم", "آفرین"}
+persian_negative_phrases = {"حالم بده", "خیلی بده", "اعصابم خورد"}
+
 persian_stopwords = {
     "و","در","به","از","که","این","اون","آن","من","تو","ما","شما","او","هم","یا","اما","اگر",
-    "برای","با","روی","تا","نه","را","همه","چی","چیه","چرا","کجا","کی","چه","یه","یک","بود",
-    "
+    "برای","با","روی","تا","نه","را","همه","چی","چیه","چرا","کجا","کی","چه","یه","یک","بود","هست",
+    "می","میشه","کرد","کردم","کن","کردی","کردن","دارم","داری","داره","داشت","شد","شده"
 }
 english_stopwords = {
     "the","a","an","and","or","but","if","then","else","to","of","in","on","at","for","with",
@@ -69,33 +72,6 @@ english_stopwords = {
 stopwords_all = persian_stopwords.union(english_stopwords)
 
 
-# ===================== Utils =====================
-_FA_RE = re.compile(r"[\u0600-\u06FF]")
-
-
-def _shape_fa(s: str) -> str:
-    try:
-        return get_display(arabic_reshaper.reshape(str(s)))
-    except Exception:
-        return str(s)
-
-
-def is_persian(text: str) -> bool:
-    return bool(_FA_RE.search(text or ""))
-
-
-def custom_tokenize(text: str) -> List[str]:
-    text = re.sub(r"http\S+|www\.\S+", " ", str(text))
-    text = text.replace("\u200c", " ")
-    tokens = re.findall(r"[\w\u0600-\u06FF]+", text)
-    tokens = [
-        t.replace("دوستتدارم", "دوستت_دارم")
-        .replace("حالمبده", "حالم_بده")
-        for t in tokens
-    ]
-    return tokens
-
-
 # ===================== Telegram parsing =====================
 def extract_text(msg_text: Any) -> str:
     if msg_text is None:
@@ -103,175 +79,378 @@ def extract_text(msg_text: Any) -> str:
     if isinstance(msg_text, str):
         return msg_text
     if isinstance(msg_text, list):
-        ...  # old list flattening; body truncated in the capture
+        parts: List[str] = []
+        for part in msg_text:
+            if isinstance(part, str):
+                parts.append(part)
+            elif isinstance(part, dict):
+                t = part.get("text")
+                if isinstance(t, str):
+                    parts.append(t)
+        return "".join(parts)
     return ""
 
 
 def extract_chats(data: Dict[str, Any]) -> List[Dict[str, Any]]:
-    if "chats" in data and "list" in data["chats"]:
-        ...  # old body truncated in the capture
+    if isinstance(data, dict) and "chats" in data and isinstance(data["chats"], dict) and "list" in data["chats"]:
+        lst = data["chats"]["list"]
+        if isinstance(lst, list):
+            return lst
+    if isinstance(data, dict) and "messages" in data and isinstance(data["messages"], list):
         return [data]
-    raise ValueError("
+    raise ValueError("JSON format not recognized. Need Telegram export result.json with chats.list.")
 
 
 def get_chat_name(chat: Dict[str, Any], fallback: str) -> str:
-    ...  # old body truncated in the capture
+    name = chat.get("name") or chat.get("title")
+    if isinstance(name, str) and name.strip():
+        return name.strip()
+    return fallback
 
 
 def build_df(chat: Dict[str, Any]) -> pd.DataFrame:
-    rows = []
+    rows: List[Dict[str, Any]] = []
     for msg in chat.get("messages", []):
-        ...  # old per-message text/date extraction truncated in the capture
+        if not isinstance(msg, dict):
+            continue
+
+        text = extract_text(msg.get("text", "")).strip()
         if not text:
             continue
+
+        date_raw = msg.get("date")
+        if not isinstance(date_raw, str) or not date_raw:
             continue
+
+        rows.append({"date_raw": date_raw, "text": text})
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        return df
+
+    df["date"] = pd.to_datetime(df["date_raw"], errors="coerce", utc=True).dt.tz_convert(None)
+    df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
+    return df
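For orientation, a minimal sketch of the export shape this parser accepts (values invented):

    chat = {
        "name": "Family",
        "messages": [
            {"date": "2024-01-05T18:22:31", "text": "دمت گرم"},
            {"date": "2024-01-06T09:10:00", "text": ["thanks ", {"type": "mention", "text": "@ali"}]},
        ],
    }
    df = build_df(chat)  # columns: date_raw, text, date (tz-naive, sorted oldest-first)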
 
 
+# ===================== Tokenize + Persian detect =====================
+_FA_RE = re.compile(r"[\u0600-\u06FF]")
+
+
+def is_persian(text: str) -> bool:
+    return bool(_FA_RE.search(text or ""))
 
 
+def custom_tokenize(text: str) -> List[str]:
+    text = re.sub(r"http\S+|www\.\S+", " ", str(text))
+    text = text.replace("\u200c", " ")
+    tokens = re.findall(r"[\w\u0600-\u06FF]+", text)
+    tokens = [t.replace("دوستتدارم", "دوستت_دارم").replace("حالمبده", "حالم_بده") for t in tokens]
+    return tokens
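A quick sketch (illustrative): URLs are stripped before tokenizing, and the two glued forms are normalized back to their underscored lexicon keys:

    custom_tokenize("دوستتدارم http://t.me/x")  # -> ["دوستت_دارم"]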
 
 
+# ===================== Sentiment scoring =====================
-def ...  # old `score_en` def line truncated in the capture
+def score_en_vader(text: str) -> float:
     return float(SIA.polarity_scores(text)["compound"])
 
 
 @torch.inference_mode()
-def ...  # old batch scorer truncated in the capture (only `for i in range(0, len(texts),`, `diff =`, `return` survive)
+def score_fa_bert_batch(texts: List[str], batch_size: int = 16) -> List[float]:
+    scores: List[float] = []
+    for i in range(0, len(texts), batch_size):
+        chunk = texts[i:i + batch_size]
+        inputs = TOKENIZER(chunk, return_tensors="pt", truncation=True, padding=True, max_length=256).to(DEVICE)
+        out = MODEL(**inputs)
+        probs = torch.softmax(out.logits, dim=-1)
+        diff = (probs[:, 1] - probs[:, 0]).detach().cpu().numpy().astype(float).tolist()
+        diff = [float(max(-1.0, min(1.0, d))) for d in diff]  # keep sentiment in [-1, 1]
+        scores.extend(diff)
+    return scores
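The two-class head is read with index 1 as positive and index 0 as negative, so probs[:, 1] - probs[:, 0] already lies in [-1, 1] and the clamp is only a guard. A sketch, assuming the HuggingFace weights are available (output values illustrative):

    score_fa_bert_batch(["دمت گرم", "حالم بده"])  # -> e.g. [0.97, -0.94]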
 
 
+def persian_lexicon_score(text: str) -> float:
+    tokens = custom_tokenize(text)
+    pos = 0
+    neg = 0
+
+    for t in tokens:
+        if t in persian_positive:
+            pos += 1
+        elif t in persian_negative:
+            neg += 1
+
+    norm = str(text).replace("\u200c", " ")
+    for ph in persian_positive_phrases:
+        if ph in norm:
+            pos += 2
+    for ph in persian_negative_phrases:
+        if ph in norm:
+            neg += 2
+
+    return float((pos - neg) / max(1, (pos + neg)))
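A worked example (sketch; assumes no other lexicon entries fire): token hits count once, phrase hits count twice, and the result is the normalized difference.

    persian_lexicon_score("دمت گرم عزیزم")      # all hits positive -> (pos - 0) / pos = 1.0
    persian_lexicon_score("مرسی ولی حالم بده")  # pos=1, neg=2 -> (1 - 2) / 3 ≈ -0.33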
 
 
+def compute_sentiments(df: pd.DataFrame, max_bert_persian: int = 500, bert_batch_size: int = 16) -> Tuple[pd.DataFrame, int]:
+    if df.empty:
+        return df, 0
+
     df = df.copy()
     df["sentiment_final"] = 0.0
 
-    ...  # old routing (only `].apply(score_en)`, `].apply(lambda t: 0.0)` survive the capture)
+    pers_mask = df["text"].astype(str).apply(is_persian)
+    pers_idx = df.index[pers_mask].tolist()
+    en_idx = df.index[~pers_mask].tolist()
+
+    if en_idx:
+        df.loc[en_idx, "sentiment_final"] = df.loc[en_idx, "text"].astype(str).apply(score_en_vader)
+
+    if pers_idx:
+        df.loc[pers_idx, "sentiment_final"] = df.loc[pers_idx, "text"].astype(str).apply(persian_lexicon_score)
+
+    bert_idx = pers_idx[: max(0, int(max_bert_persian))]
+    if bert_idx:
+        texts = df.loc[bert_idx, "text"].astype(str).tolist()
+        scores = score_fa_bert_batch(texts, batch_size=int(bert_batch_size))
+        df.loc[bert_idx, "sentiment_final"] = scores
+
+    df["sentiment_final"] = df["sentiment_final"].fillna(0.0)
+    df["sentiment_final"] = df["sentiment_final"].clip(-1.0, 1.0)  # safety
+    return df, len(bert_idx)
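The routing, summarized: VADER scores non-Persian rows, the lexicon gives every Persian row a fallback score, then BERT overwrites the first max_bert_persian Persian rows. An illustrative call:

    df, used = compute_sentiments(df, max_bert_persian=200)
    # `used` == number of Persian messages actually scored by BERT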
 
 
-# ===================== Weekly =====================
+# ===================== Weekly aggregation + extremes =====================
 def weekly_series(df: pd.DataFrame) -> pd.Series:
-    return ...  # old one-liner truncated in the capture
+    return (
+        df.set_index("date")
+        .resample("W")["sentiment_final"]
+        .mean()
+        .dropna()
+    )
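For intuition, a minimal sketch of the weekly bucketing (pandas labels right-closed weekly bins by their Sunday end date):

    import pandas as pd

    s = pd.Series(
        [0.5, -0.5, 1.0],
        index=pd.to_datetime(["2024-01-01", "2024-01-03", "2024-01-09"]),
    )
    print(s.resample("W").mean())
    # 2024-01-07    0.0  <- mean of the two messages in the week ending Sun Jan 7
    # 2024-01-14    1.0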
 
 
+def message_in_week(df: pd.DataFrame, week_end: pd.Timestamp, mode: str):
+    start = week_end - pd.Timedelta(days=7)
+    sub = df[(df["date"] > start) & (df["date"] <= week_end)]
+    if sub.empty:
+        return None
+    if mode == "max":
+        return sub.loc[sub["sentiment_final"].idxmax()]
+    return sub.loc[sub["sentiment_final"].idxmin()]
+
+
+def extract_lex_words_from_text(text: str, polarity: str, min_words: int = 4) -> List[str]:
+    tokens = [t for t in custom_tokenize(text) if len(t) > 1 and t not in stopwords_all]
+
+    out: List[str] = []
+    for t in tokens:
+        if polarity == "pos" and (t in persian_positive):
+            out.append(t)
+        if polarity == "neg" and (t in persian_negative):
+            out.append(t)
+
+    if len(out) < min_words:
+        for t in tokens:
+            if t not in out:
+                out.append(t)
+            if len(out) >= min_words:
+                break
+
+    return out[:max(min_words, 4)]
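The half-open window (week_end - 7 days, week_end] mirrors the right-closed bins that resample("W") emits, so the extreme message genuinely belongs to the plotted week. A quick check (illustrative):

    import pandas as pd

    week_end = pd.Timestamp("2024-01-07")    # a Sunday label from resample("W")
    start = week_end - pd.Timedelta(days=7)  # 2023-12-31
    assert start < pd.Timestamp("2024-01-01") <= week_end        # Monday is inside
    assert not (start < pd.Timestamp("2023-12-31") <= week_end)  # previous Sunday is not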
 
 
+# ===================== Plot helpers =====================
+def _shape_fa(s: str) -> str:
+    try:
+        return get_display(arabic_reshaper.reshape(str(s)))
+    except Exception:
+        return str(s)
 
 
 def make_weekly_plot(df: pd.DataFrame, chat_name: str):
     ws = weekly_series(df)
 
+    info: Dict[str, Any] = {
+        "peak_week_end": None,
+        "low_week_end": None,
+        "peak_words": [],
+        "low_words": [],
+        "peak_word_main": None,
+        "low_word_main": None,
+    }
+
+    # big plot so Persian annotations are readable
     fig = plt.figure(figsize=(22, 8))
     plt.title(_shape_fa(f"Emotion Trajectory in Chat: {chat_name}"))
     plt.xlabel("Time (weeks)")
     plt.ylabel("Average sentiment score")
-    plt.axhline(0, linestyle="--")
     plt.grid(True)
+    plt.axhline(0, linestyle="--")
 
     if ws.empty:
-        ...  # old empty-series branch truncated in the capture
+        plt.tight_layout()
+        return fig, info
 
     x = ws.index
     y = ws.values.astype(float)
 
+    # show true peaks: full min/max range
+    y_min = float(np.min(y))
+    y_max = float(np.max(y))
+    pad = 0.08 * max(1e-9, (y_max - y_min))
+    y_min_plot = y_min - pad
+    y_max_plot = y_max + pad
+
-    plt.plot(x, y, marker="o", color="red", linewidth=2)
+    plt.plot(x, y, color="red", linewidth=2, marker="o", markersize=4)
+    plt.ylim(y_min_plot, y_max_plot)
+    plt.margins(x=0.03, y=0.15)
+
+    peak_week = ws.idxmax()
+    low_week = ws.idxmin()
+
+    info["peak_week_end"] = peak_week.isoformat()
+    info["low_week_end"] = low_week.isoformat()
+
+    peak_msg = message_in_week(df, peak_week, "max")
+    low_msg = message_in_week(df, low_week, "min")
+
+    peak_words = extract_lex_words_from_text(str(peak_msg["text"]) if peak_msg is not None else "", "pos", 4)
+    low_words = extract_lex_words_from_text(str(low_msg["text"]) if low_msg is not None else "", "neg", 4)
+
+    info["peak_words"] = peak_words
+    info["low_words"] = low_words
+    info["peak_word_main"] = peak_words[0] if peak_words else None
+    info["low_word_main"] = low_words[0] if low_words else None
+
+    peak_y = float(ws.loc[peak_week])
+    low_y = float(ws.loc[low_week])
+
+    def clamp(val: float) -> float:
+        return float(min(max(val, y_min_plot), y_max_plot))
+
+    peak_y_plot = clamp(peak_y)
+    low_y_plot = clamp(low_y)
+
+    plt.scatter([peak_week], [peak_y_plot])
+    plt.annotate(
+        _shape_fa("، ".join(peak_words) + f" (peak={peak_y:.3f})"),
+        xy=(peak_week, peak_y_plot),
+        xytext=(peak_week, y_max_plot + 0.06 * (y_max_plot - y_min_plot)),
+        arrowprops=dict(arrowstyle="->"),
+        ha="center",
+        fontsize=10,
+    )
+
+    plt.scatter([low_week], [low_y_plot])
+    plt.annotate(
+        _shape_fa("، ".join(low_words) + f" (low={low_y:.3f})"),
+        xy=(low_week, low_y_plot),
+        xytext=(low_week, y_min_plot - 0.06 * (y_max_plot - y_min_plot)),
+        arrowprops=dict(arrowstyle="->"),
+        ha="center",
+        fontsize=10,
+    )
 
     plt.tight_layout()
-    return fig, {}
+    return fig, info
 
 
-# ===================== TOP WORDS (NORMALIZED) =====================
-def top_words_weighted_by_sentiment(
-    df: pd.DataFrame,
-    top_n: int = 5
-) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+# ===================== Top words (normalized to [-1, +1]) =====================
+def top_words_weighted_by_sentiment(df: pd.DataFrame, top_n: int = 5) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """
+    Uses message sentiment to weight tokens, then normalizes scores into [-1, +1].
+    Positive table: 0..+1
+    Negative table: -1..0
+    """
     pos_w = defaultdict(float)
     neg_w = defaultdict(float)
 
-    for text, s in zip(df["text"], df["sentiment_final"]):
-        tokens = [
-            t for t in custom_tokenize(text)
-            if len(t) > 1 and t not in stopwords_all
-        ]
+    for text, s in zip(df["text"].astype(str).tolist(), df["sentiment_final"].astype(float).tolist()):
+        tokens = [t for t in custom_tokenize(text) if len(t) > 1 and t not in stopwords_all]
         if not tokens or abs(s) < 1e-9:
             continue
 
         if s > 0:
             for t in tokens:
-                pos_w[t] += s
+                pos_w[t] += float(s)
         elif s < 0:
             for t in tokens:
-                neg_w[t] += abs(s)
+                neg_w[t] += float(abs(s))
 
-    max_pos = max(pos_w.values(), default=
-    max_neg = max(neg_w.values(), default=
+    max_pos = max(pos_w.values(), default=0.0)
+    max_neg = max(neg_w.values(), default=0.0)
 
-    ...  # old list construction truncated in the capture
-        for w, v in sorted(pos_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
-    ]
+    pos_items = sorted(pos_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
+    neg_items = sorted(neg_w.items(), key=lambda x: x[1], reverse=True)[:top_n]
 
+    if max_pos <= 1e-12:
+        pos_top = []
+    else:
+        pos_top = [{"word": w, "score": float(v / max_pos)} for w, v in pos_items]
+        for d in pos_top:
+            d["score"] = float(max(0.0, min(1.0, d["score"])))
+
+    if max_neg <= 1e-12:
+        neg_top = []
+    else:
+        neg_top = [{"word": w, "score": float(-(v / max_neg))} for w, v in neg_items]
+        for d in neg_top:
+            d["score"] = float(min(0.0, max(-1.0, d["score"])))
 
     return pos_top, neg_top
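A worked example of the normalization (sketch): the heaviest word anchors each table at ±1.0.

    # If pos_w came out as {"عالی": 2.4, "مرسی": 1.2}, then max_pos == 2.4 and:
    pos_top, neg_top = top_words_weighted_by_sentiment(df, top_n=5)
    # pos_top == [{"word": "عالی", "score": 1.0}, {"word": "مرسی", "score": 0.5}]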
 
 
-# ===================== Main =====================
+# ===================== Main entry =====================
-def analyze_chat(
-    ...  # old parameters truncated in the capture
+def analyze_chat(
+    chat: Dict[str, Any],
+    max_bert_persian: int = 500
+) -> Tuple[Dict[str, Any], Any, List[Dict[str, Any]], List[Dict[str, Any]]]:
+
     df = build_df(chat)
-    name = get_chat_name(chat, "
+    name = get_chat_name(chat, "Selected chat")
 
     if df.empty:
+        empty = {
+            "chat_name": name,
+            "message_count": 0,
+            "bert_used_on_persian_messages": 0,
+            "overall_avg_sentiment": 0.0,
+            "peak_word_main": None,
+            "low_word_main": None,
+            "peak_words": [],
+            "low_words": [],
+            "weekly": [],
+            "top5_positive_lex": [],
+            "top5_negative_lex": [],
+        }
         fig = plt.figure(figsize=(22, 8))
+        plt.title(_shape_fa(f"Emotion Trajectory in Chat: {name}"))
+        plt.tight_layout()
+        return empty, fig, [], []
 
-    df, used = compute_sentiments(df,
+    df, used = compute_sentiments(df, max_bert_persian=max_bert_persian, bert_batch_size=16)
 
+    ws = weekly_series(df)
+    weekly_records = [{"week_end": idx.isoformat(), "avg_sentiment": float(val)} for idx, val in ws.items()]
+    overall = float(df["sentiment_final"].mean())
+
-    fig, _ = make_weekly_plot(df, name)
+    fig, info = make_weekly_plot(df, name)
+
+    pos_top, neg_top = top_words_weighted_by_sentiment(df, top_n=5)
 
     result = {
         "chat_name": name,
-        "message_count": len(df),
-        "bert_used_on_persian_messages": used,
-        "overall_avg_sentiment":
+        "message_count": int(len(df)),
+        "bert_used_on_persian_messages": int(used),
+        "overall_avg_sentiment": overall,
+        "weekly": weekly_records,
+
+        "peak_week_end": info["peak_week_end"],
+        "low_week_end": info["low_week_end"],
+        "peak_word_main": info["peak_word_main"],
+        "low_word_main": info["low_word_main"],
+        "peak_words": info["peak_words"],
+        "low_words": info["low_words"],
+
         "top5_positive_lex": pos_top,
         "top5_negative_lex": neg_top,
     }
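A minimal end-to-end sketch, assuming the module is importable as analysis_core and that analyze_chat finishes by returning the four-tuple its annotation promises (the captured diff cuts off at the result dict):

    import json
    from analysis_core import extract_chats, analyze_chat

    with open("result.json", encoding="utf-8") as f:
        data = json.load(f)

    chat = extract_chats(data)[0]
    result, fig, pos_top, neg_top = analyze_chat(chat, max_bert_persian=500)
    fig.savefig("weekly_sentiment.png", dpi=150)
    print(result["overall_avg_sentiment"], result["peak_word_main"])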