Di12 committed on
Commit
74cc1df
·
verified ·
1 Parent(s): 7e222ad

Add rule-based aspects

Browse files
Files changed (1) hide show
  1. app.py +124 -33
app.py CHANGED
@@ -201,6 +201,62 @@ def load_model(path: str):
201
 
202
  model = load_model(model_path)
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  def predict_sentiment(model, sentence, vocab, label_mapping=None):
205
  tensor = vocab.corpus_to_tensor([sentence])[0]
206
  length = torch.LongTensor([tensor.size(0)]).to(device)
@@ -212,7 +268,13 @@ def predict_sentiment(model, sentence, vocab, label_mapping=None):
212
  idx = int(torch.tensor(probs).argmax())
213
  return (label_mapping[idx], probs) if label_mapping else (idx, probs)
214
 
215
- def process_input(text_input, file):
 
 
 
 
 
 
216
  content = ""
217
  comments = []
218
 
@@ -223,7 +285,6 @@ def process_input(text_input, file):
223
 
224
  elif file:
225
  if isinstance(file, str):
226
- # file path
227
  if file.lower().endswith('.csv'):
228
  content = open(file, 'r', encoding='utf-8', errors='ignore').read()
229
  lines = content.splitlines()
@@ -240,49 +301,79 @@ def process_input(text_input, file):
240
  else:
241
  raise gr.Error("Định dạng tệp không được hỗ trợ.")
242
 
243
- if len(comments) == 0:
244
- raise gr.Error("Vui lòng nhập ít nhất một bình luận hoặc tải lên tệp chứa bình luận.")
 
 
 
 
245
 
246
- results = []
247
  for comment in comments:
248
- label, probability = predict_sentiment(model, clean_text(comment), vocab, label_map)
249
- results.append({
250
- 'Comment': comment,
251
- 'Dự đoán': label,
252
- 'Khả năng tiêu cực': probability[0],
253
- 'Khả năng bình thường': probability[1],
254
- 'Khả năng tích cực': probability[2],
255
- })
256
-
257
- df2 = pd.DataFrame(results)
258
-
259
- styler = df2.style.format({
260
- "Khả năng tiêu cực": "{:.0%}",
261
- "Khả năng bình thường": "{:.0%}",
262
- "Khả năng tích cực": "{:.0%}",
263
- })
264
-
265
- return styler, df2
266
-
267
- def summarize_distribution(df):
268
- dist = df['Dự đoán'].value_counts(normalize=True) * 100
269
- dist = dist.reindex(['tiêu cực', 'bình thường', 'tích cực'], fill_value=0)
270
- return dist
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  def plot_distribution(dist):
273
  fig, ax = plt.subplots()
274
  dist.plot.bar(ax=ax, color=['red','gray','green'])
275
  ax.set_ylabel("Tỷ lệ (%)")
276
- ax.set_title("Phân phối cảm xúc")
277
  ax.tick_params(axis='x', labelrotation=0)
278
  ax.tick_params(axis='y', labelrotation=0)
 
279
  return fig
280
 
281
  def full_process(text_input, file_input):
282
- styler, df2 = process_input(text_input, file_input)
283
- dist = summarize_distribution(df2)
284
- fig = plot_distribution(dist)
285
- return styler, fig
286
 
287
  with gr.Blocks() as demo:
288
  gr.Markdown("## Phân tích cảm xúc")
 
201
 
202
  model = load_model(model_path)
203
 
204
# Seed keywords for rule-based aspect spotting.
# Keys are aspect identifiers; values are the surface phrases to look for
# (underscores stand in for spaces in compound Vietnamese tokens).
seed_aspects = {
    'vận_chuyển': ['giao hàng', 'giao', 'ship', 'nhận hàng', 'vận chuyển'],
    'đóng_gói': ['đóng gói', 'đóng_gói', 'gói', 'bao_bì'],
    'sản_phẩm': ['sách', 'sản phẩm', 'chất lượng'],
}
209
+
210
def tokenize_underthesea(text):
    """Tokenize *text* with underthesea's ``word_tokenize``.

    ``word_tokenize`` may return either a list of tokens or a single
    space-joined string; always return a list of tokens.
    """
    result = word_tokenize(text)  # underthesea
    return result.split() if isinstance(result, str) else result
219
+
220
def extract_aspects_from_text(text, seed_aspects, tokenizer=tokenize_underthesea):
    """
    Locate seed-aspect keyword phrases in *text*.

    Parameters
    ----------
    text : str
        Raw comment text; cleaned via ``clean_text`` before tokenizing.
    seed_aspects : dict[str, list[str]]
        Maps an aspect key to its list of seed keyword phrases.
    tokenizer : callable
        Splits the cleaned text into tokens. Defaults to the
        underthesea-based tokenizer, which joins multi-word compounds
        with underscores (e.g. 'đóng_gói').

    Returns
    -------
    tokens : list[str]
        Tokens produced by *tokenizer*.
    found : list[tuple[str, str, int, int]]
        Entries ``(aspect_key, matched_phrase, start_idx, end_idx)``
        with inclusive token indices. Overlapping matches from
        different seeds are all reported.
    """
    # Clean and tokenize once.
    txt = clean_text(text)
    tokens = tokenizer(txt)

    # Normalize tokens the same way seed phrases are normalized below:
    # lowercase and underscores -> spaces. BUG FIX: previously only the
    # seeds were de-underscored, so a compound tokenizer token such as
    # 'đóng_gói' could never match the seed 'đóng gói' (or 'đóng_gói',
    # 'bao_bì', 'vận chuyển', 'giao hàng', ...).
    t_norm = [t.lower().replace('_', ' ') for t in tokens]
    found = []

    # Pre-normalize every seed phrase into a token list.
    seed_tokenlists = []
    for asp_key, keywords in seed_aspects.items():
        for kw in keywords:
            kw_proc = kw.lower().replace('_', ' ').strip()
            kw_tokens = kw_proc.split()
            if kw_tokens:  # skip empty/whitespace-only seeds
                seed_tokenlists.append((asp_key, kw_tokens, kw_proc))

    # Match each seed phrase against the token stream.
    for asp_key, kw_tokens, kw_proc in seed_tokenlists:
        L = len(kw_tokens)
        for i in range(len(t_norm)):
            # Case 1: a single compound token equals the whole phrase
            # (underthesea's 'đóng_gói' vs seed 'đóng gói').
            if t_norm[i] == kw_proc:
                found.append((asp_key, tokens[i], i, i))
                continue
            # Case 2: the phrase spans L consecutive tokens.
            if i + L <= len(t_norm) and t_norm[i:i + L] == kw_tokens:
                phrase = " ".join(tokens[i:i + L])
                found.append((asp_key, phrase, i, i + L - 1))
    return tokens, found
254
+
255
def get_context_string_from_tokens(tokens, start, end, window=3):
    """Return the tokens around the inclusive span ``[start, end]``,
    padded by *window* tokens on each side (clamped to the token list),
    joined with single spaces."""
    lo = start - window
    if lo < 0:
        lo = 0
    hi = end + window
    last = len(tokens) - 1
    if hi > last:
        hi = last
    return " ".join(tokens[lo:hi + 1])
259
+
260
  def predict_sentiment(model, sentence, vocab, label_mapping=None):
261
  tensor = vocab.corpus_to_tensor([sentence])[0]
262
  length = torch.LongTensor([tensor.size(0)]).to(device)
 
268
  idx = int(torch.tensor(probs).argmax())
269
  return (label_mapping[idx], probs) if label_mapping else (idx, probs)
270
 
271
+ def process_input_with_aspects(text_input, file):
272
+ """
273
+ Reads input text or uploaded file, splits into sentences/comments,
274
+ extracts aspects for each comment, predicts sentiment per-aspect
275
+ (or per-sentence fallback) and returns styled DataFrame + aspect-level summary.
276
+ (This version hides probability columns.)
277
+ """
278
  content = ""
279
  comments = []
280
 
 
285
 
286
  elif file:
287
  if isinstance(file, str):
 
288
  if file.lower().endswith('.csv'):
289
  content = open(file, 'r', encoding='utf-8', errors='ignore').read()
290
  lines = content.splitlines()
 
301
  else:
302
  raise gr.Error("Định dạng tệp không được hỗ trợ.")
303
 
304
+ if len(comments) == 0:
305
+ raise gr.Error("Vui lòng nhập ít nhất một bình luận hoặc tải lên tệp chứa bình luận.")
306
+
307
+ # RESULTS
308
+ table_rows = []
309
+ aspect_rows = [] # flattened aspect-level entries for aggregation
310
 
 
311
  for comment in comments:
312
+ # aspect extraction
313
+ tokens, aspects = extract_aspects_from_text(comment, seed_aspects)
314
+
315
+ if len(aspects) == 0:
316
+ # fallback: sentence-level
317
+ sent_label, _ = predict_sentiment(model, clean_text(comment), vocab, label_map)
318
+ row = {
319
+ 'Comment': comment,
320
+ 'Dự đoán': sent_label,
321
+ 'Aspects': ''
322
+ }
323
+ table_rows.append(row)
324
+ else:
325
+ asp_info_list = []
326
+ for asp_key, asp_phrase, s, e in aspects:
327
+ context = get_context_string_from_tokens(tokens, s, e, window=3)
328
+ sent, _ = predict_sentiment(model, clean_text(context), vocab, label_map)
329
+ asp_info_list.append(f"{asp_key}: {sent}")
330
+ aspect_rows.append({
331
+ 'Comment': comment,
332
+ 'Aspect': asp_key,
333
+ 'Phrase': asp_phrase,
334
+ 'Context': context,
335
+ 'Sentiment': sent
336
+ })
337
+ aspects_str = " | ".join(asp_info_list)
338
+ sent_label, _ = predict_sentiment(model, clean_text(comment), vocab, label_map)
339
+ row = {
340
+ 'Comment': comment,
341
+ 'Dự đoán': sent_label,
342
+ 'Aspects': aspects_str
343
+ }
344
+ table_rows.append(row)
345
+
346
+ df2 = pd.DataFrame(table_rows)
347
+
348
+ # No probability columns => simpler styler
349
+ styler = df2.style
350
+
351
+ if len(aspect_rows) > 0:
352
+ df_aspects = pd.DataFrame(aspect_rows)
353
+ aspect_dist = (df_aspects.groupby(['Aspect','Sentiment']).size()
354
+ .unstack(fill_value=0))
355
+ aspect_dist_pct = aspect_dist.div(aspect_dist.sum(axis=1), axis=0) * 100
356
+ else:
357
+ df_aspects = pd.DataFrame(columns=['Comment','Aspect','Phrase','Context','Sentiment'])
358
+ aspect_dist_pct = pd.DataFrame()
359
+
360
+ return styler, df2, df_aspects, aspect_dist_pct
361
 
362
def plot_distribution(dist):
    """Render *dist* (per-class sentiment percentages, in the order
    negative / neutral / positive) as a bar chart and return the
    matplotlib figure."""
    figure, axis = plt.subplots()
    dist.plot.bar(ax=axis, color=['red', 'gray', 'green'])
    axis.set_ylabel("Tỷ lệ (%)")
    axis.set_title("Phân phối cảm xúc (toàn câu)")
    # Keep both axes' tick labels horizontal.
    for which in ('x', 'y'):
        axis.tick_params(axis=which, labelrotation=0)
    plt.tight_layout()
    return figure
371
 
372
def full_process(text_input, file_input):
    """Gradio callback: run aspect-aware sentiment processing on the
    input and return the styled results table plus the sentence-level
    distribution chart."""
    # NOTE(review): summarize_distribution_from_df is not defined in the
    # visible portion of this file (the old summarize_distribution was
    # removed) — confirm it exists elsewhere in app.py.
    styler, table_df, _aspects_df, _aspect_pct = process_input_with_aspects(
        text_input, file_input
    )
    distribution = summarize_distribution_from_df(table_df)
    return styler, plot_distribution(distribution)
377
 
378
  with gr.Blocks() as demo:
379
  gr.Markdown("## Phân tích cảm xúc")