Di12 committed on
Commit
74cc1df
·
verified ·
1 Parent(s): 7e222ad

Add rule-based aspects

Browse files
Files changed (1) hide show
  1. app.py +124 -33
app.py CHANGED
@@ -201,6 +201,62 @@ def load_model(path: str):
201
 
202
  model = load_model(model_path)
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  def predict_sentiment(model, sentence, vocab, label_mapping=None):
205
  tensor = vocab.corpus_to_tensor([sentence])[0]
206
  length = torch.LongTensor([tensor.size(0)]).to(device)
@@ -212,7 +268,13 @@ def predict_sentiment(model, sentence, vocab, label_mapping=None):
212
  idx = int(torch.tensor(probs).argmax())
213
  return (label_mapping[idx], probs) if label_mapping else (idx, probs)
214
 
215
- def process_input(text_input, file):
 
 
 
 
 
 
216
  content = ""
217
  comments = []
218
 
@@ -223,7 +285,6 @@ def process_input(text_input, file):
223
 
224
  elif file:
225
  if isinstance(file, str):
226
- # file path
227
  if file.lower().endswith('.csv'):
228
  content = open(file, 'r', encoding='utf-8', errors='ignore').read()
229
  lines = content.splitlines()
@@ -240,49 +301,79 @@ def process_input(text_input, file):
240
  else:
241
  raise gr.Error("Định dạng tệp không được hỗ trợ.")
242
 
243
- if len(comments) == 0:
244
- raise gr.Error("Vui lòng nhập ít nhất một bình luận hoặc tải lên tệp chứa bình luận.")
 
 
 
 
245
 
246
- results = []
247
  for comment in comments:
248
- label, probability = predict_sentiment(model, clean_text(comment), vocab, label_map)
249
- results.append({
250
- 'Comment': comment,
251
- 'Dự đoán': label,
252
- 'Khả năng tiêu cực': probability[0],
253
- 'Khả năng bình thường': probability[1],
254
- 'Khả năng tích cực': probability[2],
255
- })
256
-
257
- df2 = pd.DataFrame(results)
258
-
259
- styler = df2.style.format({
260
- "Khả năng tiêu cực": "{:.0%}",
261
- "Khả năng bình thường": "{:.0%}",
262
- "Khả năng tích cực": "{:.0%}",
263
- })
264
-
265
- return styler, df2
266
-
267
- def summarize_distribution(df):
268
- dist = df['Dự đoán'].value_counts(normalize=True) * 100
269
- dist = dist.reindex(['tiêu cực', 'bình thường', 'tích cực'], fill_value=0)
270
- return dist
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  def plot_distribution(dist):
273
  fig, ax = plt.subplots()
274
  dist.plot.bar(ax=ax, color=['red','gray','green'])
275
  ax.set_ylabel("Tỷ lệ (%)")
276
- ax.set_title("Phân phối cảm xúc")
277
  ax.tick_params(axis='x', labelrotation=0)
278
  ax.tick_params(axis='y', labelrotation=0)
 
279
  return fig
280
 
281
  def full_process(text_input, file_input):
282
- styler, df2 = process_input(text_input, file_input)
283
- dist = summarize_distribution(df2)
284
- fig = plot_distribution(dist)
285
- return styler, fig
286
 
287
  with gr.Blocks() as demo:
288
  gr.Markdown("## Phân tích cảm xúc")
 
201
 
202
  model = load_model(model_path)
203
 
204
# Seed keywords for rule-based aspect spotting.
# Keys are aspect identifiers; values are the surface phrases to look for
# (underscores stand in for spaces in compound Vietnamese tokens).
seed_aspects = {
    'vận_chuyển': ['giao hàng', 'giao', 'ship', 'nhận hàng', 'vận chuyển'],
    'đóng_gói': ['đóng gói', 'đóng_gói', 'gói', 'bao_bì'],
    'sản_phẩm': ['sách', 'sản phẩm', 'chất lượng'],
}
209
+
210
def tokenize_underthesea(text):
    """Tokenize *text* with underthesea's ``word_tokenize``.

    ``word_tokenize`` may return either a list of tokens or a single
    space-joined string; always return a list of tokens.
    """
    result = word_tokenize(text)  # underthesea
    return result.split() if isinstance(result, str) else result
219
+
220
def extract_aspects_from_text(text, seed_aspects, tokenizer=tokenize_underthesea):
    """
    Locate seed-aspect keyword phrases in *text*.

    Parameters
    ----------
    text : str
        Raw comment text; cleaned via ``clean_text`` before tokenizing.
    seed_aspects : dict[str, list[str]]
        Maps an aspect key to its list of seed keyword phrases.
    tokenizer : callable
        Splits the cleaned text into tokens. Defaults to the
        underthesea-based tokenizer, which joins multi-word compounds
        with underscores (e.g. 'đóng_gói').

    Returns
    -------
    tokens : list[str]
        Tokens produced by *tokenizer*.
    found : list[tuple[str, str, int, int]]
        Entries ``(aspect_key, matched_phrase, start_idx, end_idx)``
        with inclusive token indices. Overlapping matches from
        different seeds are all reported.
    """
    # Clean and tokenize once.
    txt = clean_text(text)
    tokens = tokenizer(txt)

    # Normalize tokens the same way seed phrases are normalized below:
    # lowercase and underscores -> spaces. BUG FIX: previously only the
    # seeds were de-underscored, so a compound tokenizer token such as
    # 'đóng_gói' could never match the seed 'đóng gói' (or 'đóng_gói',
    # 'bao_bì', 'vận chuyển', 'giao hàng', ...).
    t_norm = [t.lower().replace('_', ' ') for t in tokens]
    found = []

    # Pre-normalize every seed phrase into a token list.
    seed_tokenlists = []
    for asp_key, keywords in seed_aspects.items():
        for kw in keywords:
            kw_proc = kw.lower().replace('_', ' ').strip()
            kw_tokens = kw_proc.split()
            if kw_tokens:  # skip empty/whitespace-only seeds
                seed_tokenlists.append((asp_key, kw_tokens, kw_proc))

    # Match each seed phrase against the token stream.
    for asp_key, kw_tokens, kw_proc in seed_tokenlists:
        L = len(kw_tokens)
        for i in range(len(t_norm)):
            # Case 1: a single compound token equals the whole phrase
            # (underthesea's 'đóng_gói' vs seed 'đóng gói').
            if t_norm[i] == kw_proc:
                found.append((asp_key, tokens[i], i, i))
                continue
            # Case 2: the phrase spans L consecutive tokens.
            if i + L <= len(t_norm) and t_norm[i:i + L] == kw_tokens:
                phrase = " ".join(tokens[i:i + L])
                found.append((asp_key, phrase, i, i + L - 1))
    return tokens, found
254
+
255
def get_context_string_from_tokens(tokens, start, end, window=3):
    """Return the tokens around the inclusive span ``[start, end]``,
    padded by *window* tokens on each side (clamped to the token list),
    joined with single spaces."""
    lo = start - window
    if lo < 0:
        lo = 0
    hi = end + window
    last = len(tokens) - 1
    if hi > last:
        hi = last
    return " ".join(tokens[lo:hi + 1])
259
+
260
  def predict_sentiment(model, sentence, vocab, label_mapping=None):
261
  tensor = vocab.corpus_to_tensor([sentence])[0]
262
  length = torch.LongTensor([tensor.size(0)]).to(device)
 
268
  idx = int(torch.tensor(probs).argmax())
269
  return (label_mapping[idx], probs) if label_mapping else (idx, probs)
270
 
271
+ def process_input_with_aspects(text_input, file):
272
+ """
273
+ Reads input text or uploaded file, splits into sentences/comments,
274
+ extracts aspects for each comment, predicts sentiment per-aspect
275
+ (or per-sentence fallback) and returns styled DataFrame + aspect-level summary.
276
+ (This version hides probability columns.)
277
+ """
278
  content = ""
279
  comments = []
280
 
 
285
 
286
  elif file:
287
  if isinstance(file, str):
 
288
  if file.lower().endswith('.csv'):
289
  content = open(file, 'r', encoding='utf-8', errors='ignore').read()
290
  lines = content.splitlines()
 
301
  else:
302
  raise gr.Error("Định dạng tệp không được hỗ trợ.")
303
 
304
+ if len(comments) == 0:
305
+ raise gr.Error("Vui lòng nhập ít nhất một bình luận hoặc tải lên tệp chứa bình luận.")
306
+
307
+ # RESULTS
308
+ table_rows = []
309
+ aspect_rows = [] # flattened aspect-level entries for aggregation
310
 
 
311
  for comment in comments:
312
+ # aspect extraction
313
+ tokens, aspects = extract_aspects_from_text(comment, seed_aspects)
314
+
315
+ if len(aspects) == 0:
316
+ # fallback: sentence-level
317
+ sent_label, _ = predict_sentiment(model, clean_text(comment), vocab, label_map)
318
+ row = {
319
+ 'Comment': comment,
320
+ 'Dự đoán': sent_label,
321
+ 'Aspects': ''
322
+ }
323
+ table_rows.append(row)
324
+ else:
325
+ asp_info_list = []
326
+ for asp_key, asp_phrase, s, e in aspects:
327
+ context = get_context_string_from_tokens(tokens, s, e, window=3)
328
+ sent, _ = predict_sentiment(model, clean_text(context), vocab, label_map)
329
+ asp_info_list.append(f"{asp_key}: {sent}")
330
+ aspect_rows.append({
331
+ 'Comment': comment,
332
+ 'Aspect': asp_key,
333
+ 'Phrase': asp_phrase,
334
+ 'Context': context,
335
+ 'Sentiment': sent
336
+ })
337
+ aspects_str = " | ".join(asp_info_list)
338
+ sent_label, _ = predict_sentiment(model, clean_text(comment), vocab, label_map)
339
+ row = {
340
+ 'Comment': comment,
341
+ 'Dự đoán': sent_label,
342
+ 'Aspects': aspects_str
343
+ }
344
+ table_rows.append(row)
345
+
346
+ df2 = pd.DataFrame(table_rows)
347
+
348
+ # No probability columns => simpler styler
349
+ styler = df2.style
350
+
351
+ if len(aspect_rows) > 0:
352
+ df_aspects = pd.DataFrame(aspect_rows)
353
+ aspect_dist = (df_aspects.groupby(['Aspect','Sentiment']).size()
354
+ .unstack(fill_value=0))
355
+ aspect_dist_pct = aspect_dist.div(aspect_dist.sum(axis=1), axis=0) * 100
356
+ else:
357
+ df_aspects = pd.DataFrame(columns=['Comment','Aspect','Phrase','Context','Sentiment'])
358
+ aspect_dist_pct = pd.DataFrame()
359
+
360
+ return styler, df2, df_aspects, aspect_dist_pct
361
 
362
def plot_distribution(dist):
    """Render *dist* (per-class sentiment percentages, in the order
    negative / neutral / positive) as a bar chart and return the
    matplotlib figure."""
    figure, axis = plt.subplots()
    dist.plot.bar(ax=axis, color=['red', 'gray', 'green'])
    axis.set_ylabel("Tỷ lệ (%)")
    axis.set_title("Phân phối cảm xúc (toàn câu)")
    # Keep both axes' tick labels horizontal.
    for which in ('x', 'y'):
        axis.tick_params(axis=which, labelrotation=0)
    plt.tight_layout()
    return figure
371
 
372
def full_process(text_input, file_input):
    """Gradio callback: run aspect-aware sentiment processing on the
    input and return the styled results table plus the sentence-level
    distribution chart."""
    # NOTE(review): summarize_distribution_from_df is not defined in the
    # visible portion of this file (the old summarize_distribution was
    # removed) — confirm it exists elsewhere in app.py.
    styler, table_df, _aspects_df, _aspect_pct = process_input_with_aspects(
        text_input, file_input
    )
    distribution = summarize_distribution_from_df(table_df)
    return styler, plot_distribution(distribution)
377
 
378
  with gr.Blocks() as demo:
379
  gr.Markdown("## Phân tích cảm xúc")