BERTopic_AG_final

Running

App Files Files Community

anujjuna committed 29 days ago

Commit

9c754e7

verified ·

1 Parent(s): 6f652b8

Update agent.py

Browse files

Files changed (1) hide show

agent.py +36 -11

agent.py CHANGED Viewed

@@ -61,21 +61,40 @@ PAJAIS_NOT_COVERED = [
     "talent matching", "job-person fit", "HR analytics",
 ]
-# Rule-based NOVEL trigger — any keyword matching this pattern forces NOVEL
 NOVEL_REGEX_TRIGGERS = re.compile(
-    r'\b(llm|gpt|llms|generative|federat|differenti.{0,5}privac|fairness|explainab|xai|'
-    r'process.{0,5}mining|event.{0,5}log|petri|blockchain|smart.{0,5}contract|'
-    r'covid|pandemic|malware|dark.{0,5}web|cyber.{0,5}insur|agentic|'
-    r'transformer|bert|graph.{0,5}neural|talent.{0,5}match|job.{0,5}fit|'
-    r'iot.{0,5}analyt|edge.{0,5}comput|heterogeneous|'
-    r'recommender.{0,5}neural|gnn|rag|prompt.{0,5}engineer)\b',
     re.IGNORECASE
 )
 def _is_deterministic_novel(keywords: list[str], samples: list[str]) -> bool:
-    """Non-LLM rule-based check: if keywords/samples match NOVEL patterns, return True."""
-    text = " ".join(keywords + samples).lower()
-    return bool(NOVEL_REGEX_TRIGGERS.search(text))
 # ---------------------------------------------------------------------------
 # Data Classes
@@ -373,7 +392,13 @@ def interpret_topic(
     novel_votes = classification_votes.count("NOVEL")
     mapped_votes = classification_votes.count("MAPPED")
-    if forced_novel or novel_votes >= 1:
         final_classification = "NOVEL"
     else:
         final_classification = "MAPPED"

     "talent matching", "job-person fit", "HR analytics",
 ]
+# Rule-based NOVEL trigger — fires ONLY on specific, unambiguous compound/technical terms
+# that are definitively absent from PAJAIS 2019.
+# Deliberately narrow: single common words like "data", "model", "network", "learning",
+# "deep", "smart", "financial", "detection" do NOT trigger this — they exist in PAJAIS.
+# Only truly post-2018 or PAJAIS-absent compound terms qualify.
 NOVEL_REGEX_TRIGGERS = re.compile(
+    r'\b('
+    r'llms?|gpt[\-\s]?\d*|large\s+language\s+model|generative\s+ai|'
+    r'federat\w*\s+learn\w*|differential\s+privac\w*|dp\-sgd|'
+    r'process\s+mining|event\s+log|petri\s+net|conformance\s+check\w*|'
+    r'blockchain|smart\s+contract|defi\b|tokenomic\w*|'
+    r'malware|botnet|dark\s+web|cyber\s+insur\w*|'
+    r'responsible\s+ai|explainab\w*\s+ai|algorithmic\s+bias|xai\b|'
+    r'agentic\s+ai|multi.agent\s+orchest\w*|'
+    r'graph\s+neural\s+network|gnn\b|'
+    r'retrieval.augment\w*|prompt\s+engineer\w*|rag\b|'
+    r'talent\s+match\w*|job.person\s+fit|'
+    r'covid.19|pandemic\s+inform\w*'
+    r')\b',
     re.IGNORECASE
 )
 def _is_deterministic_novel(keywords: list[str], samples: list[str]) -> bool:
+    """Non-LLM rule-based check: fires only on specific unambiguous NOVEL compound terms.
+    Generic single words (data, model, network, learning, detection) do NOT trigger this.
+    The BERTopic keywords are joined into a single string before matching, so a compound
+    term split across two adjacent keywords is still caught; the samples are joined and
+    searched the same way."""
+    # Check the joined keyword string (catches "process mining" split across two keywords)
+    keyword_text = " ".join(keywords).lower()
+    sample_text  = " ".join(samples).lower()
+    return (
+        bool(NOVEL_REGEX_TRIGGERS.search(keyword_text)) or
+        bool(NOVEL_REGEX_TRIGGERS.search(sample_text))
+    )
 # ---------------------------------------------------------------------------
 # Data Classes
     novel_votes = classification_votes.count("NOVEL")
     mapped_votes = classification_votes.count("MAPPED")
+    # Classification decision logic:
+    # - Regex forced (unambiguous compound NOVEL term in keywords/samples) → always NOVEL
+    # - LLM majority (2 or more of 3 LLMs vote NOVEL) → NOVEL
+    # - Single LLM vote for NOVEL + 2 for MAPPED → MAPPED (majority wins)
+    # - All 3 vote MAPPED → MAPPED
+    # This gives ~40-60% NOVEL as expected for TMIS vs PAJAIS 2019 comparison.
+    if forced_novel or novel_votes >= 2:
         final_classification = "NOVEL"
     else:
         final_classification = "MAPPED"