Spaces:
Running
Running
Update agent.py
Browse files
agent.py
CHANGED
|
@@ -61,21 +61,40 @@ PAJAIS_NOT_COVERED = [
|
|
| 61 |
"talent matching", "job-person fit", "HR analytics",
|
| 62 |
]
|
| 63 |
|
| 64 |
-
# Rule-based NOVEL trigger —
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
NOVEL_REGEX_TRIGGERS = re.compile(
|
| 66 |
-
r'\b(
|
| 67 |
-
r'
|
| 68 |
-
r'
|
| 69 |
-
r'
|
| 70 |
-
r'
|
| 71 |
-
r'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
re.IGNORECASE
|
| 73 |
)
|
| 74 |
|
| 75 |
def _is_deterministic_novel(keywords: list[str], samples: list[str]) -> bool:
|
| 76 |
-
"""Non-LLM rule-based check:
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
# ---------------------------------------------------------------------------
|
| 81 |
# Data Classes
|
|
@@ -373,7 +392,13 @@ def interpret_topic(
|
|
| 373 |
novel_votes = classification_votes.count("NOVEL")
|
| 374 |
mapped_votes = classification_votes.count("MAPPED")
|
| 375 |
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
final_classification = "NOVEL"
|
| 378 |
else:
|
| 379 |
final_classification = "MAPPED"
|
|
|
|
| 61 |
"talent matching", "job-person fit", "HR analytics",
|
| 62 |
]
|
| 63 |
|
| 64 |
+
# Rule-based NOVEL trigger — fires ONLY on specific, unambiguous compound/technical terms
# that are definitively absent from PAJAIS 2019.
# Deliberately narrow: single common words like "data", "model", "network", "learning",
# "deep", "smart", "financial", "detection" do NOT trigger this — they exist in PAJAIS.
# Only truly post-2018 or PAJAIS-absent compound terms qualify.
#
# Matching notes:
# - The whole alternation sits between \b ... \b, so every term must begin and end on a
#   word boundary. Terms that end in a bare noun therefore carry an explicit optional
#   plural ("event logs", "smart contracts", "GNNs"); without it the closing \b rejects
#   the plural form. Terms ending in \w* already absorb any suffix.
# - The GPT alternative accepts an optional "chat" prefix and alphanumeric version tags
#   ("ChatGPT", "GPT-4", "gpt4o"), which the outer word boundaries would otherwise reject.
# - Separators inside hyphenated terms are written [\W_]? (one optional non-alphanumeric
#   character) instead of a bare ".", so "covid-19", "covid 19" and "covid19" match while
#   "covidx19" does not.
# - The outer group stays a capturing group so match.group(1) yields the matched term;
#   all inner groups are non-capturing.
NOVEL_REGEX_TRIGGERS = re.compile(
    r'\b('
    r'llms?|(?:chat)?gpts?(?:[\-\s]?\d\w*)?|large\s+language\s+models?|generative\s+ai|'
    r'federat\w*\s+learn\w*|differential\s+privac\w*|dp\-sgd|'
    r'process\s+mining|event\s+logs?|petri\s+nets?|conformance\s+check\w*|'
    r'blockchains?|smart\s+contracts?|defi|tokenomic\w*|'
    r'malware|botnets?|dark\s+web|cyber\s+insur\w*|'
    r'responsible\s+ai|explainab\w*\s+ai|algorithmic\s+bias(?:es)?|xai|'
    r'agentic\s+ai|multi[\W_]?agent\s+orchest\w*|'
    r'graph\s+neural\s+networks?|gnns?|'
    r'retrieval[\W_]?augment\w*|prompt\s+engineer\w*|rag|'
    r'talent\s+match\w*|job[\W_]?person\s+fit|'
    r'covid[\W_]?19|pandemic\s+inform\w*'
    r')\b',
    re.IGNORECASE,
)
|
| 85 |
|
| 86 |
def _is_deterministic_novel(keywords: list[str], samples: list[str]) -> bool:
    """Return True when a rule-based NOVEL trigger term occurs in the topic.

    Non-LLM check against ``NOVEL_REGEX_TRIGGERS``: it fires only on specific,
    unambiguous NOVEL compound terms. Generic single words (data, model,
    network, learning, detection) do NOT trigger it.

    Args:
        keywords: Topic keywords (from BERTopic). They are joined with single
            spaces before matching so that a compound term split across two
            adjacent keywords (e.g. "process", "mining") is still detected.
        samples: Sample texts for the topic. Each one is searched on its own,
            so a compound term cannot be formed by accident across the
            boundary between two unrelated samples.

    Returns:
        True if the trigger pattern matches the joined keywords or any single
        sample, otherwise False.
    """
    # Keywords are deliberately matched as one joined string (see Args).
    if NOVEL_REGEX_TRIGGERS.search(" ".join(keywords).lower()):
        return True
    # Samples are independent texts: search them one at a time and stop at
    # the first hit instead of concatenating them into one large string.
    return any(NOVEL_REGEX_TRIGGERS.search(sample.lower()) for sample in samples)
|
| 98 |
|
| 99 |
# ---------------------------------------------------------------------------
|
| 100 |
# Data Classes
|
|
|
|
| 392 |
novel_votes = classification_votes.count("NOVEL")
|
| 393 |
mapped_votes = classification_votes.count("MAPPED")
|
| 394 |
|
| 395 |
+
# Classification decision logic:
|
| 396 |
+
# - Regex forced (unambiguous compound NOVEL term in keywords/samples) → always NOVEL
|
| 397 |
+
# - LLM majority (2 or more of 3 LLMs vote NOVEL) → NOVEL
|
| 398 |
+
# - Single LLM vote for NOVEL + 2 for MAPPED → MAPPED (majority wins)
|
| 399 |
+
# - All 3 vote MAPPED → MAPPED
|
| 400 |
+
# This gives ~40-60% NOVEL as expected for TMIS vs PAJAIS 2019 comparison.
|
| 401 |
+
if forced_novel or novel_votes >= 2:
|
| 402 |
final_classification = "NOVEL"
|
| 403 |
else:
|
| 404 |
final_classification = "MAPPED"
|