Spaces:
Running
Running
Hardik Singh committed on
Commit ·
970f66d
1
Parent(s): 09db715
data logging
Browse files- poller/classifier.py +15 -3
- poller/training_logger.py +40 -0
poller/classifier.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import Dict, Any, Tuple, List, Optional
|
|
| 7 |
from groq import AsyncGroq
|
| 8 |
import google.generativeai as genai
|
| 9 |
from huggingface_hub import AsyncInferenceClient
|
|
|
|
| 10 |
|
| 11 |
log = logging.getLogger(__name__)
|
| 12 |
|
|
@@ -459,11 +460,13 @@ async def classify_event_llm(title: str, summary: str = "") -> Dict[str, Any]:
|
|
| 459 |
# TIER 0: Smart Regex — handles ~60% of articles (FREE)
|
| 460 |
regex_result = smart_regex_classify(title, summary)
|
| 461 |
if regex_result:
|
|
|
|
| 462 |
return regex_result
|
| 463 |
|
| 464 |
# TIER 1: Zero-Shot Classification — handles ~25% of articles (FREE)
|
| 465 |
zs_result = await classify_with_zero_shot(title)
|
| 466 |
if zs_result:
|
|
|
|
| 467 |
return zs_result
|
| 468 |
|
| 469 |
# TIER 2: LLM — only ~15% of articles need this (COSTS TOKENS)
|
|
@@ -497,15 +500,24 @@ Return a JSON object with these fields:
|
|
| 497 |
|
| 498 |
# ENGINE TIER 2A: GROQ
|
| 499 |
res = await classify_with_groq(prompt)
|
| 500 |
-
if res:
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
# ENGINE TIER 2B: GEMINI
|
| 503 |
res = await classify_with_gemini(prompt)
|
| 504 |
-
if res:
|
|
|
|
|
|
|
|
|
|
| 505 |
|
| 506 |
# ENGINE TIER 2C: HF INFERENCE (Chat)
|
| 507 |
res = await classify_with_hf(prompt)
|
| 508 |
-
if res:
|
|
|
|
|
|
|
|
|
|
| 509 |
|
| 510 |
# ABSOLUTE FALLBACK: Basic regex (shouldn't reach here often)
|
| 511 |
cat, sev, tags, etype = classify_event(title, summary)
|
|
|
|
| 7 |
from groq import AsyncGroq
|
| 8 |
import google.generativeai as genai
|
| 9 |
from huggingface_hub import AsyncInferenceClient
|
| 10 |
+
from poller.training_logger import log_training_example
|
| 11 |
|
| 12 |
log = logging.getLogger(__name__)
|
| 13 |
|
|
|
|
| 460 |
# TIER 0: Smart Regex — handles ~60% of articles (FREE)
|
| 461 |
regex_result = smart_regex_classify(title, summary)
|
| 462 |
if regex_result:
|
| 463 |
+
log_training_example(title, summary, regex_result)
|
| 464 |
return regex_result
|
| 465 |
|
| 466 |
# TIER 1: Zero-Shot Classification — handles ~25% of articles (FREE)
|
| 467 |
zs_result = await classify_with_zero_shot(title)
|
| 468 |
if zs_result:
|
| 469 |
+
log_training_example(title, summary, zs_result)
|
| 470 |
return zs_result
|
| 471 |
|
| 472 |
# TIER 2: LLM — only ~15% of articles need this (COSTS TOKENS)
|
|
|
|
| 500 |
|
| 501 |
# ENGINE TIER 2A: GROQ
|
| 502 |
res = await classify_with_groq(prompt)
|
| 503 |
+
if res:
|
| 504 |
+
result = parse_llm_res(res, "GROQ")
|
| 505 |
+
log_training_example(title, summary, result)
|
| 506 |
+
return result
|
| 507 |
|
| 508 |
# ENGINE TIER 2B: GEMINI
|
| 509 |
res = await classify_with_gemini(prompt)
|
| 510 |
+
if res:
|
| 511 |
+
result = parse_llm_res(res, "GEMINI")
|
| 512 |
+
log_training_example(title, summary, result)
|
| 513 |
+
return result
|
| 514 |
|
| 515 |
# ENGINE TIER 2C: HF INFERENCE (Chat)
|
| 516 |
res = await classify_with_hf(prompt)
|
| 517 |
+
if res:
|
| 518 |
+
result = parse_llm_res(res, "HF-API")
|
| 519 |
+
log_training_example(title, summary, result)
|
| 520 |
+
return result
|
| 521 |
|
| 522 |
# ABSOLUTE FALLBACK: Basic regex (shouldn't reach here often)
|
| 523 |
cat, sev, tags, etype = classify_event(title, summary)
|
poller/training_logger.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import logging
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
|
| 6 |
+
log = logging.getLogger(__name__)

# JSONL file that accumulates training examples; it lives next to this module.
TRAINING_DATA_FILE = os.path.join(os.path.dirname(__file__), "training_data.jsonl")


def log_training_example(title: str, summary: str, result: dict) -> None:
    """Append a classified event as one JSON line to ``TRAINING_DATA_FILE``.

    The record is intended as a training example for future DistilBERT
    fine-tuning. Logging is best-effort: this function never raises, so a
    problem while recording an example cannot interrupt the caller.

    Args:
        title: Headline of the article; stored stripped under ``"text"``.
            ``None`` is treated as an empty string.
        summary: Optional article summary; stripped and truncated to 200
            characters. ``None`` is treated as an empty string.
        result: Classification dict. The keys read are ``is_noise``,
            ``category``, ``event_type``, ``severity_score``, ``country``,
            ``actor1``, ``weapon`` and ``provider``; missing keys fall back
            to the defaults used below. Anything that is not a dict is
            skipped without writing a record.
    """
    # NOTE(review): the classifiers that produce `result` are not visible
    # here; guard in case one of them returns None on a parse failure.
    if not isinstance(result, dict):
        log.debug(
            "Training logger skipped: result is %s, not a dict",
            type(result).__name__,
        )
        return

    if result.get("is_noise"):
        # Noise overrides whatever category the classifier reported.
        label, category = "NOT_CONFLICT", "NOISE"
    else:
        label, category = "CONFLICT", result.get("category", "GENERAL")

    example = {
        # Guard title the same way summary is guarded: None must not raise.
        "text": (title or "").strip(),
        "summary": (summary or "").strip()[:200],
        "label": label,
        "category": category,
        "event_type": result.get("event_type", "Other"),
        "severity_score": result.get("severity_score", 0),
        "country": result.get("country"),
        "actor1": result.get("actor1"),
        "weapon": result.get("weapon"),
        "provider": result.get("provider", "UNKNOWN"),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    try:
        # Serialize before opening the file so that a non-serializable value
        # never leaves a partial line behind.
        line = json.dumps(example, ensure_ascii=False) + "\n"
        with open(TRAINING_DATA_FILE, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception as e:
        # Deliberately broad: training-data capture is optional, so both I/O
        # and serialization failures are logged and swallowed.
        log.debug("Training logger write failed: %s", e)
|