Hardik Singh committed on
Commit
970f66d
·
1 Parent(s): 09db715

data logging

Browse files
Files changed (2) hide show
  1. poller/classifier.py +15 -3
  2. poller/training_logger.py +40 -0
poller/classifier.py CHANGED
@@ -7,6 +7,7 @@ from typing import Dict, Any, Tuple, List, Optional
7
  from groq import AsyncGroq
8
  import google.generativeai as genai
9
  from huggingface_hub import AsyncInferenceClient
 
10
 
11
  log = logging.getLogger(__name__)
12
 
@@ -459,11 +460,13 @@ async def classify_event_llm(title: str, summary: str = "") -> Dict[str, Any]:
459
  # TIER 0: Smart Regex — handles ~60% of articles (FREE)
460
  regex_result = smart_regex_classify(title, summary)
461
  if regex_result:
 
462
  return regex_result
463
 
464
  # TIER 1: Zero-Shot Classification — handles ~25% of articles (FREE)
465
  zs_result = await classify_with_zero_shot(title)
466
  if zs_result:
 
467
  return zs_result
468
 
469
  # TIER 2: LLM — only ~15% of articles need this (COSTS TOKENS)
@@ -497,15 +500,24 @@ Return a JSON object with these fields:
497
 
498
  # ENGINE TIER 2A: GROQ
499
  res = await classify_with_groq(prompt)
500
- if res: return parse_llm_res(res, "GROQ")
 
 
 
501
 
502
  # ENGINE TIER 2B: GEMINI
503
  res = await classify_with_gemini(prompt)
504
- if res: return parse_llm_res(res, "GEMINI")
 
 
 
505
 
506
  # ENGINE TIER 2C: HF INFERENCE (Chat)
507
  res = await classify_with_hf(prompt)
508
- if res: return parse_llm_res(res, "HF-API")
 
 
 
509
 
510
  # ABSOLUTE FALLBACK: Basic regex (shouldn't reach here often)
511
  cat, sev, tags, etype = classify_event(title, summary)
 
7
  from groq import AsyncGroq
8
  import google.generativeai as genai
9
  from huggingface_hub import AsyncInferenceClient
10
+ from poller.training_logger import log_training_example
11
 
12
  log = logging.getLogger(__name__)
13
 
 
460
  # TIER 0: Smart Regex — handles ~60% of articles (FREE)
461
  regex_result = smart_regex_classify(title, summary)
462
  if regex_result:
463
+ log_training_example(title, summary, regex_result)
464
  return regex_result
465
 
466
  # TIER 1: Zero-Shot Classification — handles ~25% of articles (FREE)
467
  zs_result = await classify_with_zero_shot(title)
468
  if zs_result:
469
+ log_training_example(title, summary, zs_result)
470
  return zs_result
471
 
472
  # TIER 2: LLM — only ~15% of articles need this (COSTS TOKENS)
 
500
 
501
  # ENGINE TIER 2A: GROQ
502
  res = await classify_with_groq(prompt)
503
+ if res:
504
+ result = parse_llm_res(res, "GROQ")
505
+ log_training_example(title, summary, result)
506
+ return result
507
 
508
  # ENGINE TIER 2B: GEMINI
509
  res = await classify_with_gemini(prompt)
510
+ if res:
511
+ result = parse_llm_res(res, "GEMINI")
512
+ log_training_example(title, summary, result)
513
+ return result
514
 
515
  # ENGINE TIER 2C: HF INFERENCE (Chat)
516
  res = await classify_with_hf(prompt)
517
+ if res:
518
+ result = parse_llm_res(res, "HF-API")
519
+ log_training_example(title, summary, result)
520
+ return result
521
 
522
  # ABSOLUTE FALLBACK: Basic regex (shouldn't reach here often)
523
  cat, sev, tags, etype = classify_event(title, summary)
poller/training_logger.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import logging
4
+ from datetime import datetime, timezone
5
+
6
+ log = logging.getLogger(__name__)
7
+
8
+ TRAINING_DATA_FILE = os.path.join(os.path.dirname(__file__), "training_data.jsonl")
9
+
10
+ def log_training_example(title: str, summary: str, result: dict):
11
+ """
12
+ Appends a classified event as a training example for future DistilBERT fine-tuning.
13
+ Writes one JSON object per line (JSONL format).
14
+ """
15
+ if result.get("is_noise"):
16
+ label = "NOT_CONFLICT"
17
+ category = "NOISE"
18
+ else:
19
+ label = "CONFLICT"
20
+ category = result.get("category", "GENERAL")
21
+
22
+ example = {
23
+ "text": title.strip(),
24
+ "summary": (summary or "").strip()[:200],
25
+ "label": label,
26
+ "category": category,
27
+ "event_type": result.get("event_type", "Other"),
28
+ "severity_score": result.get("severity_score", 0),
29
+ "country": result.get("country"),
30
+ "actor1": result.get("actor1"),
31
+ "weapon": result.get("weapon"),
32
+ "provider": result.get("provider", "UNKNOWN"),
33
+ "timestamp": datetime.now(timezone.utc).isoformat(),
34
+ }
35
+
36
+ try:
37
+ with open(TRAINING_DATA_FILE, "a", encoding="utf-8") as f:
38
+ f.write(json.dumps(example, ensure_ascii=False) + "\n")
39
+ except Exception as e:
40
+ log.debug(f"Training logger write failed: {e}")