Spaces:

hardik1231312
/

conflictData

Running

App Files Files Community

Hardik Singh committed on Apr 24

Commit

970f66d

1 Parent(s): 09db715

data logging

Browse files

Files changed (2) hide show

poller/classifier.py +15 -3
poller/training_logger.py +40 -0

poller/classifier.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import Dict, Any, Tuple, List, Optional
 from groq import AsyncGroq
 import google.generativeai as genai
 from huggingface_hub import AsyncInferenceClient
 log = logging.getLogger(__name__)
@@ -459,11 +460,13 @@ async def classify_event_llm(title: str, summary: str = "") -> Dict[str, Any]:
     # TIER 0: Smart Regex — handles ~60% of articles (FREE)
     regex_result = smart_regex_classify(title, summary)
     if regex_result:
         return regex_result
     # TIER 1: Zero-Shot Classification — handles ~25% of articles (FREE)
     zs_result = await classify_with_zero_shot(title)
     if zs_result:
         return zs_result
     # TIER 2: LLM — only ~15% of articles need this (COSTS TOKENS)
@@ -497,15 +500,24 @@ Return a JSON object with these fields:
     # ENGINE TIER 2A: GROQ
     res = await classify_with_groq(prompt)
-    if res: return parse_llm_res(res, "GROQ")
     # ENGINE TIER 2B: GEMINI
     res = await classify_with_gemini(prompt)
-    if res: return parse_llm_res(res, "GEMINI")
     # ENGINE TIER 2C: HF INFERENCE (Chat)
     res = await classify_with_hf(prompt)
-    if res: return parse_llm_res(res, "HF-API")
     # ABSOLUTE FALLBACK: Basic regex (shouldn't reach here often)
     cat, sev, tags, etype = classify_event(title, summary)

 from groq import AsyncGroq
 import google.generativeai as genai
 from huggingface_hub import AsyncInferenceClient
+from poller.training_logger import log_training_example
 log = logging.getLogger(__name__)
     # TIER 0: Smart Regex — handles ~60% of articles (FREE)
     regex_result = smart_regex_classify(title, summary)
     if regex_result:
+        log_training_example(title, summary, regex_result)
         return regex_result
     # TIER 1: Zero-Shot Classification — handles ~25% of articles (FREE)
     zs_result = await classify_with_zero_shot(title)
     if zs_result:
+        log_training_example(title, summary, zs_result)
         return zs_result
     # TIER 2: LLM — only ~15% of articles need this (COSTS TOKENS)
     # ENGINE TIER 2A: GROQ
     res = await classify_with_groq(prompt)
+    if res:
+        result = parse_llm_res(res, "GROQ")
+        log_training_example(title, summary, result)
+        return result
     # ENGINE TIER 2B: GEMINI
     res = await classify_with_gemini(prompt)
+    if res:
+        result = parse_llm_res(res, "GEMINI")
+        log_training_example(title, summary, result)
+        return result
     # ENGINE TIER 2C: HF INFERENCE (Chat)
     res = await classify_with_hf(prompt)
+    if res:
+        result = parse_llm_res(res, "HF-API")
+        log_training_example(title, summary, result)
+        return result
     # ABSOLUTE FALLBACK: Basic regex (shouldn't reach here often)
     cat, sev, tags, etype = classify_event(title, summary)

poller/training_logger.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import json
+import os
+import logging
+from datetime import datetime, timezone
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+# JSONL file that training examples are appended to; it sits in the same
+# directory as this module file.
+TRAINING_DATA_FILE = os.path.join(os.path.dirname(__file__), "training_data.jsonl")
+def log_training_example(title: str, summary: str, result: dict):
+    """Append one classified event to the JSONL training-data file.
+
+    Each call writes a single JSON object on its own line of
+    TRAINING_DATA_FILE, intended as a training example for future
+    DistilBERT fine-tuning. Logging is best-effort: unusable input or a
+    failed write is reported at debug level and never raised to the caller.
+
+    Args:
+        title: Event headline; stored stripped under "text". None is
+            treated as an empty string.
+        summary: Optional event summary; stripped and truncated to 200
+            characters. None is treated as an empty string.
+        result: Classification dict. A truthy "is_noise" marks the example
+            NOT_CONFLICT / NOISE; otherwise it is CONFLICT with the dict's
+            "category" (default "GENERAL").
+    """
+    # The classifier must not crash just because its result cannot be logged.
+    if not isinstance(result, dict):
+        log.debug("Training logger skipped non-dict result: %r", result)
+        return
+    if result.get("is_noise"):
+        label = "NOT_CONFLICT"
+        category = "NOISE"
+    else:
+        label = "CONFLICT"
+        category = result.get("category", "GENERAL")
+    example = {
+        # Guard title the same way summary is guarded, so None does not raise.
+        "text": (title or "").strip(),
+        "summary": (summary or "").strip()[:200],
+        "label": label,
+        "category": category,
+        "event_type": result.get("event_type", "Other"),
+        "severity_score": result.get("severity_score", 0),
+        "country": result.get("country"),
+        "actor1": result.get("actor1"),
+        "weapon": result.get("weapon"),
+        "provider": result.get("provider", "UNKNOWN"),
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+    }
+    try:
+        with open(TRAINING_DATA_FILE, "a", encoding="utf-8") as f:
+            f.write(json.dumps(example, ensure_ascii=False) + "\n")
+    except Exception as e:
+        # Deliberately broad: covers both I/O errors and values that
+        # json.dumps cannot serialize; a failed log line is not fatal.
+        log.debug("Training logger write failed: %s", e)