Spaces:

Firemedic15
/

OSINTTool

Sleeping

App Files Files Community

OSINTTool / tools.py

Firemedic15

Update tools.py

cc6e7c5 verified about 2 months ago

raw

history blame contribute delete

10.6 kB

	"""
	tools.py — OSINT data source tools for the agentic analyst loop.

	Required Space Secrets:
	ACLED_USERNAME — your myACLED email address
	ACLED_PASSWORD — your myACLED password
	"""

	import html
	import os
	import re
	import threading
	import time
	from datetime import datetime, timedelta, timezone

	import feedparser
	import requests
	from smolagents import tool

	# ---------------------------------------------------------------------------
	# ACLED OAuth token cache
	# ---------------------------------------------------------------------------

	# Process-wide cache for the ACLED bearer token.
	#   access_token: the cached token string, or None before the first fetch
	#   expires_at:   time.time() value after which the token must be refreshed
	#   lock:         guards reads/writes of this cache across threads
	_token_cache = dict(
	    access_token=None,
	    expires_at=0,
	    lock=threading.Lock(),
	)

	# OAuth token endpoint and event-read endpoint of the ACLED API.
	ACLED_TOKEN_URL = "https://acleddata.com/oauth/token"
	ACLED_BASE = "https://acleddata.com/api/acled/read"


	def _get_acled_token() -> str:
	    """Return a valid ACLED OAuth access token, requesting a new one if needed.

	    The token is kept in ``_token_cache`` and reused until five minutes before
	    its reported expiry. The whole check-and-refresh sequence runs while
	    holding the cache lock, so concurrent callers wait for a single refresh
	    instead of each requesting their own token.

	    Returns:
	        The bearer token string.

	    Raises:
	        EnvironmentError: If ACLED_USERNAME / ACLED_PASSWORD are not set, the
	            token endpoint answers with a non-200 status, or the response
	            body is not JSON containing an ``access_token``.
	        requests.RequestException: If the HTTP request itself fails.
	    """
	    with _token_cache["lock"]:
	        now = time.time()
	        if _token_cache["access_token"] and now < _token_cache["expires_at"]:
	            return _token_cache["access_token"]

	        username = os.environ.get("ACLED_USERNAME", "").strip()
	        password = os.environ.get("ACLED_PASSWORD", "").strip()

	        if not username or not password:
	            raise EnvironmentError(
	                "ACLED credentials missing. Add ACLED_USERNAME and ACLED_PASSWORD "
	                "as Space secrets under Settings -> Variables and Secrets."
	            )

	        resp = requests.post(
	            ACLED_TOKEN_URL,
	            headers={"Content-Type": "application/x-www-form-urlencoded"},
	            data={
	                "username": username,
	                "password": password,
	                "grant_type": "password",
	                "client_id": "acled",
	            },
	            timeout=15,
	        )

	        if resp.status_code != 200:
	            raise EnvironmentError(
	                f"ACLED token request failed ({resp.status_code}): {resp.text[:200]}"
	            )

	        # Callers only handle EnvironmentError and requests.RequestException,
	        # so a malformed 200 response is reported as EnvironmentError rather
	        # than leaking ValueError / KeyError out of the tool.
	        try:
	            token_data = resp.json()
	        except ValueError as err:
	            raise EnvironmentError(
	                f"ACLED token response was not valid JSON: {resp.text[:200]}"
	            ) from err

	        access_token = (
	            token_data.get("access_token") if isinstance(token_data, dict) else None
	        )
	        if not access_token:
	            raise EnvironmentError(
	                "ACLED token response did not contain an access_token."
	            )

	        try:
	            lifetime = float(token_data.get("expires_in", 86400))
	        except (TypeError, ValueError):
	            lifetime = 86400.0

	        _token_cache["access_token"] = access_token
	        # Refresh 300 s early; never schedule the expiry before "now".
	        _token_cache["expires_at"] = now + max(lifetime - 300, 0)
	        return access_token


	def _strip_html(text: str) -> str:
	    """Convert an HTML fragment to single-line plain text.

	    Tags are replaced by spaces, HTML character references such as ``&amp;``
	    or ``&#8217;`` are decoded, and every run of whitespace (including
	    non-breaking spaces) is collapsed to one space.

	    Args:
	        text: HTML or plain text. ``None`` or an empty string yields ``""``.

	    Returns:
	        The cleaned text with no leading or trailing whitespace.
	    """
	    if not text:
	        return ""
	    without_tags = re.sub(r"<[^>]+>", " ", text)
	    # Decode entities only after tag removal, so escaped markup such as
	    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
	    decoded = html.unescape(without_tags)
	    return re.sub(r"\s+", " ", decoded).strip()


	# ---------------------------------------------------------------------------
	# ACLED Tool
	# ---------------------------------------------------------------------------

	def _acled_int(value, default: int = 0) -> int:
	    """Return *value* as an ``int``, or *default* if it is missing or malformed."""
	    try:
	        return int(value)
	    except (TypeError, ValueError):
	        return default


	def _format_acled_event(ev: dict, fatalities: int) -> str:
	    """Render one ACLED event record as a single pipe-delimited summary line."""
	    actor2_str = f" vs {ev['actor2']}" if ev.get("actor2") else ""
	    # ``or`` instead of a .get() default: a field that is present but null
	    # must not break the slice below.
	    notes = (ev.get("notes") or "")[:120]
	    return (
	        f"* {ev.get('event_date', '?')} | {ev.get('event_type', '?')} / {ev.get('sub_event_type', '')} | "
	        f"{ev.get('location', '?')}, {ev.get('admin1', '?')} | "
	        f"{ev.get('actor1', '?')}{actor2_str} | "
	        f"Fatalities: {fatalities} | "
	        f"Notes: {notes}"
	    )


	@tool
	def fetch_acled_events(country: str, days_back: int = 14, limit: int = 25) -> str:
	    """
	    Fetches recent armed conflict events from ACLED for a given country.
	    Returns dates, locations, actor names, event types, and fatality counts.
	    Failures are reported as a '[ACLED] ...' message instead of raising.

	    Args:
	        country: Country name to query (e.g. 'Sudan', 'Ukraine', 'Mexico').
	        days_back: How many days back to search (default 14).
	        limit: Maximum number of events to return (default 25, max 50).
	    """
	    try:
	        token = _get_acled_token()
	    except EnvironmentError as e:
	        return f"[ACLED] Auth error: {e}"
	    except requests.RequestException as e:
	        return f"[ACLED] Failed to obtain token: {e}"

	    # Normalise the numeric arguments: tolerate strings, forbid a negative
	    # look-back window, and keep the page size within 1..50.
	    days_back = max(_acled_int(days_back, 14), 0)
	    limit = max(1, min(_acled_int(limit, 25), 50))

	    since = (datetime.now(timezone.utc) - timedelta(days=days_back)).strftime("%Y-%m-%d")

	    params = {
	        "country": country,
	        "event_date": since,
	        "event_date_where": ">=",
	        "limit": limit,
	        # Field names are separated by a plain pipe; a backslash-escaped
	        # pipe would be sent to the API literally.
	        "fields": "event_date|event_type|sub_event_type|actor1|actor2|location|admin1|fatalities|notes",
	        "_format": "json",
	    }

	    headers = {
	        "Authorization": f"Bearer {token}",
	        "Content-Type": "application/json",
	    }

	    try:
	        resp = requests.get(ACLED_BASE, params=params, headers=headers, timeout=15)
	        resp.raise_for_status()
	        data = resp.json()
	    except requests.RequestException as e:
	        return f"[ACLED] Request failed: {e}"
	    except ValueError as e:
	        # Older ``requests`` versions raise a plain ValueError subclass for a
	        # non-JSON body, which RequestException does not cover.
	        return f"[ACLED] Response was not valid JSON: {e}"

	    if not isinstance(data, dict):
	        return f"[ACLED] API error: unexpected response {str(data)[:200]}"
	    if data.get("status") != 200:
	        return f"[ACLED] API error: {data.get('error', data)}"

	    events = data.get("data") or []
	    if not events:
	        return f"[ACLED] No events found for '{country}' in the last {days_back} days."

	    lines = [f"[ACLED] {len(events)} events in {country} (last {days_back} days):\n"]
	    total_fatalities = 0

	    for ev in events:
	        fatalities = _acled_int(ev.get("fatalities"))
	        total_fatalities += fatalities
	        lines.append(_format_acled_event(ev, fatalities))

	    lines.append(f"\nTotal reported fatalities: {total_fatalities}")
	    return "\n".join(lines)


	# ---------------------------------------------------------------------------
	# RSS Tool — returns structured JSON-like records for report inclusion
	# ---------------------------------------------------------------------------

	# Short source key -> RSS feed URL. The keys are what callers pass in the
	# comma-separated ``sources`` argument of fetch_rss_headlines.
	RSS_FEED_REGISTRY = dict(
	    reuters_world="https://feeds.reuters.com/reuters/worldNews",
	    bbc_world="https://feeds.bbci.co.uk/news/world/rss.xml",
	    al_jazeera="https://www.aljazeera.com/xml/rss/all.xml",
	    bellingcat="https://www.bellingcat.com/feed/",
	    crisis_group="https://www.crisisgroup.org/rss.xml",
	    acled_blog="https://acleddata.com/feed/",
	    un_news="https://news.un.org/feed/subscribe/en/news/feed/rss.xml",
	    foreign_policy="https://foreignpolicy.com/feed/",
	)

	# Maximum number of entries examined per feed when filtering by topic.
	SCAN_LIMIT = 50

	# Signal words that bump an article to "notable"
	NOTABLE_SIGNALS = (
	    ["killed", "dead", "deaths", "fatalities", "massacre", "attack", "attacked"]
	    + ["explosion", "bomb", "bombing", "shooting", "gunfire", "clash", "clashes"]
	    + ["offensive", "invasion", "coup", "crisis", "emergency", "arrest", "arrested"]
	    + ["protest", "riot", "siege", "hostage", "kidnap", "cartel", "militia"]
	    + ["sanctions", "airstrike", "drone", "ceasefire", "peace", "agreement"]
	    + ["earthquake", "flood", "disaster", "outbreak", "epidemic"]
	)


	def _is_notable(title: str, summary: str) -> bool:
	    """Report whether an article's title or summary mentions any signal word.

	    Matching is a case-insensitive substring test against each entry of
	    ``NOTABLE_SIGNALS``, so a signal also matches inside a longer word.
	    """
	    haystack = " ".join((title, summary)).lower()
	    for signal in NOTABLE_SIGNALS:
	        if signal in haystack:
	            return True
	    return False


	# Upper bound, in seconds, on how long a single feed download may take.
	_RSS_TIMEOUT_SECONDS = 15


	def _topic_keywords(topic: str) -> list:
	    """Split *topic* into lowercase keywords used for substring matching.

	    Words of one or two characters are dropped as noise. If that would leave
	    nothing (e.g. 'UK' or 'US'), all words are kept so the topic can still
	    match something.
	    """
	    words = topic.lower().split()
	    return [w for w in words if len(w) > 2] or words


	def _download_feed(url: str):
	    """Download *url* with a bounded wait and parse the body with feedparser.

	    ``feedparser.parse`` offers no timeout parameter, so the HTTP request is
	    made with ``requests`` and only the parsing is left to feedparser.

	    Raises:
	        requests.RequestException: On connection errors, timeouts, or an
	            HTTP error status.
	    """
	    resp = requests.get(
	        url,
	        headers={"User-Agent": feedparser.USER_AGENT},
	        timeout=_RSS_TIMEOUT_SECONDS,
	    )
	    resp.raise_for_status()

	    # Pass the HTTP metadata along so feedparser can use the declared charset
	    # and resolve relative links against the feed's final URL.
	    response_headers = {"content-location": resp.url}
	    content_type = resp.headers.get("Content-Type")
	    if content_type:
	        response_headers["content-type"] = content_type
	    return feedparser.parse(resp.content, response_headers=response_headers)


	def _format_article(index: int, article: dict) -> str:
	    """Render one collected article record as a numbered multi-line entry."""
	    flag = " * NOTABLE *" if article["notable"] else ""
	    return (
	        f"[{index}] {article['source_name']} | {article['published']}{flag}\n"
	        f" Title: {article['title']}\n"
	        f" Summary: {article['summary']}\n"
	        f" URL: {article['url']}\n"
	        f" Notable: {article['notable']}"
	    )


	@tool
	def fetch_rss_headlines(
	    topic: str,
	    sources: str = "reuters_world,bbc_world,al_jazeera",
	    max_articles: int = 20,
	) -> str:
	    """
	    Fetches recent RSS news headlines related to a topic or region.
	    Returns structured article records including title, source, date, summary,
	    URL, and a 'notable' flag for high-signal security/conflict articles.
	    The notable flag should be used to select articles for inclusion in the
	    final threat brief's news section.

	    Args:
	        topic: Keyword or region to filter headlines (e.g. 'Mexico', 'Sudan').
	            Single keywords work best.
	        sources: Comma-separated source keys. Available: reuters_world, bbc_world,
	            al_jazeera, bellingcat, crisis_group, acled_blog, un_news, foreign_policy.
	        max_articles: Maximum total articles to return across all sources (default 20).
	    """
	    source_keys = [s.strip() for s in sources.split(",") if s.strip()]
	    keywords = _topic_keywords(topic)
	    if not keywords:
	        # Nothing to match against: answer immediately instead of downloading
	        # every feed only to reject all of its entries.
	        return (
	            f"[RSS] No articles matched '{topic}'. topic contains no searchable keywords\n"
	            "Tip: Try a shorter single-word keyword (e.g. 'Mexico' not 'Mexico violence')."
	        )

	    # A zero or negative cap would silently return nothing; allow at least one.
	    max_articles = max(int(max_articles), 1)
	    articles = []
	    feed_errors = []

	    for key in source_keys:
	        if len(articles) >= max_articles:
	            break

	        url = RSS_FEED_REGISTRY.get(key)
	        if not url:
	            feed_errors.append(f"Unknown source key: '{key}'")
	            continue

	        # Best effort per feed: one broken source is recorded as a warning
	        # and must not abort the remaining sources.
	        try:
	            feed = _download_feed(url)
	        except Exception as e:
	            feed_errors.append(f"[{key}] Exception: {e}")
	            continue

	        if feed.bozo and not feed.entries:
	            feed_errors.append(
	                f"[{key}] Feed parse error: {feed.get('bozo_exception', 'unknown error')}"
	            )
	            continue

	        source_name = feed.feed.get("title", key)

	        for entry in feed.entries[:SCAN_LIMIT]:
	            if len(articles) >= max_articles:
	                break

	            title = (entry.get("title") or "").strip()
	            raw_summary = entry.get("summary") or entry.get("description") or ""
	            summary = _strip_html(raw_summary)[:300]
	            published = entry.get("published", entry.get("updated", ""))
	            link = entry.get("link", "")

	            searchable = (title + " " + summary).lower()
	            if not any(kw in searchable for kw in keywords):
	                continue

	            articles.append({
	                "source_key": key,
	                "source_name": source_name,
	                "published": published,
	                "title": title,
	                "summary": summary,
	                "url": link,
	                "notable": _is_notable(title, summary),
	            })

	        # Short pause before requesting the next feed.
	        time.sleep(0.3)

	    if not articles:
	        err_detail = "; ".join(feed_errors) if feed_errors else "no entries matched"
	        return (
	            f"[RSS] No articles matched '{topic}'. {err_detail}\n"
	            "Tip: Try a shorter single-word keyword (e.g. 'Mexico' not 'Mexico violence')."
	        )

	    # Format output clearly for the agent
	    notable_count = sum(1 for a in articles if a["notable"])
	    lines = [
	        f"[RSS] {len(articles)} articles found for '{topic}':\n",
	        f"Notable (high-signal) articles: {notable_count} of {len(articles)}\n",
	    ]
	    lines.extend(_format_article(i, a) for i, a in enumerate(articles, 1))

	    if feed_errors:
	        lines.append("\n--- Feed warnings ---")
	        lines.extend(feed_errors)

	    return "\n\n".join(lines)


	# ---------------------------------------------------------------------------
	# Helper tool
	# ---------------------------------------------------------------------------

	@tool
	def list_available_sources() -> str:
	    """
	    Returns a list of all available RSS feed source keys and their URLs.

	    Args: None
	    """
	    # The docstring above is left untouched because the @tool decorator
	    # consumes it at runtime. The result is a header line, one bullet per
	    # RSS_FEED_REGISTRY entry in registry order, then a closing note that
	    # ACLED is available as well.
	    bullets = [f" * {key}: {url}" for key, url in RSS_FEED_REGISTRY.items()]
	    report = [
	        "Available RSS sources:",
	        *bullets,
	        "\nACLED is also available for structured armed conflict event data.",
	    ]
	    return "\n".join(report)