""" Wikidata Integration for Civic Engagement Data Wikidata is a collaborative knowledge base that powers Wikipedia's infoboxes. It is the BEST FREE SOURCE for connecting people to organizations and locations. SPARQL Endpoint: https://query.wikidata.org/sparql REST API: https://www.wikidata.org/w/api.php Query Service: https://query.wikidata.org/ KEY ADVANTAGES: ✅ Completely FREE - no API key required ✅ Highly interconnected - find person → see all linked organizations ✅ Structured data - triples (subject-predicate-object) ✅ Real Wikipedia data - millions of entities ✅ SPARQL queries - powerful graph queries USE CASES FOR CIVIC ENGAGEMENT: - Find all members of school boards in a state - Find all mayors in a county - Link people to their organizations - Discover city council members - Get organizational hierarchies EXAMPLE QUERIES: - "All school board members in Alabama" - "All cities in Tuscaloosa County" - "All elected officials in a city" - "Organizations a person is affiliated with" API DOCUMENTATION: - SPARQL: https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service - REST API: https://www.wikidata.org/w/api.php - Query Examples: https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples USAGE: from discovery.wikidata_integration import WikidataQuery wikidata = WikidataQuery() # Find school board members in Alabama members = await wikidata.find_school_board_members(state="Alabama") # Find all cities in a county cities = await wikidata.find_cities_in_county("Tuscaloosa County", "Alabama") # Find organizations a person is affiliated with orgs = await wikidata.find_person_organizations("Walt Maddox") """ import asyncio from typing import List, Dict, Optional, Any from datetime import datetime from pathlib import Path import httpx from loguru import logger try: from pyspark.sql import SparkSession from config.settings import settings SPARK_AVAILABLE = True except ImportError: SPARK_AVAILABLE = False settings = None class WikidataQuery: """ Query Wikidata using SPARQL for civic engagement data. Wikidata is completely FREE and provides structured knowledge about people, organizations, and places. """ SPARQL_ENDPOINT = "https://query.wikidata.org/sparql" REST_API = "https://www.wikidata.org/w/api.php" # Wikidata property IDs (for SPARQL queries) PROPERTIES = { "instance_of": "P31", # What type of thing is this? "position_held": "P39", # What position does this person hold? "member_of": "P463", # What organization is this person a member of? "location": "P276", # Where is this located? "located_in": "P131", # Administrative territory "country": "P17", # Country "state": "P131", # State/province "occupation": "P106", # Occupation "official_website": "P856", # Official website } # Wikidata item IDs (common entities) ITEMS = { "human": "Q5", # A human being "school_board": "Q7430706", # School board "city": "Q515", # City "county": "Q28575", # County (US) "mayor": "Q30185", # Mayor "city_council": "Q871419", # City council "school_district": "Q1244442", # School district } def __init__(self, cache_dir: str = "data/cache/wikidata"): """Initialize Wikidata query client.""" self.cache_dir = Path(cache_dir) self.cache_dir.mkdir(parents=True, exist_ok=True) async def execute_sparql(self, query: str) -> List[Dict]: """ Execute a SPARQL query against Wikidata. Args: query: SPARQL query string Returns: List of result dicts """ logger.info(f"Executing SPARQL query...") logger.debug(f"Query: {query}") async with httpx.AsyncClient(timeout=60.0) as client: try: response = await client.get( self.SPARQL_ENDPOINT, params={ "query": query, "format": "json" }, headers={ "User-Agent": "CivicEngagementBot/1.0 (Educational Research)", "Accept": "application/sparql-results+json" } ) response.raise_for_status() data = response.json() # Extract results bindings = data.get("results", {}).get("bindings", []) results = [] for binding in bindings: result = {} for key, value in binding.items(): result[key] = value.get("value") results.append(result) logger.info(f"✅ Query returned {len(results)} results") return results except httpx.HTTPStatusError as e: logger.error(f"SPARQL query failed: {e.response.status_code}") logger.error(f"Response: {e.response.text}") raise except Exception as e: logger.error(f"Error executing SPARQL query: {e}") raise async def find_school_board_members( self, state: Optional[str] = None, district: Optional[str] = None ) -> List[Dict]: """ Find school board members. Args: state: State name (e.g., "Alabama") district: School district name (optional) Returns: List of school board member dicts """ # SPARQL query to find school board members query = """ SELECT ?person ?personLabel ?board ?boardLabel ?position ?positionLabel WHERE { # Person holds a position ?person wdt:P39 ?position . # Position is on a school board ?position wdt:P31 wd:Q7430706 . # instance of school board # Board is the organization ?person wdt:P463 ?board . # Filter by state if provided FILTER_STATE SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } } LIMIT 100 """ # Add state filter if provided if state: state_filter = f'FILTER(CONTAINS(LCASE(?boardLabel), "{state.lower()}")).' query = query.replace("FILTER_STATE", state_filter) else: query = query.replace("FILTER_STATE", "") results = await self.execute_sparql(query) # Format results members = [] for result in results: members.append({ "name": result.get("personLabel"), "wikidata_id": result.get("person", "").split("/")[-1], "board": result.get("boardLabel"), "board_id": result.get("board", "").split("/")[-1], "position": result.get("positionLabel"), "source": "wikidata", "fetched_at": datetime.utcnow().isoformat() }) logger.info(f"✅ Found {len(members)} school board members") return members async def find_cities_in_county( self, county: str, state: Optional[str] = None ) -> List[Dict]: """ Find all cities in a county. Args: county: County name (e.g., "Tuscaloosa County") state: State name (e.g., "Alabama") Returns: List of city dicts """ query = f""" SELECT ?city ?cityLabel ?population ?website WHERE {{ # City is an instance of city ?city wdt:P31 wd:Q515 . # Located in the county ?city wdt:P131 ?county . ?county rdfs:label "{county}"@en . # Optional: population OPTIONAL {{ ?city wdt:P1082 ?population . }} # Optional: official website OPTIONAL {{ ?city wdt:P856 ?website . }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} }} """ results = await self.execute_sparql(query) cities = [] for result in results: cities.append({ "name": result.get("cityLabel"), "wikidata_id": result.get("city", "").split("/")[-1], "population": result.get("population"), "website": result.get("website"), "county": county, "state": state, "source": "wikidata", "fetched_at": datetime.utcnow().isoformat() }) logger.info(f"✅ Found {len(cities)} cities in {county}") return cities async def find_person_organizations(self, person_name: str) -> List[Dict]: """ Find all organizations a person is affiliated with. Args: person_name: Person's name (e.g., "Walt Maddox") Returns: List of organization dicts """ query = f""" SELECT ?person ?personLabel ?org ?orgLabel ?position ?positionLabel WHERE {{ # Find person by name ?person rdfs:label "{person_name}"@en . ?person wdt:P31 wd:Q5 . # is a human # Person is member of organization ?person wdt:P463 ?org . # Optional: position held OPTIONAL {{ ?person wdt:P39 ?position . }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} }} """ results = await self.execute_sparql(query) organizations = [] for result in results: organizations.append({ "person_name": result.get("personLabel"), "person_id": result.get("person", "").split("/")[-1], "organization": result.get("orgLabel"), "organization_id": result.get("org", "").split("/")[-1], "position": result.get("positionLabel"), "source": "wikidata", "fetched_at": datetime.utcnow().isoformat() }) logger.info(f"✅ Found {len(organizations)} organizations for {person_name}") return organizations async def find_elected_officials( self, city: Optional[str] = None, state: Optional[str] = None, position_type: Optional[str] = None ) -> List[Dict]: """ Find elected officials. Args: city: City name state: State name position_type: Type of position (e.g., "mayor", "council member") Returns: List of official dicts """ # Build SPARQL query dynamically filters = [] if city: filters.append(f'FILTER(CONTAINS(LCASE(?cityLabel), "{city.lower()}")).') if state: filters.append(f'FILTER(CONTAINS(LCASE(?stateLabel), "{state.lower()}")).') filter_clause = " ".join(filters) if filters else "" query = f""" SELECT ?person ?personLabel ?position ?positionLabel ?location ?locationLabel WHERE {{ # Person holds a position ?person wdt:P39 ?position . ?person wdt:P31 wd:Q5 . # is a human # Position is at a location OPTIONAL {{ ?position wdt:P276 ?location . }} {filter_clause} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} }} LIMIT 100 """ results = await self.execute_sparql(query) officials = [] for result in results: officials.append({ "name": result.get("personLabel"), "wikidata_id": result.get("person", "").split("/")[-1], "position": result.get("positionLabel"), "location": result.get("locationLabel"), "city": city, "state": state, "source": "wikidata", "fetched_at": datetime.utcnow().isoformat() }) logger.info(f"✅ Found {len(officials)} elected officials") return officials def save_to_json(self, data: List[Dict], filename: str): """Save data to JSON cache.""" import json filepath = self.cache_dir / filename with open(filepath, 'w') as f: json.dump(data, f, indent=2) logger.info(f"💾 Saved {len(data)} records to {filepath}") # ============================================================================ # Example Usage # ============================================================================ async def example_usage(): """Example usage of Wikidata integration.""" wikidata = WikidataQuery() # Example 1: Find school board members in Alabama logger.info("\n" + "="*80) logger.info("Example 1: Find school board members in Alabama") logger.info("="*80) try: members = await wikidata.find_school_board_members(state="Alabama") print(f"\n✅ Found {len(members)} school board members in Alabama:") for member in members[:10]: # Show first 10 print(f" • {member['name']} - {member['board']}") if member.get('position'): print(f" Position: {member['position']}") if members: wikidata.save_to_json(members, "alabama_school_board_members.json") except Exception as e: logger.error(f"Error: {e}") # Example 2: Find cities in Tuscaloosa County logger.info("\n" + "="*80) logger.info("Example 2: Find cities in Tuscaloosa County") logger.info("="*80) try: cities = await wikidata.find_cities_in_county("Tuscaloosa County", "Alabama") print(f"\n✅ Found {len(cities)} cities in Tuscaloosa County:") for city in cities[:10]: print(f" • {city['name']}") if city.get('population'): print(f" Population: {city['population']}") if city.get('website'): print(f" Website: {city['website']}") if cities: wikidata.save_to_json(cities, "tuscaloosa_county_cities.json") except Exception as e: logger.error(f"Error: {e}") # Example 3: Find organizations for a person logger.info("\n" + "="*80) logger.info("Example 3: Find organizations for Walt Maddox") logger.info("="*80) try: orgs = await wikidata.find_person_organizations("Walt Maddox") print(f"\n✅ Found {len(orgs)} organizations:") for org in orgs: print(f" • {org['organization']}") if org.get('position'): print(f" Position: {org['position']}") if orgs: wikidata.save_to_json(orgs, "walt_maddox_organizations.json") except Exception as e: logger.error(f"Error: {e}") logger.info("\n✅ Examples complete!") if __name__ == "__main__": asyncio.run(example_usage())