""" DBpedia Integration for Autocomplete and Structured Data DBpedia extracts structured "triples" from Wikipedia infoboxes. Every Wikipedia page becomes a "resource" with structured data. LOOKUP API: http://lookup.dbpedia.org/api/search REST API: https://dbpedia.org/sparql KEY ADVANTAGES: ✅ Completely FREE - no API key required ✅ Perfect for autocomplete/type-ahead - Lookup API is designed for this ✅ Structured data from Wikipedia - millions of resources ✅ Instant access to Mayor, population, school district info ✅ Rich context for search results USE CASES FOR CIVIC ENGAGEMENT: - Autocomplete in search box (cities, people, organizations) - Type-ahead suggestions - Structured data for entities (mayor, population, etc.) - Linking Wikipedia pages to structured data - Enriching search results with context EXAMPLE QUERIES: - "Tuscaloosa" → Get Mayor, population, school district - "School Board" → Find all school boards - "Alabama cities" → Get all cities in Alabama - Person name → Get positions, affiliations API DOCUMENTATION: - Lookup API: http://lookup.dbpedia.org/api/doc/ - SPARQL: https://dbpedia.org/sparql - Examples: https://wiki.dbpedia.org/develop/datasets USAGE: from discovery.dbpedia_integration import DBpediaLookup dbpedia = DBpediaLookup() # Autocomplete search results = await dbpedia.search("Tuscaloosa", max_results=10) # Get detailed info about a resource info = await dbpedia.get_resource_info("Tuscaloosa,_Alabama") # Search for specific types (cities, people, organizations) cities = await dbpedia.search_by_type("Alabama", type_filter="Place") """ import asyncio from typing import List, Dict, Optional from datetime import datetime from pathlib import Path import httpx from loguru import logger try: from pyspark.sql import SparkSession from config.settings import settings SPARK_AVAILABLE = True except ImportError: SPARK_AVAILABLE = False settings = None class DBpediaLookup: """ Query DBpedia for autocomplete and structured data. DBpedia is completely FREE and perfect for type-ahead search boxes. """ LOOKUP_API = "http://lookup.dbpedia.org/api/search" SPARQL_ENDPOINT = "https://dbpedia.org/sparql" # Common DBpedia ontology classes CLASSES = { "place": "Place", "city": "City", "person": "Person", "organization": "Organisation", "government": "GovernmentAgency", "school": "School", "politician": "Politician", } def __init__(self, cache_dir: str = "data/cache/dbpedia"): """Initialize DBpedia lookup client.""" self.cache_dir = Path(cache_dir) self.cache_dir.mkdir(parents=True, exist_ok=True) async def search( self, query: str, max_results: int = 10, type_filter: Optional[str] = None ) -> List[Dict]: """ Search DBpedia (autocomplete/type-ahead). Args: query: Search query (e.g., "Tuscaloosa", "School Board") max_results: Maximum number of results type_filter: Filter by type (e.g., "Place", "Person", "Organisation") Returns: List of result dicts with URI, label, description, etc. """ logger.info(f"Searching DBpedia for: {query}") params = { "query": query, "maxResults": max_results, "format": "json" } if type_filter: params["type"] = type_filter async with httpx.AsyncClient(timeout=30.0) as client: try: response = await client.get( self.LOOKUP_API, params=params, headers={ "User-Agent": "CivicEngagementBot/1.0 (Educational Research)", "Accept": "application/json" } ) response.raise_for_status() data = response.json() # Extract results results = [] for item in data.get("results", []): results.append({ "label": item.get("label"), "uri": item.get("uri"), "description": item.get("description"), "classes": item.get("classes", []), "categories": item.get("categories", []), "refCount": item.get("refCount", 0), # How many Wikipedia pages link to this "source": "dbpedia", "fetched_at": datetime.utcnow().isoformat() }) logger.info(f"✅ Found {len(results)} results for '{query}'") return results except Exception as e: logger.error(f"Error searching DBpedia: {e}") raise async def search_by_type( self, query: str, type_filter: str, max_results: int = 20 ) -> List[Dict]: """ Search for specific entity types. Args: query: Search query type_filter: Entity type ("Place", "Person", "Organisation", etc.) max_results: Maximum results Returns: Filtered results of that type """ logger.info(f"Searching for {type_filter}: {query}") return await self.search( query=query, max_results=max_results, type_filter=type_filter ) async def get_resource_info(self, resource: str) -> Dict: """ Get detailed information about a DBpedia resource. Args: resource: Resource name (e.g., "Tuscaloosa,_Alabama") Returns: Dict with resource information """ # DBpedia resource URL if not resource.startswith("http"): resource_url = f"http://dbpedia.org/resource/{resource}" else: resource_url = resource logger.info(f"Fetching resource info: {resource_url}") # Query SPARQL endpoint for all properties query = f""" SELECT ?property ?value WHERE {{ <{resource_url}> ?property ?value . }} LIMIT 100 """ async with httpx.AsyncClient(timeout=30.0) as client: try: response = await client.get( self.SPARQL_ENDPOINT, params={ "query": query, "format": "json" }, headers={ "User-Agent": "CivicEngagementBot/1.0", "Accept": "application/sparql-results+json" } ) response.raise_for_status() data = response.json() # Parse results into structured dict info = { "resource": resource_url, "properties": {}, "source": "dbpedia", "fetched_at": datetime.utcnow().isoformat() } for binding in data.get("results", {}).get("bindings", []): prop = binding.get("property", {}).get("value", "") value = binding.get("value", {}).get("value", "") # Extract property name from URI prop_name = prop.split("/")[-1].split("#")[-1] # Store property if prop_name not in info["properties"]: info["properties"][prop_name] = [] info["properties"][prop_name].append(value) logger.info(f"✅ Found {len(info['properties'])} properties for {resource}") return info except Exception as e: logger.error(f"Error fetching resource info: {e}") raise async def find_cities(self, state: Optional[str] = None) -> List[Dict]: """ Find cities (with optional state filter). Args: state: State name to filter by Returns: List of city dicts """ if state: query = f"cities in {state}" else: query = "city" return await self.search_by_type( query=query, type_filter="City", max_results=50 ) async def find_people(self, name_query: str) -> List[Dict]: """ Find people by name. Args: name_query: Name or partial name Returns: List of person dicts """ return await self.search_by_type( query=name_query, type_filter="Person", max_results=20 ) async def find_organizations(self, org_query: str) -> List[Dict]: """ Find organizations. Args: org_query: Organization name or keyword Returns: List of organization dicts """ return await self.search_by_type( query=org_query, type_filter="Organisation", max_results=20 ) def save_to_json(self, data, filename: str): """Save data to JSON cache.""" import json filepath = self.cache_dir / filename with open(filepath, 'w') as f: json.dump(data, f, indent=2) logger.info(f"💾 Saved data to {filepath}") # ============================================================================ # Example Usage # ============================================================================ async def example_usage(): """Example usage of DBpedia integration.""" dbpedia = DBpediaLookup() # Example 1: Autocomplete search for "Tuscaloosa" logger.info("\n" + "="*80) logger.info("Example 1: Autocomplete search for 'Tuscaloosa'") logger.info("="*80) try: results = await dbpedia.search("Tuscaloosa", max_results=10) print(f"\n✅ Found {len(results)} results:") for result in results: print(f"\n • {result['label']}") if result.get('description'): print(f" {result['description']}") print(f" URI: {result['uri']}") print(f" Reference count: {result['refCount']}") if results: dbpedia.save_to_json(results, "tuscaloosa_search.json") except Exception as e: logger.error(f"Error: {e}") # Example 2: Get detailed info about Tuscaloosa, Alabama logger.info("\n" + "="*80) logger.info("Example 2: Get detailed info about Tuscaloosa, Alabama") logger.info("="*80) try: info = await dbpedia.get_resource_info("Tuscaloosa,_Alabama") print(f"\n✅ Found {len(info['properties'])} properties:") # Show interesting properties interesting = [ "mayor", "population", "areaCode", "postalCode", "website", "leaderTitle", "foundingDate" ] for prop in interesting: if prop in info['properties']: print(f" • {prop}: {info['properties'][prop]}") dbpedia.save_to_json(info, "tuscaloosa_info.json") except Exception as e: logger.error(f"Error: {e}") # Example 3: Search for cities in Alabama logger.info("\n" + "="*80) logger.info("Example 3: Search for cities in Alabama") logger.info("="*80) try: cities = await dbpedia.find_cities(state="Alabama") print(f"\n✅ Found {len(cities)} cities in Alabama:") for city in cities[:10]: # Show first 10 print(f" • {city['label']}") if city.get('description'): print(f" {city['description']}") if cities: dbpedia.save_to_json(cities, "alabama_cities.json") except Exception as e: logger.error(f"Error: {e}") # Example 4: Search for people (politicians) logger.info("\n" + "="*80) logger.info("Example 4: Search for Alabama politicians") logger.info("="*80) try: people = await dbpedia.find_people("Alabama mayor") print(f"\n✅ Found {len(people)} people:") for person in people[:10]: print(f" • {person['label']}") if person.get('description'): print(f" {person['description']}") if people: dbpedia.save_to_json(people, "alabama_politicians.json") except Exception as e: logger.error(f"Error: {e}") logger.info("\n✅ Examples complete!") logger.info("\n" + "="*80) logger.info("DBpedia Lookup API is perfect for autocomplete!") logger.info("Use it in your search box for instant suggestions.") logger.info("="*80) if __name__ == "__main__": asyncio.run(example_usage())