open-navigator / scripts /datasources /dbpedia /dbpedia_integration.py
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
"""
DBpedia Integration for Autocomplete and Structured Data
DBpedia extracts structured "triples" from Wikipedia infoboxes.
Every Wikipedia page becomes a "resource" with structured data.
LOOKUP API: http://lookup.dbpedia.org/api/search
SPARQL ENDPOINT: https://dbpedia.org/sparql
KEY ADVANTAGES:
✅ Completely FREE - no API key required
✅ Perfect for autocomplete/type-ahead - Lookup API is designed for this
✅ Structured data from Wikipedia - millions of resources
✅ Instant access to Mayor, population, school district info
✅ Rich context for search results
USE CASES FOR CIVIC ENGAGEMENT:
- Autocomplete in search box (cities, people, organizations)
- Type-ahead suggestions
- Structured data for entities (mayor, population, etc.)
- Linking Wikipedia pages to structured data
- Enriching search results with context
EXAMPLE QUERIES:
- "Tuscaloosa" → Get Mayor, population, school district
- "School Board" → Find all school boards
- "Alabama cities" → Get all cities in Alabama
- Person name → Get positions, affiliations
API DOCUMENTATION:
- Lookup API: http://lookup.dbpedia.org/api/doc/
- SPARQL: https://dbpedia.org/sparql
- Examples: https://wiki.dbpedia.org/develop/datasets
USAGE:
from discovery.dbpedia_integration import DBpediaLookup
dbpedia = DBpediaLookup()
# Autocomplete search
results = await dbpedia.search("Tuscaloosa", max_results=10)
# Get detailed info about a resource
info = await dbpedia.get_resource_info("Tuscaloosa,_Alabama")
# Search for specific types (cities, people, organizations)
cities = await dbpedia.search_by_type("Alabama", type_filter="Place")
"""
import asyncio
from typing import List, Dict, Optional
from datetime import datetime
from pathlib import Path
import httpx
from loguru import logger
try:
from pyspark.sql import SparkSession
from config.settings import settings
SPARK_AVAILABLE = True
except ImportError:
SPARK_AVAILABLE = False
settings = None
class DBpediaLookup:
    """
    Query DBpedia for autocomplete and structured data.

    Two remote services are used:

    * the Lookup API (``LOOKUP_API``) for keyword / type-ahead search, and
    * the public SPARQL endpoint (``SPARQL_ENDPOINT``) for fetching the
      property/value pairs of a single resource.

    Both are called without an API key. Results can optionally be written to
    a JSON cache directory with :meth:`save_to_json`.
    """

    # NOTE(review): plain-HTTP URL; confirm whether the HTTPS form should be
    # used instead (httpx does not follow redirects unless asked to).
    LOOKUP_API = "http://lookup.dbpedia.org/api/search"
    SPARQL_ENDPOINT = "https://dbpedia.org/sparql"

    # Sent with every request so both services see the same client identity.
    USER_AGENT = "CivicEngagementBot/1.0 (Educational Research)"

    # Common DBpedia ontology classes, keyed by a friendly lowercase alias.
    # ``search`` accepts either the alias or the ontology class name itself.
    CLASSES = {
        "place": "Place",
        "city": "City",
        "person": "Person",
        "organization": "Organisation",
        "government": "GovernmentAgency",
        "school": "School",
        "politician": "Politician",
    }

    # Characters the SPARQL grammar does not allow inside an <IRI> reference
    # (control characters and space are rejected separately).
    _IRI_FORBIDDEN = frozenset('<>"{}|^`\\')

    def __init__(self, cache_dir: str = "data/cache/dbpedia"):
        """
        Initialize the DBpedia lookup client.

        Args:
            cache_dir: Directory used by ``save_to_json``; created
                (including parents) if it does not exist yet.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Internal helpers (no network access)
    # ------------------------------------------------------------------
    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        Produces the same format as ``datetime.utcnow().isoformat()`` without
        calling the deprecated ``utcnow``.
        """
        from datetime import timezone

        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    @staticmethod
    def _first(value):
        """Unwrap a list-wrapped scalar: first element, or None if empty."""
        if isinstance(value, list):
            return value[0] if value else None
        return value

    @classmethod
    def _lookup_params(
        cls,
        query: str,
        max_results: int,
        type_filter: Optional[str] = None
    ) -> Dict:
        """Build the query-string parameters for a Lookup API request.

        A ``type_filter`` that matches a key of ``CLASSES`` (e.g.
        ``"organization"``) is translated to its ontology class name; any
        other value is passed through unchanged.
        """
        params = {
            "query": query,
            "maxResults": max_results,
            "format": "json",
        }
        if type_filter:
            params["type"] = cls.CLASSES.get(type_filter, type_filter)
        return params

    @classmethod
    def _normalize_result(cls, item: Dict, fetched_at: Optional[str] = None) -> Dict:
        """Convert one Lookup API entry into this client's result dict.

        Accepts the flat shape this client was written against
        (``label``/``uri``/``description``/``classes``/``categories``) as well
        as the list-wrapped shape documented for the current Lookup service
        (``label``/``resource``/``comment``/``type``/``category``, each a
        list). ``refCount`` is coerced to ``int`` and falls back to 0 when it
        is missing or not numeric.
        """
        raw_count = cls._first(item.get("refCount", 0))
        try:
            ref_count = int(raw_count)
        except (TypeError, ValueError):
            ref_count = 0

        return {
            "label": cls._first(item.get("label")),
            "uri": cls._first(item.get("uri") or item.get("resource")),
            "description": cls._first(
                item.get("description") or item.get("comment")
            ),
            "classes": item.get("classes") or item.get("type") or [],
            "categories": item.get("categories") or item.get("category") or [],
            "refCount": ref_count,  # How many Wikipedia pages link to this
            "source": "dbpedia",
            "fetched_at": fetched_at or cls._utc_timestamp(),
        }

    @classmethod
    def _resource_iri(cls, resource: str) -> str:
        """Return a full resource IRI that is safe to embed in SPARQL.

        A value starting with ``http://`` or ``https://`` is used as-is; any
        other value is treated as a resource name, with surrounding
        whitespace removed and inner spaces turned into underscores.

        Raises:
            ValueError: If the name is blank or the resulting IRI contains a
                character that is not allowed between ``<`` and ``>`` in a
                SPARQL query (which would otherwise break or alter the query).
        """
        if not resource or not resource.strip():
            raise ValueError("DBpedia resource must not be empty")

        if resource.startswith(("http://", "https://")):
            iri = resource
        else:
            name = resource.strip().replace(" ", "_")
            iri = f"http://dbpedia.org/resource/{name}"

        if any(ch in cls._IRI_FORBIDDEN or ch <= " " for ch in iri):
            raise ValueError(f"Invalid characters in DBpedia resource: {resource!r}")
        return iri

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def search(
        self,
        query: str,
        max_results: int = 10,
        type_filter: Optional[str] = None
    ) -> List[Dict]:
        """
        Search DBpedia (autocomplete/type-ahead).

        Args:
            query: Search query (e.g., "Tuscaloosa", "School Board")
            max_results: Maximum number of results
            type_filter: Filter by type (e.g., "Place", "Person",
                "Organisation"), or one of the aliases in ``CLASSES``

        Returns:
            List of result dicts with keys ``label``, ``uri``,
            ``description``, ``classes``, ``categories``, ``refCount``,
            ``source`` and ``fetched_at``. A blank query returns an empty
            list without contacting the service.

        Raises:
            Exception: Any error from the HTTP request or response parsing is
                logged and re-raised unchanged.
        """
        # Type-ahead boxes routinely fire with nothing typed yet; skip the
        # network round trip in that case.
        if not query or not query.strip():
            return []

        logger.info(f"Searching DBpedia for: {query}")
        params = self._lookup_params(query, max_results, type_filter)

        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                response = await client.get(
                    self.LOOKUP_API,
                    params=params,
                    headers={
                        "User-Agent": self.USER_AGENT,
                        "Accept": "application/json"
                    }
                )
                response.raise_for_status()
                data = response.json()

                # Entries are looked up under "results" first, then "docs",
                # so either payload shape yields hits.
                entries = data.get("results") or data.get("docs") or []
                stamp = self._utc_timestamp()
                results = [
                    self._normalize_result(item, fetched_at=stamp)
                    for item in entries
                ]

                logger.info(f"✅ Found {len(results)} results for '{query}'")
                return results
            except Exception as e:
                logger.error(f"Error searching DBpedia: {e}")
                raise

    async def search_by_type(
        self,
        query: str,
        type_filter: str,
        max_results: int = 20
    ) -> List[Dict]:
        """
        Search for specific entity types.

        Args:
            query: Search query
            type_filter: Entity type ("Place", "Person", "Organisation", etc.)
            max_results: Maximum results

        Returns:
            Results of ``search`` restricted to that type
        """
        logger.info(f"Searching for {type_filter}: {query}")
        return await self.search(
            query=query,
            max_results=max_results,
            type_filter=type_filter
        )

    async def get_resource_info(self, resource: str) -> Dict:
        """
        Get detailed information about a DBpedia resource.

        Args:
            resource: Resource name (e.g., "Tuscaloosa,_Alabama") or a full
                resource URL

        Returns:
            Dict with ``resource`` (the full URL), ``properties`` (property
            name -> list of values, at most 100 bindings in total),
            ``source`` and ``fetched_at``

        Raises:
            ValueError: If ``resource`` is blank or contains characters that
                cannot appear in a SPARQL IRI.
            Exception: Any error from the HTTP request or response parsing is
                logged and re-raised unchanged.
        """
        # Validate before building the query: the IRI is interpolated into
        # the SPARQL text, so it must not be able to close the <...> early.
        resource_url = self._resource_iri(resource)
        logger.info(f"Fetching resource info: {resource_url}")

        # Query SPARQL endpoint for all properties
        query = (
            "SELECT ?property ?value\n"
            "WHERE {\n"
            f"  <{resource_url}> ?property ?value .\n"
            "}\n"
            "LIMIT 100"
        )

        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                response = await client.get(
                    self.SPARQL_ENDPOINT,
                    params={
                        "query": query,
                        "format": "json"
                    },
                    headers={
                        "User-Agent": self.USER_AGENT,
                        "Accept": "application/sparql-results+json"
                    }
                )
                response.raise_for_status()
                data = response.json()

                # Parse results into structured dict
                info = {
                    "resource": resource_url,
                    "properties": {},
                    "source": "dbpedia",
                    "fetched_at": self._utc_timestamp()
                }
                for binding in data.get("results", {}).get("bindings", []):
                    prop = binding.get("property", {}).get("value", "")
                    value = binding.get("value", {}).get("value", "")
                    # Keep only the last path segment / fragment of the
                    # property URI as the key.
                    prop_name = prop.split("/")[-1].split("#")[-1]
                    info["properties"].setdefault(prop_name, []).append(value)

                logger.info(f"✅ Found {len(info['properties'])} properties for {resource}")
                return info
            except Exception as e:
                logger.error(f"Error fetching resource info: {e}")
                raise

    async def find_cities(self, state: Optional[str] = None) -> List[Dict]:
        """
        Find cities (with optional state filter).

        Args:
            state: State name to filter by

        Returns:
            List of city dicts (up to 50)
        """
        if state:
            query = f"cities in {state}"
        else:
            query = "city"
        return await self.search_by_type(
            query=query,
            type_filter="City",
            max_results=50
        )

    async def find_people(self, name_query: str) -> List[Dict]:
        """
        Find people by name.

        Args:
            name_query: Name or partial name

        Returns:
            List of person dicts (up to 20)
        """
        return await self.search_by_type(
            query=name_query,
            type_filter="Person",
            max_results=20
        )

    async def find_organizations(self, org_query: str) -> List[Dict]:
        """
        Find organizations.

        Args:
            org_query: Organization name or keyword

        Returns:
            List of organization dicts (up to 20)
        """
        return await self.search_by_type(
            query=org_query,
            type_filter="Organisation",
            max_results=20
        )

    def save_to_json(self, data, filename: str):
        """
        Save data to the JSON cache.

        Args:
            data: JSON-serializable object to write
            filename: File name relative to the cache directory
        """
        import json

        filepath = self.cache_dir / filename
        # Explicit encoding so the file does not depend on the platform
        # default.
        with open(filepath, 'w', encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        logger.info(f"💾 Saved data to {filepath}")
# ============================================================================
# Example Usage
# ============================================================================
async def example_usage():
    """Run four demo queries against DBpedia, printing and caching each result.

    Every demo is wrapped in its own ``try`` so that a failure in one is
    logged and the remaining demos still run.
    """
    client = DBpediaLookup()
    rule = "=" * 80

    def announce(title):
        # Three-line banner that precedes each example in the log.
        logger.info("\n" + rule)
        logger.info(title)
        logger.info(rule)

    def list_entries(entries):
        # Print label (and description, when present) for the first 10 hits.
        for entry in entries[:10]:
            print(f" • {entry['label']}")
            if entry.get('description'):
                print(f" {entry['description']}")

    # Example 1: Autocomplete search for "Tuscaloosa"
    announce("Example 1: Autocomplete search for 'Tuscaloosa'")
    try:
        hits = await client.search("Tuscaloosa", max_results=10)
        print(f"\n✅ Found {len(hits)} results:")
        for hit in hits:
            print(f"\n • {hit['label']}")
            if hit.get('description'):
                print(f" {hit['description']}")
            print(f" URI: {hit['uri']}")
            print(f" Reference count: {hit['refCount']}")
        if hits:
            client.save_to_json(hits, "tuscaloosa_search.json")
    except Exception as e:
        logger.error(f"Error: {e}")

    # Example 2: Get detailed info about Tuscaloosa, Alabama
    announce("Example 2: Get detailed info about Tuscaloosa, Alabama")
    try:
        details = await client.get_resource_info("Tuscaloosa,_Alabama")
        props = details['properties']
        print(f"\n✅ Found {len(props)} properties:")
        # Only a handful of civic-relevant properties are shown.
        for key in (
            "mayor", "population", "areaCode", "postalCode",
            "website", "leaderTitle", "foundingDate",
        ):
            if key in props:
                print(f" • {key}: {props[key]}")
        client.save_to_json(details, "tuscaloosa_info.json")
    except Exception as e:
        logger.error(f"Error: {e}")

    # Example 3: Search for cities in Alabama
    announce("Example 3: Search for cities in Alabama")
    try:
        cities = await client.find_cities(state="Alabama")
        print(f"\n✅ Found {len(cities)} cities in Alabama:")
        list_entries(cities)
        if cities:
            client.save_to_json(cities, "alabama_cities.json")
    except Exception as e:
        logger.error(f"Error: {e}")

    # Example 4: Search for people (politicians)
    announce("Example 4: Search for Alabama politicians")
    try:
        people = await client.find_people("Alabama mayor")
        print(f"\n✅ Found {len(people)} people:")
        list_entries(people)
        if people:
            client.save_to_json(people, "alabama_politicians.json")
    except Exception as e:
        logger.error(f"Error: {e}")

    logger.info("\n✅ Examples complete!")
    logger.info("\n" + rule)
    logger.info("DBpedia Lookup API is perfect for autocomplete!")
    logger.info("Use it in your search box for instant suggestions.")
    logger.info(rule)
if __name__ == "__main__":
    # Script entry point: run the async demo to completion on a new event loop.
    demo = example_usage()
    asyncio.run(demo)