# operations/scripts/discovery/discover_omirl_direct.py
# Author: jbbove
# Commit 5e42519: hard-coded the 12 sensor types that were wrong in the table
# scraper, and fixed tests that were getting stuck because there was no
# browser-session cleanup.
#!/usr/bin/env python3
"""
OMIRL Direct Navigation Discovery
Based on our findings, let's try direct URL navigation to discover
the Valori Stazioni data structure and filtering options.
"""
import asyncio
import time
from playwright.async_api import async_playwright
from pathlib import Path
import json
# Create output directory for discoveries
# NOTE: this runs at import time as a side effect — the directory (and any
# missing parents) exists before any discovery step executes.
DISCOVERY_OUTPUT = Path("data/examples/omirl_discovery")
DISCOVERY_OUTPUT.mkdir(parents=True, exist_ok=True)
class OMIRLDirectDiscovery:
    """Browser-driven probe of OMIRL's single-page-app routes.

    Navigates a list of candidate hash URLs on the OMIRL site, records
    which ones expose data tables, filter controls, and weather-related
    text, and saves JSON reports plus screenshots under DISCOVERY_OUTPUT.
    """

    def __init__(self):
        # All Playwright handles are populated lazily by setup_browser().
        self.playwright = None  # driver handle; must be stopped in cleanup()
        self.browser = None
        self.context = None
        self.page = None
        self.base_url = "https://omirl.regione.liguria.it"

    async def setup_browser(self):
        """Initialize browser with discovery-friendly settings"""
        # FIX: keep the Playwright driver on self so cleanup() can stop it.
        # Previously the handle was a local variable, so the driver process
        # was never stopped and runs could hang waiting on the orphaned
        # session (the "tests getting stuck" bug).
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=False,  # Visible for observation
            slow_mo=500,  # Slow interactions
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            locale="it-IT",
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
        )
        self.page = await self.context.new_page()
        # Echo the page's console messages to stdout for SPA debugging.
        self.page.on("console", lambda msg: print(f"Console: {msg.text}"))

    async def cleanup(self):
        """Release every browser resource; safe to call even if setup failed."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            # Stop the driver so the background playwright process exits;
            # without this, runs could hang on shutdown.
            await self.playwright.stop()

    async def take_screenshot(self, name):
        """Capture a full-page screenshot into DISCOVERY_OUTPUT.

        Returns the screenshot path as a string (JSON-serializable).
        """
        screenshot_path = DISCOVERY_OUTPUT / f"{name}.png"
        await self.page.screenshot(path=screenshot_path, full_page=True)
        print(f"πŸ“Έ Screenshot: {screenshot_path}")
        return str(screenshot_path)

    async def save_discovery(self, step_name, data):
        """Dump *data* as pretty-printed UTF-8 JSON named after *step_name*."""
        output_file = DISCOVERY_OUTPUT / f"{step_name}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Italian accented terms readable.
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"βœ… Saved: {output_file}")

    async def explore_direct_navigation(self):
        """Try direct navigation to different OMIRL sections"""
        print("\n🎯 Exploring direct navigation to OMIRL sections...")
        # URLs to try - starting with the correct one we know
        urls_to_try = [
            ("sensorstable", "/#/sensorstable"),  # THE CORRECT ONE!
            ("summarytable", "/#/summarytable"),
            ("valori_stazioni", "/#/summarytable/valori_stazioni"),
            ("valori", "/#/summarytable/valori"),
            ("stazioni", "/#/summarytable/stazioni"),
            ("tabelle_valori", "/#/tabelle/valori"),
            ("tabelle_stazioni", "/#/tabelle/stazioni"),
            ("data_valori", "/#/data/valori_stazioni"),
            ("dati", "/#/dati"),
            ("dati_tabelle", "/#/dati/tabelle")
        ]
        results = {}
        for name, path in urls_to_try:
            print(f"\nπŸ” Trying: {name} -> {self.base_url + path}")
            try:
                # Navigate to URL
                await self.page.goto(self.base_url + path, wait_until="networkidle")
                await self.page.wait_for_timeout(3000)  # Wait for content load
                # Check page content
                title = await self.page.title()
                url = self.page.url
                # Look for tables
                tables = await self.page.query_selector_all("table")
                table_count = len(tables)
                # Look for form elements (filters)
                selects = await self.page.query_selector_all("select")
                inputs = await self.page.query_selector_all("input")
                # Look for station/weather related content
                content_indicators = await self.check_content_indicators()
                results[name] = {
                    "url": url,
                    "title": title,
                    "table_count": table_count,
                    "select_count": len(selects),
                    "input_count": len(inputs),
                    "content_indicators": content_indicators,
                    "success": True
                }
                # Take screenshot if it looks promising
                if table_count > 0 or len(selects) > 2:
                    screenshot = await self.take_screenshot(f"direct_{name}")
                    results[name]["screenshot"] = screenshot
                print(f"βœ… {name}: tables={table_count}, selects={len(selects)}, inputs={len(inputs)}")
            except Exception as e:
                # Record the failure so the JSON report shows which routes broke.
                results[name] = {
                    "error": str(e),
                    "success": False
                }
                print(f"❌ {name}: {e}")
        await self.save_discovery("direct_navigation_results", results)
        return results

    async def check_content_indicators(self):
        """Check for weather/station related content on current page"""
        indicators = {
            "weather_terms": 0,
            "station_terms": 0,
            "data_terms": 0,
            "found_terms": []
        }
        # Terms to look for (Italian: rainfall, temperature, wind, humidity,
        # weather / station, sensor, measurement / values, data, observations)
        weather_terms = ["precipitazione", "temperatura", "vento", "umiditΓ ", "meteo"]
        station_terms = ["stazione", "stazioni", "sensore", "rilevamento"]
        data_terms = ["valori", "dati", "misure", "osservazioni"]
        page_text = await self.page.text_content("body")
        # text_content() may return None for an empty/missing body.
        page_text_lower = page_text.lower() if page_text else ""
        for term in weather_terms:
            if term in page_text_lower:
                indicators["weather_terms"] += 1
                indicators["found_terms"].append(f"weather: {term}")
        for term in station_terms:
            if term in page_text_lower:
                indicators["station_terms"] += 1
                indicators["found_terms"].append(f"station: {term}")
        for term in data_terms:
            if term in page_text_lower:
                indicators["data_terms"] += 1
                indicators["found_terms"].append(f"data: {term}")
        return indicators

    async def explore_table_structure(self, promising_urls):
        """Analyze table structure from promising URLs"""
        print("\nπŸ“Š Analyzing table structures...")
        table_analyses = {}
        for url_name, url_data in promising_urls.items():
            # Skip failed navigations and pages without tables.
            if not url_data.get("success") or url_data.get("table_count", 0) == 0:
                continue
            print(f"\nπŸ” Analyzing tables in: {url_name}")
            try:
                # Navigate to the URL
                await self.page.goto(url_data["url"], wait_until="networkidle")
                await self.page.wait_for_timeout(2000)
                # Find all tables
                tables = await self.page.query_selector_all("table")
                table_info = []
                for i, table in enumerate(tables):
                    # Get table headers
                    headers = await table.query_selector_all("th")
                    header_texts = []
                    for header in headers:
                        text = await header.text_content()
                        if text and text.strip():
                            header_texts.append(text.strip())
                    # Get table rows count
                    rows = await table.query_selector_all("tr")
                    row_count = len(rows)
                    # Get table classes/id for identification
                    table_class = await table.get_attribute("class")
                    table_id = await table.get_attribute("id")
                    table_info.append({
                        "table_index": i,
                        "headers": header_texts,
                        "row_count": row_count,
                        "table_class": table_class,
                        "table_id": table_id,
                        "has_data": row_count > 1 and len(header_texts) > 0
                    })
                    print(f" πŸ“‹ Table {i}: {len(header_texts)} headers, {row_count} rows")
                    if header_texts:
                        print(f" Headers: {', '.join(header_texts[:5])}...")
                table_analyses[url_name] = table_info
            except Exception as e:
                print(f"❌ Error analyzing {url_name}: {e}")
        await self.save_discovery("table_structure_analysis", table_analyses)
        return table_analyses

    async def explore_filter_controls(self, promising_urls):
        """Analyze filter controls from promising URLs"""
        print("\nπŸŽ›οΈ Analyzing filter controls...")
        filter_analyses = {}
        for url_name, url_data in promising_urls.items():
            # Only inspect pages that loaded and actually have <select> widgets.
            if not url_data.get("success") or url_data.get("select_count", 0) == 0:
                continue
            print(f"\nπŸ” Analyzing filters in: {url_name}")
            try:
                await self.page.goto(url_data["url"], wait_until="networkidle")
                await self.page.wait_for_timeout(2000)
                # Analyze select elements
                selects = await self.page.query_selector_all("select")
                select_info = []
                for i, select in enumerate(selects):
                    # Get select attributes
                    select_id = await select.get_attribute("id")
                    select_name = await select.get_attribute("name")
                    select_class = await select.get_attribute("class")
                    # Get options
                    options = await select.query_selector_all("option")
                    option_values = []
                    for option in options:
                        value = await option.get_attribute("value")
                        text = await option.text_content()
                        if text and text.strip():
                            option_values.append({
                                "value": value,
                                "text": text.strip()
                            })
                    select_info.append({
                        "select_index": i,
                        "id": select_id,
                        "name": select_name,
                        "class": select_class,
                        "option_count": len(option_values),
                        "options": option_values[:10]  # First 10 options
                    })
                    print(f" πŸ”½ Select {i} ({select_id}): {len(option_values)} options")
                filter_analyses[url_name] = select_info
            except Exception as e:
                print(f"❌ Error analyzing filters in {url_name}: {e}")
        await self.save_discovery("filter_control_analysis", filter_analyses)
        return filter_analyses
async def run_direct_discovery():
    """Top-level driver: set up a browser, probe OMIRL routes, report, clean up."""
    session = OMIRLDirectDiscovery()
    try:
        await session.setup_browser()
        print("πŸš€ Starting OMIRL Direct Navigation Discovery")
        print("=" * 60)

        # Step 1: probe each candidate route and record what it exposes.
        nav_results = await session.explore_direct_navigation()

        # Step 2: keep only routes that loaded successfully and showed tables.
        with_tables = {
            name: info
            for name, info in nav_results.items()
            if info.get("success") and info.get("table_count", 0) > 0
        }

        if not with_tables:
            print("\n❌ No URLs with tables found")
        else:
            print(f"\nβœ… Found {len(with_tables)} promising URLs with tables")
            # Deep-dive the table layouts, then the filter widgets
            # (explore_filter_controls does its own success/select filtering).
            await session.explore_table_structure(with_tables)
            await session.explore_filter_controls(nav_results)
            print("\n" + "=" * 60)
            print("βœ… Direct Discovery completed!")
            print(f"πŸ“ Results saved in: {DISCOVERY_OUTPUT}")
            # Summary
            print("\nSummary:")
            for name, info in with_tables.items():
                print(f" πŸ“‹ {name}: {info.get('table_count', 0)} tables, {info.get('select_count', 0)} selects")
    except Exception as e:
        print(f"❌ Discovery failed: {e}")
    finally:
        # Always tear down the browser, even on failure.
        await session.cleanup()
# Script entry point: run the whole async discovery flow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(run_direct_discovery())