# operations/scripts/discovery/discover_omirl_direct.py
# Author: jbbove
# Commit 5e42519: hard-coded the 12 sensor types that were wrong in the table
# scraper, and fixed tests that were getting stuck because there was no
# browser-session cleanup.
#!/usr/bin/env python3
"""
OMIRL Direct Navigation Discovery
Based on our findings, let's try direct URL navigation to discover
the Valori Stazioni data structure and filtering options.
"""
import asyncio
import time
from playwright.async_api import async_playwright
from pathlib import Path
import json
# Create output directory for discoveries
# NOTE: this runs at import time as a side effect — the directory (and any
# missing parents) exists before any discovery step executes.
DISCOVERY_OUTPUT = Path("data/examples/omirl_discovery")
DISCOVERY_OUTPUT.mkdir(parents=True, exist_ok=True)
class OMIRLDirectDiscovery:
    """Browser-driven probe of OMIRL's single-page-app routes.

    Navigates a list of candidate hash URLs on the OMIRL site, records
    which ones expose data tables, filter controls, and weather-related
    text, and saves JSON reports plus screenshots under DISCOVERY_OUTPUT.
    """

    def __init__(self):
        # All Playwright handles are populated lazily by setup_browser().
        self.playwright = None  # driver handle; must be stopped in cleanup()
        self.browser = None
        self.context = None
        self.page = None
        self.base_url = "https://omirl.regione.liguria.it"

    async def setup_browser(self):
        """Initialize browser with discovery-friendly settings"""
        # FIX: keep the Playwright driver on self so cleanup() can stop it.
        # Previously the handle was a local variable, so the driver process
        # was never stopped and runs could hang waiting on the orphaned
        # session (the "tests getting stuck" bug).
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=False,  # Visible for observation
            slow_mo=500,  # Slow interactions
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            locale="it-IT",
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
        )
        self.page = await self.context.new_page()
        # Echo the page's console messages to stdout for SPA debugging.
        self.page.on("console", lambda msg: print(f"Console: {msg.text}"))

    async def cleanup(self):
        """Release every browser resource; safe to call even if setup failed."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            # Stop the driver so the background playwright process exits;
            # without this, runs could hang on shutdown.
            await self.playwright.stop()

    async def take_screenshot(self, name):
        """Capture a full-page screenshot into DISCOVERY_OUTPUT.

        Returns the screenshot path as a string (JSON-serializable).
        """
        screenshot_path = DISCOVERY_OUTPUT / f"{name}.png"
        await self.page.screenshot(path=screenshot_path, full_page=True)
        print(f"πŸ“Έ Screenshot: {screenshot_path}")
        return str(screenshot_path)

    async def save_discovery(self, step_name, data):
        """Dump *data* as pretty-printed UTF-8 JSON named after *step_name*."""
        output_file = DISCOVERY_OUTPUT / f"{step_name}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Italian accented terms readable.
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"βœ… Saved: {output_file}")

    async def explore_direct_navigation(self):
        """Try direct navigation to different OMIRL sections"""
        print("\n🎯 Exploring direct navigation to OMIRL sections...")
        # URLs to try - starting with the correct one we know
        urls_to_try = [
            ("sensorstable", "/#/sensorstable"),  # THE CORRECT ONE!
            ("summarytable", "/#/summarytable"),
            ("valori_stazioni", "/#/summarytable/valori_stazioni"),
            ("valori", "/#/summarytable/valori"),
            ("stazioni", "/#/summarytable/stazioni"),
            ("tabelle_valori", "/#/tabelle/valori"),
            ("tabelle_stazioni", "/#/tabelle/stazioni"),
            ("data_valori", "/#/data/valori_stazioni"),
            ("dati", "/#/dati"),
            ("dati_tabelle", "/#/dati/tabelle")
        ]
        results = {}
        for name, path in urls_to_try:
            print(f"\nπŸ” Trying: {name} -> {self.base_url + path}")
            try:
                # Navigate to URL
                await self.page.goto(self.base_url + path, wait_until="networkidle")
                await self.page.wait_for_timeout(3000)  # Wait for content load
                # Check page content
                title = await self.page.title()
                url = self.page.url
                # Look for tables
                tables = await self.page.query_selector_all("table")
                table_count = len(tables)
                # Look for form elements (filters)
                selects = await self.page.query_selector_all("select")
                inputs = await self.page.query_selector_all("input")
                # Look for station/weather related content
                content_indicators = await self.check_content_indicators()
                results[name] = {
                    "url": url,
                    "title": title,
                    "table_count": table_count,
                    "select_count": len(selects),
                    "input_count": len(inputs),
                    "content_indicators": content_indicators,
                    "success": True
                }
                # Take screenshot if it looks promising
                if table_count > 0 or len(selects) > 2:
                    screenshot = await self.take_screenshot(f"direct_{name}")
                    results[name]["screenshot"] = screenshot
                print(f"βœ… {name}: tables={table_count}, selects={len(selects)}, inputs={len(inputs)}")
            except Exception as e:
                # Record the failure so the JSON report shows which routes broke.
                results[name] = {
                    "error": str(e),
                    "success": False
                }
                print(f"❌ {name}: {e}")
        await self.save_discovery("direct_navigation_results", results)
        return results

    async def check_content_indicators(self):
        """Check for weather/station related content on current page"""
        indicators = {
            "weather_terms": 0,
            "station_terms": 0,
            "data_terms": 0,
            "found_terms": []
        }
        # Terms to look for (Italian: rainfall, temperature, wind, humidity,
        # weather / station, sensor, measurement / values, data, observations)
        weather_terms = ["precipitazione", "temperatura", "vento", "umiditΓ ", "meteo"]
        station_terms = ["stazione", "stazioni", "sensore", "rilevamento"]
        data_terms = ["valori", "dati", "misure", "osservazioni"]
        page_text = await self.page.text_content("body")
        # text_content() may return None for an empty/missing body.
        page_text_lower = page_text.lower() if page_text else ""
        for term in weather_terms:
            if term in page_text_lower:
                indicators["weather_terms"] += 1
                indicators["found_terms"].append(f"weather: {term}")
        for term in station_terms:
            if term in page_text_lower:
                indicators["station_terms"] += 1
                indicators["found_terms"].append(f"station: {term}")
        for term in data_terms:
            if term in page_text_lower:
                indicators["data_terms"] += 1
                indicators["found_terms"].append(f"data: {term}")
        return indicators

    async def explore_table_structure(self, promising_urls):
        """Analyze table structure from promising URLs"""
        print("\nπŸ“Š Analyzing table structures...")
        table_analyses = {}
        for url_name, url_data in promising_urls.items():
            # Skip failed navigations and pages without tables.
            if not url_data.get("success") or url_data.get("table_count", 0) == 0:
                continue
            print(f"\nπŸ” Analyzing tables in: {url_name}")
            try:
                # Navigate to the URL
                await self.page.goto(url_data["url"], wait_until="networkidle")
                await self.page.wait_for_timeout(2000)
                # Find all tables
                tables = await self.page.query_selector_all("table")
                table_info = []
                for i, table in enumerate(tables):
                    # Get table headers
                    headers = await table.query_selector_all("th")
                    header_texts = []
                    for header in headers:
                        text = await header.text_content()
                        if text and text.strip():
                            header_texts.append(text.strip())
                    # Get table rows count
                    rows = await table.query_selector_all("tr")
                    row_count = len(rows)
                    # Get table classes/id for identification
                    table_class = await table.get_attribute("class")
                    table_id = await table.get_attribute("id")
                    table_info.append({
                        "table_index": i,
                        "headers": header_texts,
                        "row_count": row_count,
                        "table_class": table_class,
                        "table_id": table_id,
                        "has_data": row_count > 1 and len(header_texts) > 0
                    })
                    print(f" πŸ“‹ Table {i}: {len(header_texts)} headers, {row_count} rows")
                    if header_texts:
                        print(f" Headers: {', '.join(header_texts[:5])}...")
                table_analyses[url_name] = table_info
            except Exception as e:
                print(f"❌ Error analyzing {url_name}: {e}")
        await self.save_discovery("table_structure_analysis", table_analyses)
        return table_analyses

    async def explore_filter_controls(self, promising_urls):
        """Analyze filter controls from promising URLs"""
        print("\nπŸŽ›οΈ Analyzing filter controls...")
        filter_analyses = {}
        for url_name, url_data in promising_urls.items():
            # Only inspect pages that loaded and actually have <select> widgets.
            if not url_data.get("success") or url_data.get("select_count", 0) == 0:
                continue
            print(f"\nπŸ” Analyzing filters in: {url_name}")
            try:
                await self.page.goto(url_data["url"], wait_until="networkidle")
                await self.page.wait_for_timeout(2000)
                # Analyze select elements
                selects = await self.page.query_selector_all("select")
                select_info = []
                for i, select in enumerate(selects):
                    # Get select attributes
                    select_id = await select.get_attribute("id")
                    select_name = await select.get_attribute("name")
                    select_class = await select.get_attribute("class")
                    # Get options
                    options = await select.query_selector_all("option")
                    option_values = []
                    for option in options:
                        value = await option.get_attribute("value")
                        text = await option.text_content()
                        if text and text.strip():
                            option_values.append({
                                "value": value,
                                "text": text.strip()
                            })
                    select_info.append({
                        "select_index": i,
                        "id": select_id,
                        "name": select_name,
                        "class": select_class,
                        "option_count": len(option_values),
                        "options": option_values[:10]  # First 10 options
                    })
                    print(f" πŸ”½ Select {i} ({select_id}): {len(option_values)} options")
                filter_analyses[url_name] = select_info
            except Exception as e:
                print(f"❌ Error analyzing filters in {url_name}: {e}")
        await self.save_discovery("filter_control_analysis", filter_analyses)
        return filter_analyses
async def run_direct_discovery():
    """Top-level driver: set up a browser, probe OMIRL routes, report, clean up."""
    session = OMIRLDirectDiscovery()
    try:
        await session.setup_browser()
        print("πŸš€ Starting OMIRL Direct Navigation Discovery")
        print("=" * 60)

        # Step 1: probe each candidate route and record what it exposes.
        nav_results = await session.explore_direct_navigation()

        # Step 2: keep only routes that loaded successfully and showed tables.
        with_tables = {
            name: info
            for name, info in nav_results.items()
            if info.get("success") and info.get("table_count", 0) > 0
        }

        if not with_tables:
            print("\n❌ No URLs with tables found")
        else:
            print(f"\nβœ… Found {len(with_tables)} promising URLs with tables")
            # Deep-dive the table layouts, then the filter widgets
            # (explore_filter_controls does its own success/select filtering).
            await session.explore_table_structure(with_tables)
            await session.explore_filter_controls(nav_results)
            print("\n" + "=" * 60)
            print("βœ… Direct Discovery completed!")
            print(f"πŸ“ Results saved in: {DISCOVERY_OUTPUT}")
            # Summary
            print("\nSummary:")
            for name, info in with_tables.items():
                print(f" πŸ“‹ {name}: {info.get('table_count', 0)} tables, {info.get('select_count', 0)} selects")
    except Exception as e:
        print(f"❌ Discovery failed: {e}")
    finally:
        # Always tear down the browser, even on failure.
        await session.cleanup()
# Script entry point: run the whole async discovery flow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(run_direct_discovery())