Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| OMIRL Direct Navigation Discovery | |
| Based on our findings, let's try direct URL navigation to discover | |
| the Valori Stazioni data structure and filtering options. | |
| """ | |
| import asyncio | |
| import time | |
| from playwright.async_api import async_playwright | |
| from pathlib import Path | |
| import json | |
# Create output directory for discoveries.
# All discovery artifacts (screenshots and JSON summaries) are written here;
# the directory is created at import time so methods can write unconditionally.
DISCOVERY_OUTPUT = Path("data/examples/omirl_discovery")
DISCOVERY_OUTPUT.mkdir(parents=True, exist_ok=True)
class OMIRLDirectDiscovery:
    """Discover the OMIRL portal's data pages by direct URL navigation.

    Drives a visible Chromium instance via Playwright, probes a list of
    candidate hash-routed URLs under ``base_url``, and records which pages
    expose tables and filter controls.  All artifacts (screenshots and JSON
    summaries) are written under ``DISCOVERY_OUTPUT``.
    """

    def __init__(self):
        # Playwright handles; all populated by setup_browser().
        # FIX: keep the driver handle so cleanup() can stop it (the original
        # discarded it, leaking the Playwright driver process).
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.base_url = "https://omirl.regione.liguria.it"

    async def setup_browser(self):
        """Initialize browser with discovery-friendly settings."""
        # FIX: store the driver handle on self instead of a discarded local,
        # so cleanup() can call stop() on it.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=False,  # Visible for observation
            slow_mo=500,     # Slow interactions
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            locale="it-IT",
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
        )
        self.page = await self.context.new_page()
        # Mirror in-page console messages to stdout for debugging.
        self.page.on("console", lambda msg: print(f"Console: {msg.text}"))

    async def cleanup(self):
        """Close the browser and stop the Playwright driver process."""
        if self.browser:
            await self.browser.close()
        # FIX: also stop the driver so its subprocess does not linger.
        if self.playwright:
            await self.playwright.stop()

    async def take_screenshot(self, name):
        """Save a full-page screenshot as <name>.png and return its path as str."""
        screenshot_path = DISCOVERY_OUTPUT / f"{name}.png"
        await self.page.screenshot(path=screenshot_path, full_page=True)
        print(f"πΈ Screenshot: {screenshot_path}")
        return str(screenshot_path)

    async def save_discovery(self, step_name, data):
        """Dump `data` as pretty-printed UTF-8 JSON to <step_name>.json."""
        output_file = DISCOVERY_OUTPUT / f"{step_name}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"β Saved: {output_file}")

    async def explore_direct_navigation(self):
        """Try direct navigation to different OMIRL sections.

        Returns a dict keyed by probe name.  Each successful entry records the
        landing URL, page title, counts of tables/selects/inputs, and content
        indicators; failures record {"error": str(e), "success": False}.
        """
        print("\nπ― Exploring direct navigation to OMIRL sections...")
        # URLs to try - starting with the correct one we know
        urls_to_try = [
            ("sensorstable", "/#/sensorstable"),  # THE CORRECT ONE!
            ("summarytable", "/#/summarytable"),
            ("valori_stazioni", "/#/summarytable/valori_stazioni"),
            ("valori", "/#/summarytable/valori"),
            ("stazioni", "/#/summarytable/stazioni"),
            ("tabelle_valori", "/#/tabelle/valori"),
            ("tabelle_stazioni", "/#/tabelle/stazioni"),
            ("data_valori", "/#/data/valori_stazioni"),
            ("dati", "/#/dati"),
            ("dati_tabelle", "/#/dati/tabelle")
        ]
        results = {}
        for name, path in urls_to_try:
            print(f"\nπ Trying: {name} -> {self.base_url + path}")
            try:
                # Navigate and allow the SPA time to render its content.
                await self.page.goto(self.base_url + path, wait_until="networkidle")
                await self.page.wait_for_timeout(3000)  # Wait for content load
                # Record where we actually landed (hash routes may redirect).
                title = await self.page.title()
                url = self.page.url
                # Look for tables
                tables = await self.page.query_selector_all("table")
                table_count = len(tables)
                # Look for form elements (filters)
                selects = await self.page.query_selector_all("select")
                inputs = await self.page.query_selector_all("input")
                # Look for station/weather related content
                content_indicators = await self.check_content_indicators()
                results[name] = {
                    "url": url,
                    "title": title,
                    "table_count": table_count,
                    "select_count": len(selects),
                    "input_count": len(inputs),
                    "content_indicators": content_indicators,
                    "success": True
                }
                # Take a screenshot only for promising pages (has a table or
                # several filter dropdowns).
                if table_count > 0 or len(selects) > 2:
                    screenshot = await self.take_screenshot(f"direct_{name}")
                    results[name]["screenshot"] = screenshot
                print(f"β {name}: tables={table_count}, selects={len(selects)}, inputs={len(inputs)}")
            except Exception as e:
                # Best-effort probing: record the failure and move on to the
                # next candidate URL.
                results[name] = {
                    "error": str(e),
                    "success": False
                }
                print(f"β {name}: {e}")
        await self.save_discovery("direct_navigation_results", results)
        return results

    async def check_content_indicators(self):
        """Count weather/station/data-related Italian terms on the current page.

        Returns a dict with per-category counts and a flat list of the terms
        that were found in the page body text.
        """
        indicators = {
            "weather_terms": 0,
            "station_terms": 0,
            "data_terms": 0,
            "found_terms": []
        }
        # Terms to look for.
        # FIX: "umidità" restored from a mojibake-corrupted source string
        # ("umiditΓ "), which could never match Italian page text.
        weather_terms = ["precipitazione", "temperatura", "vento", "umidità", "meteo"]
        station_terms = ["stazione", "stazioni", "sensore", "rilevamento"]
        data_terms = ["valori", "dati", "misure", "osservazioni"]
        page_text = await self.page.text_content("body")
        page_text_lower = page_text.lower() if page_text else ""
        for term in weather_terms:
            if term in page_text_lower:
                indicators["weather_terms"] += 1
                indicators["found_terms"].append(f"weather: {term}")
        for term in station_terms:
            if term in page_text_lower:
                indicators["station_terms"] += 1
                indicators["found_terms"].append(f"station: {term}")
        for term in data_terms:
            if term in page_text_lower:
                indicators["data_terms"] += 1
                indicators["found_terms"].append(f"data: {term}")
        return indicators

    async def explore_table_structure(self, promising_urls):
        """Analyze table structure from promising URLs.

        For each successful entry in `promising_urls` that reported at least
        one table, revisits the URL and records per-table headers, row count
        and class/id attributes.  Results are saved to JSON and returned.
        """
        print("\nπ Analyzing table structures...")
        table_analyses = {}
        for url_name, url_data in promising_urls.items():
            # Skip failed probes and pages with no tables.
            if not url_data.get("success") or url_data.get("table_count", 0) == 0:
                continue
            print(f"\nπ Analyzing tables in: {url_name}")
            try:
                # Navigate to the URL recorded during the probe phase.
                await self.page.goto(url_data["url"], wait_until="networkidle")
                await self.page.wait_for_timeout(2000)
                # Find all tables
                tables = await self.page.query_selector_all("table")
                table_info = []
                for i, table in enumerate(tables):
                    # Get table headers
                    headers = await table.query_selector_all("th")
                    header_texts = []
                    for header in headers:
                        text = await header.text_content()
                        if text and text.strip():
                            header_texts.append(text.strip())
                    # Get table rows count
                    rows = await table.query_selector_all("tr")
                    row_count = len(rows)
                    # Get table classes/id for identification
                    table_class = await table.get_attribute("class")
                    table_id = await table.get_attribute("id")
                    table_info.append({
                        "table_index": i,
                        "headers": header_texts,
                        "row_count": row_count,
                        "table_class": table_class,
                        "table_id": table_id,
                        # > 1 row because the header row counts as a <tr>.
                        "has_data": row_count > 1 and len(header_texts) > 0
                    })
                    print(f" π Table {i}: {len(header_texts)} headers, {row_count} rows")
                    if header_texts:
                        print(f" Headers: {', '.join(header_texts[:5])}...")
                table_analyses[url_name] = table_info
            except Exception as e:
                print(f"β Error analyzing {url_name}: {e}")
        await self.save_discovery("table_structure_analysis", table_analyses)
        return table_analyses

    async def explore_filter_controls(self, promising_urls):
        """Analyze filter controls from promising URLs.

        For each successful entry in `promising_urls` that reported at least
        one <select>, revisits the URL and records each select's attributes
        and its first 10 options.  Results are saved to JSON and returned.
        """
        print("\nποΈ Analyzing filter controls...")
        filter_analyses = {}
        for url_name, url_data in promising_urls.items():
            # Skip failed probes and pages with no dropdowns.
            if not url_data.get("success") or url_data.get("select_count", 0) == 0:
                continue
            print(f"\nπ Analyzing filters in: {url_name}")
            try:
                await self.page.goto(url_data["url"], wait_until="networkidle")
                await self.page.wait_for_timeout(2000)
                # Analyze select elements
                selects = await self.page.query_selector_all("select")
                select_info = []
                for i, select in enumerate(selects):
                    # Get select attributes
                    select_id = await select.get_attribute("id")
                    select_name = await select.get_attribute("name")
                    select_class = await select.get_attribute("class")
                    # Get options
                    options = await select.query_selector_all("option")
                    option_values = []
                    for option in options:
                        value = await option.get_attribute("value")
                        text = await option.text_content()
                        if text and text.strip():
                            option_values.append({
                                "value": value,
                                "text": text.strip()
                            })
                    select_info.append({
                        "select_index": i,
                        "id": select_id,
                        "name": select_name,
                        "class": select_class,
                        "option_count": len(option_values),
                        "options": option_values[:10]  # First 10 options
                    })
                    print(f" π½ Select {i} ({select_id}): {len(option_values)} options")
                filter_analyses[url_name] = select_info
            except Exception as e:
                print(f"β Error analyzing filters in {url_name}: {e}")
        await self.save_discovery("filter_control_analysis", filter_analyses)
        return filter_analyses
async def run_direct_discovery():
    """Drive a full OMIRL direct-navigation discovery session.

    Sets up the browser, probes all candidate URLs, then analyzes table
    structure and filter controls for the pages that rendered tables.
    The browser is always cleaned up, even on failure.
    """
    explorer = OMIRLDirectDiscovery()
    try:
        await explorer.setup_browser()
        print("π Starting OMIRL Direct Navigation Discovery")
        print("=" * 60)

        # Step 1: probe every candidate URL.
        nav_results = await explorer.explore_direct_navigation()

        # Step 2: keep only probes that succeeded and rendered at least one table.
        with_tables = {}
        for key, info in nav_results.items():
            if info.get("success") and info.get("table_count", 0) > 0:
                with_tables[key] = info

        if not with_tables:
            print("\nβ No URLs with tables found")
        else:
            print(f"\nβ Found {len(with_tables)} promising URLs with tables")
            # Analyze table structures on the table-bearing pages.
            await explorer.explore_table_structure(with_tables)
            # Filter controls are checked across every probed URL, not just
            # the table-bearing ones.
            await explorer.explore_filter_controls(nav_results)
            print("\n" + "=" * 60)
            print("β Direct Discovery completed!")
            print(f"π Results saved in: {DISCOVERY_OUTPUT}")
            print("\nSummary:")
            for key, info in with_tables.items():
                print(f" π {key}: {info.get('table_count', 0)} tables, {info.get('select_count', 0)} selects")
    except Exception as exc:
        print(f"β Discovery failed: {exc}")
    finally:
        await explorer.cleanup()
# Script entry point: run the async discovery workflow to completion.
if __name__ == "__main__":
    asyncio.run(run_direct_discovery())