"""Tiered web-title fetcher with a Gradio front end.

The app tries progressively heavier strategies to read a page's title:

1. Tier 1 ("Scout"): a plain HTTP GET parsed with BeautifulSoup.
2. Tier 2 ("Infiltrator"): a headless Chrome session driven by Selenium,
   for pages whose title is produced by JavaScript.
3. Tier 3 ("Digital Ghost"): a placeholder that only reports that it cannot
   run on this platform.
"""

import os
import time
from typing import Optional

import gradio as gr
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Seconds to wait for the Tier 1 HTTP request before giving up.
REQUEST_TIMEOUT_SECONDS = 10

# Seconds to let the page's JavaScript run before reading the title in Tier 2.
RENDER_WAIT_SECONDS = 5

# Location of the system-installed chromedriver. This is the path used when a
# `packages.txt` file lists 'chromium-driver' and 'chromium-browser'.
CHROMEDRIVER_PATH = "/usr/bin/chromedriver"


# --- Tier 1: The Scout (Lightweight HTTP Request) ---
def tier1_scout(url: str) -> Optional[str]:
    """Fetch the page title with a single direct HTTP request.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The stripped text of the ``<title>`` tag, or ``None`` when the request
        fails (connection error, timeout, 4xx/5xx status) or the page has no
        usable title.
    """
    # Masquerade as a common browser to bypass simple firewalls.
    # NOTE(review): advertising 'br' means a server may reply with Brotli; the
    # body is only decoded if a Brotli package is installed - confirm for the
    # deployment environment.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Connection': 'keep-alive',
    }

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # This includes connection errors, timeouts, and bad status codes.
        print(f"Tier 1 Error: {e}")
        return None

    # Parse the HTML with a lightweight tool.
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('title')
    if title and title.string:
        # A whitespace-only title is reported as "no title" (None).
        return title.string.strip() or None
    return None  # Request succeeded, but no title tag was found.


# --- Tier 2: The Infiltrator (Headless Browser) ---
def tier2_infiltrator(url: str) -> Optional[str]:
    """Fetch the page title with a headless Chrome browser (Selenium).

    The browser renders JavaScript, so titles set dynamically are visible.
    The driver is always shut down, even when navigation fails, so no browser
    process is left behind.

    Args:
        url: Absolute URL of the page to load.

    Returns:
        The stripped page title, or ``None`` when the browser could not be
        started, the page could not be loaded, or the title is empty.
    """
    driver = None
    title = None
    try:
        # Setup headless Chrome browser options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")  # Essential for running in a container
        chrome_options.add_argument("--disable-dev-shm-usage")  # Overcomes limited resource problems
        chrome_options.add_argument("--window-size=1920,1080")

        # Point directly to the system-installed chromedriver.
        service = Service(executable_path=CHROMEDRIVER_PATH)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # NOTE on Proxies: In a real-world scenario, you would integrate a
        # residential proxy here, before the driver is created.
        # Example (conceptual):
        #   PROXY = "user:pass@host:port"
        #   chrome_options.add_argument(f'--proxy-server=http://{PROXY}')

        driver.get(url)
        time.sleep(RENDER_WAIT_SECONDS)  # Wait for dynamic content to load via JavaScript
        title = driver.title
    except Exception as e:
        # Top-level boundary for this tier: any Selenium/browser failure is
        # reported and treated as "tier failed" so the caller can escalate.
        print(f"Tier 2 Error: {e}")
        return None
    finally:
        # Always release the browser; a failed quit must not mask the result.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Tier 2 Error: {e}")

    if title:
        return title.strip() or None
    return None


# --- Tier 3: The "Digital Ghost" (Conceptually Impossible on this Platform) ---
def tier3_digital_ghost(url: str) -> str:
    """Report that Tier 3 cannot run on this platform.

    Args:
        url: Target URL (unused; kept for a signature parallel to other tiers).

    Returns:
        A fixed status message explaining the limitation.
    """
    return "Tier 3 cannot be executed on this platform due to hardware and OS limitations."


def _normalize_url(url: str) -> str:
    """Strip surrounding whitespace and prepend ``https://`` if no HTTP(S) scheme is present."""
    url = url.strip()
    # Check the full scheme prefix, case-insensitively, so hosts that merely
    # begin with "http" (e.g. "httpbin.org") still get a scheme added.
    if not url.lower().startswith(("http://", "https://")):
        url = 'https://' + url
    return url


# --- Main API Function that Orchestrates the Tiers ---
def tiered_web_interaction_api(url: str) -> str:
    """Try each tier in order until one returns a title.

    Args:
        url: URL entered by the user; the scheme may be omitted.

    Returns:
        A newline-joined execution log describing which tiers ran and, on
        success, the title that was found.
    """
    if not url or not url.strip():
        # Nothing to fetch; avoid launching a browser for an empty input.
        return "❌ No URL provided. Please enter a URL."

    url = _normalize_url(url)
    log = []

    # === Try Tier 1: The Scout ===
    log.append("--- Starting Tier 1: The Scout ---")
    title = tier1_scout(url)
    if title:
        log.append("✅ Tier 1 SUCCESS")
        log.append(f"Title: {title}")
        return "\n".join(log)
    log.append("❌ Tier 1 FAILED. Escalating to Tier 2...")

    # === Try Tier 2: The Infiltrator ===
    log.append("\n--- Starting Tier 2: The Infiltrator (Headless Browser) ---")
    title = tier2_infiltrator(url)
    if title:
        log.append("✅ Tier 2 SUCCESS")
        log.append(f"Title: {title}")
        return "\n".join(log)
    log.append("❌ Tier 2 FAILED. Escalating to Tier 3...")

    # === Report on Tier 3: The "Digital Ghost" ===
    log.append("\n--- Assessing Tier 3: The Digital Ghost ---")
    status = tier3_digital_ghost(url)
    log.append(f"ℹ️ Tier 3 Status: {status}")
    log.append("\n❌ All tiers failed. The page is inaccessible with the available methods.")
    return "\n".join(log)


# --- Create the Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Conceptual Blueprint: The Tiered Web Interaction API
        Enter a URL to fetch its title. The application will use a tiered strategy,
        starting with the simplest method and escalating only if necessary.
        **Note:** Tier 2 has been configured to run on Hugging Face Spaces.
        """
    )

    with gr.Row():
        url_input = gr.Textbox(
            label="Target URL",
            placeholder="e.g., scrapethissite.com/pages/javascript/",
            scale=4
        )
        submit_button = gr.Button("Fetch Title", variant="primary", scale=1)

    output_log = gr.Textbox(label="Execution Log", lines=15, interactive=False)

    submit_button.click(
        fn=tiered_web_interaction_api,
        inputs=url_input,
        outputs=output_log
    )

    gr.Examples(
        examples=[
            "https://google.com",  # Should pass on Tier 1
            "https://github.com",  # Should pass on Tier 1
            "https://www.scrapethissite.com/pages/javascript/",  # Needs Tier 2
        ],
        inputs=url_input
    )

if __name__ == "__main__":
    demo.launch()