Spaces:

threeorfour
/

waht

Build error

File size: 6,161 Bytes

import gradio as gr
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import os

# --- Tier 1: The Scout (Lightweight HTTP Request) ---
def tier1_scout(url: str):
    """
    Fetch the page with a plain HTTP GET and return the text of its <title>.

    This is the fast, cheap tier: no JavaScript is executed, so a title that
    is only set client-side will not be seen here.

    Args:
        url: Absolute URL to fetch (scheme included).

    Returns:
        The title text with surrounding whitespace removed, or None when the
        request fails (connection error, timeout, 4xx/5xx status) or the page
        has no non-empty <title>.
    """
    # Masquerade as a common browser to bypass simple firewalls.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        # 'br' is deliberately NOT advertised: Brotli responses are only
        # decoded when an optional brotli package is installed. Without it
        # the body would arrive still compressed and no <title> could be found.
        'Accept-Encoding': 'gzip, deflate',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Connection': 'keep-alive',
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # This includes connection errors, timeouts, and bad status codes.
        print(f"Tier 1 Error: {e}")
        return None  # Failure

    # Parse the HTML with a lightweight tool.
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('title')
    if title is None:
        return None  # Success, but no title tag found

    # get_text() rather than .string: .string is None whenever the tag has
    # more than one child node, which would hide an otherwise valid title.
    text = title.get_text().strip()
    return text or None

# --- Tier 2: The Infiltrator (Headless Browser) ---
def tier2_infiltrator(url: str):
    """
    Load the page in headless Chrome (Selenium) and return its rendered title.

    Unlike Tier 1, the page's JavaScript runs before the title is read. The
    browser is configured to work inside a Hugging Face Space by using the
    system-installed chromedriver.

    Args:
        url: Absolute URL to open (scheme included).

    Returns:
        The title with surrounding whitespace removed, or None when the
        browser could not be started, the page could not be loaded, or the
        title is empty. Errors are printed, never raised.
    """
    driver = None
    try:
        # Setup headless Chrome browser options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")  # Essential for running in a container
        chrome_options.add_argument("--disable-dev-shm-usage")  # Overcomes limited resource problems
        chrome_options.add_argument("--window-size=1920,1080")

        # NOTE on Proxies: In a real-world scenario, you would integrate a residential proxy here,
        # before the driver is created.
        # Example (conceptual):
        # PROXY = "user:pass@host:port"
        # chrome_options.add_argument(f'--proxy-server=http://{PROXY}')

        # Point directly to the system-installed chromedriver.
        # This path is correct for the environment created when you use a `packages.txt`
        # file with 'chromium-driver' and 'chromium-browser' in it.
        service = Service(executable_path="/usr/bin/chromedriver")
        driver = webdriver.Chrome(service=service, options=chrome_options)

        driver.get(url)
        time.sleep(5)  # Wait for dynamic content to load via JavaScript

        title = driver.title
    except Exception as e:
        print(f"Tier 2 Error: {e}")
        return None  # Failure
    finally:
        # Always shut the browser down, even when loading the page failed;
        # otherwise every failed request leaves a Chromium process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # A cleanup failure must not discard a title already read.
                print(f"Tier 2 Error (cleanup): {e}")

    cleaned = title.strip() if title else ""
    return cleaned or None

# --- Tier 3: The "Digital Ghost" (Conceptually Impossible on this Platform) ---
def tier3_digital_ghost(url: str):
    """
    Report why the third tier is unavailable.

    No work is performed and ``url`` is not used: the function only hands
    back a fixed message explaining that this tier cannot run on the
    current platform.
    """
    unavailable_notice = "Tier 3 cannot be executed on this platform due to hardware and OS limitations."
    return unavailable_notice


# --- Main API Function that Orchestrates the Tiers ---
def tiered_web_interaction_api(url: str):
    """
    Fetch a page title by trying each tier in order until one succeeds.

    Args:
        url: Target address. Surrounding whitespace is ignored, and
            ``https://`` is prepended when the value does not already start
            with ``http://`` or ``https://``.

    Returns:
        A newline-joined execution log. It ends with the title found by the
        first successful tier, or with the Tier 3 status and an
        "all tiers failed" line. Blank input yields a single error line
        without running any tier.
    """
    url = (url or "").strip()
    if not url:
        # Nothing to fetch; avoid issuing a request to a bare "https://".
        return "❌ No URL provided. Please enter a URL to fetch its title."

    # Check for a real scheme prefix rather than just "http", so that hosts
    # such as "httpbin.org" still get a scheme added.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    log = []

    # Tiers that can actually run, cheapest first: (number, banner, fetcher).
    attempts = (
        (1, "--- Starting Tier 1: The Scout ---", tier1_scout),
        (2, "\n--- Starting Tier 2: The Infiltrator (Headless Browser) ---", tier2_infiltrator),
    )
    for number, banner, fetch_title in attempts:
        log.append(banner)
        title = fetch_title(url)
        if title:
            log.append(f"✅ Tier {number} SUCCESS")
            log.append(f"Title: {title}")
            return "\n".join(log)
        log.append(f"❌ Tier {number} FAILED. Escalating to Tier {number + 1}...")

    # === Report on Tier 3: The "Digital Ghost" ===
    log.append("\n--- Assessing Tier 3: The Digital Ghost ---")
    status = tier3_digital_ghost(url)
    log.append(f"ℹ️ Tier 3 Status: {status}")
    log.append("\n❌ All tiers failed. The page is inaccessible with the available methods.")

    return "\n".join(log)

# --- Create the Gradio User Interface ---
# Build the UI inside a Blocks context; the resulting app object is bound to
# the module-level name `demo`, which the launch guard at the bottom uses.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Introductory Markdown text, declared before the controls.
    gr.Markdown(
        """
        # Conceptual Blueprint: The Tiered Web Interaction API
        Enter a URL to fetch its title. The application will use a tiered strategy,
        starting with the simplest method and escalating only if necessary.
        **Note:** Tier 2 has been configured to run on Hugging Face Spaces.
        """
    )

    # One row holding the URL textbox (scale=4) and the submit button (scale=1).
    with gr.Row():
        url_input = gr.Textbox(
            label="Target URL",
            placeholder="e.g., scrapethissite.com/pages/javascript/",
            scale=4
        )
        submit_button = gr.Button("Fetch Title", variant="primary", scale=1)

    # Read-only textbox that receives the execution log.
    output_log = gr.Textbox(label="Execution Log", lines=15, interactive=False)

    # Clicking the button calls tiered_web_interaction_api with the textbox
    # value and writes the returned log string into output_log.
    submit_button.click(
        fn=tiered_web_interaction_api,
        inputs=url_input,
        outputs=output_log
    )

    # Sample URLs wired to the URL textbox.
    gr.Examples(
        examples=[
            "https://google.com", # Should pass on Tier 1
            "https://github.com", # Should pass on Tier 1
            "https://www.scrapethissite.com/pages/javascript/", # Needs Tier 2
        ],
        inputs=url_input
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()