Spaces:
Build error
Build error
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.service import Service | |
| from selenium.webdriver.chrome.options import Options | |
| import time | |
| import os | |
| # --- Tier 1: The Scout (Lightweight HTTP Request) --- | |
def tier1_scout(url: str):
    """Fetch a page title with one plain HTTP request (Tier 1).

    The page is downloaded with ``requests`` using browser-like headers and
    parsed with BeautifulSoup's ``html.parser``. No JavaScript is executed.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The whitespace-stripped text of the page's ``<title>`` element, or
        ``None`` when the request fails (connection error, timeout, 4xx/5xx
        status) or the page has no non-empty title.
    """
    # Masquerade as a common browser to bypass simple firewalls.
    # Accept-Encoding is deliberately NOT overridden: requests advertises only
    # the encodings it is able to decode. Forcing "br" lets servers answer
    # with Brotli, which comes back undecoded when no Brotli library is
    # installed, and the title would then never be found.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Connection': 'keep-alive',
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, invalid URLs and bad status codes.
        print(f"Tier 1 Error: {e}")
        return None

    # Parse the HTML with a lightweight tool.
    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        return None  # Request succeeded, but the page has no <title> tag

    # get_text() concatenates every text node under <title>; `.string` would
    # be None as soon as the tag holds more than one child (nested markup,
    # comments), wrongly reporting a titled page as untitled.
    title_text = title_tag.get_text().strip()
    return title_text or None
| # --- Tier 2: The Infiltrator (Headless Browser) --- | |
def tier2_infiltrator(url: str):
    """Fetch a page title with a headless Chrome browser (Tier 2).

    Selenium drives a headless browser so that titles set by JavaScript are
    visible. The chromedriver binary is expected at ``/usr/bin/chromedriver``
    (the location used when the Space's ``packages.txt`` installs
    ``chromium-driver`` and ``chromium-browser``).

    Args:
        url: Absolute URL of the page to load.

    Returns:
        The whitespace-stripped document title, or ``None`` when the browser
        cannot be started, the page cannot be loaded, or the title is empty.
    """
    driver = None
    try:
        # Setup headless Chrome browser options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")  # Essential for running in a container
        chrome_options.add_argument("--disable-dev-shm-usage")  # Overcomes limited resource problems
        chrome_options.add_argument("--window-size=1920,1080")

        # NOTE on Proxies: In a real-world scenario, you would integrate a
        # residential proxy here, BEFORE the driver is created (options added
        # afterwards have no effect on the running browser).
        # Example (conceptual):
        # PROXY = "user:pass@host:port"
        # chrome_options.add_argument(f'--proxy-server=http://{PROXY}')

        # Point directly to the system-installed chromedriver.
        service = Service(executable_path="/usr/bin/chromedriver")
        driver = webdriver.Chrome(service=service, options=chrome_options)

        driver.get(url)
        time.sleep(5)  # Wait for dynamic content to load via JavaScript
        title = driver.title
    except Exception as e:
        # Deliberately broad: this tier is best-effort and any failure simply
        # means "escalate to the next tier".
        print(f"Tier 2 Error: {e}")
        return None
    finally:
        # Always shut the browser down, even when loading the page failed;
        # otherwise every failed request leaks a Chromium process.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Tier 2 Error: {e}")

    if title:
        return title.strip() or None
    return None
| # --- Tier 3: The "Digital Ghost" (Conceptually Impossible on this Platform) --- | |
def tier3_digital_ghost(url: str):
    """Report that Tier 3 is unavailable on this platform.

    No request is made. Per the original design note, this tier would need a
    full graphical OS and upgraded hardware that standard Hugging Face Spaces
    do not provide, so the function only returns a fixed explanation.

    Args:
        url: Target URL; accepted for signature parity with the other tiers
            and otherwise unused.

    Returns:
        A constant human-readable status message.
    """
    limitation_notice = (
        "Tier 3 cannot be executed on this platform "
        "due to hardware and OS limitations."
    )
    return limitation_notice
| # --- Main API Function that Orchestrates the Tiers --- | |
def tiered_web_interaction_api(url: str) -> str:
    """Fetch a page title, escalating through the tiers until one succeeds.

    Tier 1 (plain HTTP request) is tried first, then Tier 2 (headless
    browser). Tier 3 is never executed; its status message is only reported
    once both earlier tiers have failed.

    Args:
        url: Target URL. Surrounding whitespace is ignored, and ``https://``
            is prepended when no ``http://`` or ``https://`` scheme is
            present.

    Returns:
        A newline-joined execution log describing each tier attempted and,
        on success, the page title. An empty or missing URL yields a single
        error line without contacting the network.
    """
    url = (url or "").strip()
    if not url:
        # Avoid a pointless request (and a slow headless-browser launch)
        # against the bare string "https://".
        return "❌ No URL provided. Please enter a URL to fetch."

    # Check for a real scheme rather than the bare prefix "http", which would
    # wrongly treat hosts such as "httpbin.org" as already having one.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    log = []

    # === Try Tier 1: The Scout ===
    log.append("--- Starting Tier 1: The Scout ---")
    title = tier1_scout(url)
    if title:
        log.append("✅ Tier 1 SUCCESS")
        log.append(f"Title: {title}")
        return "\n".join(log)
    log.append("❌ Tier 1 FAILED. Escalating to Tier 2...")

    # === Try Tier 2: The Infiltrator ===
    log.append("\n--- Starting Tier 2: The Infiltrator (Headless Browser) ---")
    title = tier2_infiltrator(url)
    if title:
        log.append("✅ Tier 2 SUCCESS")
        log.append(f"Title: {title}")
        return "\n".join(log)
    log.append("❌ Tier 2 FAILED. Escalating to Tier 3...")

    # === Report on Tier 3: The "Digital Ghost" ===
    log.append("\n--- Assessing Tier 3: The Digital Ghost ---")
    status = tier3_digital_ghost(url)
    log.append(f"ℹ️ Tier 3 Status: {status}")
    log.append("\n❌ All tiers failed. The page is inaccessible with the available methods.")
    return "\n".join(log)
| # --- Create the Gradio User Interface --- | |
# Sample inputs shown under the form; clicking one fills the URL textbox.
example_urls = [
    "https://google.com",  # Should pass on Tier 1
    "https://github.com",  # Should pass on Tier 1
    "https://www.scrapethissite.com/pages/javascript/",  # Needs Tier 2
]

# --- Gradio user interface: one URL input, one button, one read-only log ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and short usage note.
    gr.Markdown(
        """
        # Conceptual Blueprint: The Tiered Web Interaction API
        Enter a URL to fetch its title. The application will use a tiered strategy,
        starting with the simplest method and escalating only if necessary.
        **Note:** Tier 2 has been configured to run on Hugging Face Spaces.
        """
    )

    # Input row: wide textbox next to a narrow submit button (4:1 scale).
    with gr.Row():
        url_input = gr.Textbox(
            label="Target URL",
            placeholder="e.g., scrapethissite.com/pages/javascript/",
            scale=4,
        )
        submit_button = gr.Button("Fetch Title", variant="primary", scale=1)

    # Read-only area that receives the execution log returned by the API.
    output_log = gr.Textbox(label="Execution Log", lines=15, interactive=False)

    # Clicking the button runs the tiered fetch on the textbox contents.
    submit_button.click(
        fn=tiered_web_interaction_api,
        inputs=url_input,
        outputs=output_log,
    )

    gr.Examples(examples=example_urls, inputs=url_input)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()