QuickAgent / BrowsingAgent /tools /SolveCaptcha.py
varun324242's picture
Upload 58 files
0d3af20 verified
import base64
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import presence_of_element_located, \
frame_to_be_available_and_switch_to_it
from selenium.webdriver.support.wait import WebDriverWait
from agency_swarm.tools import BaseTool
from .util import get_b64_screenshot, remove_highlight_and_labels
from .util.selenium import get_web_driver
from agency_swarm.util import get_openai_client
class SolveCaptcha(BaseTool):
    """
    This tool asks a human to solve captcha on the current webpage. Make sure that captcha is visible before running it.
    """
    # NOTE(review): the docstring above is kept verbatim because it is presumably
    # surfaced to the agent as the tool description (confirm against BaseTool).
    # The implementation below does not involve a human: it clicks the reCAPTCHA
    # checkbox and, if an image challenge appears, asks an OpenAI vision model
    # which tiles to click.

    def run(self):
        """
        Try to solve the reCAPTCHA on the current page.

        Returns one of the status strings: "Success", "Success. Captcha solved.",
        "Could not find captcha checkbox", "Could not click captcha checkbox: ...",
        "Could not find captcha challenge" or "Could not solve captcha.".
        """
        wd = get_web_driver()
        try:
            return self._solve(wd)
        finally:
            # Every exit path leaves the driver on the top-level document, not
            # inside one of the reCAPTCHA iframes, so whatever runs next sees
            # the real page. Best effort: a failure here must not mask the result.
            try:
                wd.switch_to.default_content()
            except Exception as e:
                print(e)

    def _solve(self, wd):
        """Click the checkbox and, if required, work through the image challenge."""
        try:
            self._switch_to_frame(wd, "reCAPTCHA", 10)
            checkbox = WebDriverWait(wd, 3).until(
                presence_of_element_located((By.ID, "recaptcha-anchor"))
            )
        except Exception:
            return "Could not find captcha checkbox"

        try:
            # Scroll the checkbox into view and give the scroll time to finish,
            # then click through JavaScript.
            wd.execute_script("arguments[0].scrollIntoView(true);", checkbox)
            time.sleep(1)
            wd.execute_script("arguments[0].click();", checkbox)
        except Exception as e:
            return f"Could not click captcha checkbox: {str(e)}"

        try:
            # If the checkbox becomes checked right away there is no challenge.
            self._wait_until_checked(wd, 3)
            return "Success"
        except Exception:
            # Not checked within the timeout: fall through to the image challenge.
            pass

        wd.switch_to.default_content()
        client = get_openai_client()

        try:
            self._switch_to_frame(wd, "recaptcha challenge expires in two minutes", 10)
        except Exception:
            # Report the missing challenge like the other guarded steps do
            # instead of letting the wait timeout escape from the tool.
            return "Could not find captcha challenge"

        time.sleep(2)

        if self._solve_challenge(wd, client):
            return "Success. Captcha solved."

        wd = remove_highlight_and_labels(wd)
        wd.switch_to.default_content()
        self._close_challenge(wd)
        return "Could not solve captcha."

    def _solve_challenge(self, wd, client):
        """
        Repeatedly screenshot the tiles, ask the model which ones match and click them.

        Expects the driver to be inside the challenge iframe. Returns True as soon
        as `verify_checkbox` reports the captcha as solved, False when giving up.
        """
        max_verify_attempts = 5  # rounds that ended with a "verify" button press
        max_rounds = 20  # hard cap on model calls, including rounds that do not count above
        verify_attempts = 0
        rounds = 0

        while verify_attempts < max_verify_attempts and rounds < max_rounds:
            rounds += 1

            # Skip tiles whose class list ends with the "dynamic-selected" marker.
            tiles = [
                tile for tile in wd.find_elements(By.CLASS_NAME, "rc-imageselect-tile")
                if not tile.get_attribute("class").endswith("rc-imageselect-dynamic-selected")
            ]
            if not tiles:
                # Nothing to show the model (for example the driver is no longer
                # inside the challenge frame): stop instead of sending an empty request.
                break

            image_content = []
            for index, tile in enumerate(tiles, start=1):
                screenshot = get_b64_screenshot(wd, tile)
                image_content.append({"type": "text", "text": f"Image {index}:"})
                image_content.append(
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{screenshot}",
                            "detail": "high",
                        },
                    }
                )

            task_text = wd.find_element(
                By.CLASS_NAME, "rc-imageselect-instructions"
            ).text.strip().replace("\n", " ")

            # In this variant there is no verify press after each batch of clicks;
            # the next round simply looks at the tiles again.
            continuous_task = 'once there are none left' in task_text.lower()

            # Rephrase the on-page instruction so it reads as a request to list numbers.
            task_text = task_text.replace("Click verify", "Output 0")
            task_text = task_text.replace("click skip", "Output 0")
            task_text = task_text.replace("once", "if")
            task_text = task_text.replace("none left", "none")
            task_text = task_text.replace("all", "only")
            task_text = task_text.replace("squares", "images")

            additional_info = ""
            if len(tiles) > 9:
                additional_info = ("Keep in mind that all images are a part of a bigger image "
                                   "from left to right, and top to bottom. The grid is 4x4. ")

            response = client.chat.completions.create(
                model="gpt-4o",
                messages=self._build_messages(image_content, len(tiles), task_text, additional_info),
                max_tokens=1024,
                temperature=0.0,
            )
            selected = self._parse_selection(response.choices[0].message.content, len(tiles))

            if selected:
                for number in selected:
                    wd.execute_script("arguments[0].click();", tiles[number - 1])
                    time.sleep(0.5)
                time.sleep(3)

                if continuous_task:
                    continue
                button_label = self._click_verify(wd)
            else:
                # The model found nothing to click: submit the challenge as it is.
                button_label = self._click_verify(wd)
                time.sleep(1)

            try:
                if self.verify_checkbox(wd):
                    return True
            except Exception:
                # verify_checkbox could not get back into the challenge frame.
                print('Not checked')

            if "verify" in button_label.lower():
                verify_attempts += 1

        return False

    @staticmethod
    def _build_messages(image_content, tile_count, task_text, additional_info):
        """Assemble the system and user messages for the vision model."""
        # Built from adjacent literals so the sentences are always separated by
        # a single space, independent of how the source lines are laid out.
        system_prompt = (
            "You are an advanced AI designed to support users with visual impairments. "
            f"User will provide you with {tile_count} images numbered from 1 to {tile_count}. "
            "Your task is to output the numbers of the images that contain the requested object, "
            "or at least some part of the requested object. "
            f"{additional_info}If there are no individual images that satisfy this condition, output 0."
        )
        return [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    *image_content,
                    {
                        "type": "text",
                        "text": f"{task_text}. Only output numbers separated by commas and nothing else. "
                                f"Output 0 if there are none.",
                    },
                ],
            },
        ]

    @staticmethod
    def _parse_selection(message_text, tile_count):
        """
        Turn the model reply into a list of 1-based tile numbers to click.

        Returns an empty list when the reply is empty or None, contains no usable
        numbers, or contains 0 (the agreed "no matching images" answer). Numbers
        outside 1..tile_count and repeated numbers are dropped.
        """
        numbers = []
        for token in (message_text or "").split(","):
            token = token.strip().rstrip(".")
            if token.isdecimal():
                numbers.append(int(token))

        if 0 in numbers:
            return []

        selected = []
        for number in numbers:
            # Out-of-range values would raise IndexError (or wrap around to the
            # last tile); clicking the same tile twice would toggle it off again.
            if 1 <= number <= tile_count and number not in selected:
                selected.append(number)
        return selected

    @staticmethod
    def _click_verify(wd):
        """Click the challenge's submit button and return the label it had before the click."""
        verify_button = wd.find_element(By.ID, "recaptcha-verify-button")
        label = verify_button.text
        wd.execute_script("arguments[0].click();", verify_button)
        return label

    @staticmethod
    def _close_challenge(wd):
        """Best-effort dismissal of the challenge by clicking just above the checkbox iframe."""
        try:
            element = WebDriverWait(wd, 3).until(
                presence_of_element_located((By.XPATH, "//iframe[@title='reCAPTCHA']"))
            )
            wd.execute_script(f"document.elementFromPoint({element.location['x']}, {element.location['y']-10}).click();")
        except Exception as e:
            print(e)

    @staticmethod
    def _switch_to_frame(wd, title, timeout):
        """Wait up to `timeout` seconds for the iframe with the given title and switch into it."""
        WebDriverWait(wd, timeout).until(
            frame_to_be_available_and_switch_to_it((By.XPATH, f"//iframe[@title='{title}']"))
        )

    @staticmethod
    def _wait_until_checked(wd, timeout):
        """Wait up to `timeout` seconds for the checkbox to report aria-checked == "true"."""
        WebDriverWait(wd, timeout).until(
            lambda d: d.find_element(By.CLASS_NAME, "recaptcha-checkbox").get_attribute(
                "aria-checked") == "true"
        )

    def verify_checkbox(self, wd):
        """
        Check whether the reCAPTCHA checkbox is ticked.

        Returns True with the driver left inside the checkbox iframe. Otherwise
        switches back into the challenge iframe and returns False; if that iframe
        cannot be entered within 10 seconds the wait's exception propagates.
        """
        wd.switch_to.default_content()
        try:
            self._switch_to_frame(wd, "reCAPTCHA", 10)
            self._wait_until_checked(wd, 5)
            return True
        except Exception:
            wd.switch_to.default_content()
            self._switch_to_frame(wd, "recaptcha challenge expires in two minutes", 10)
        return False