# Google Images scraper: drives an Edge browser via Selenium, downloads image
# results for a set of queries, and records per-query metadata as JSON.
# Third-party: selenium (browser automation), requests (image downloads).
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
# Standard library.
import time
import requests
import os
import random
import hashlib
import json
# Pool of desktop browser user-agent strings; one is chosen at random per run
# (see __main__) so the automated browser session looks less uniform.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/101.0.1234.56 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/101.0.1234.56",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/14.1.2",
]
def fetch_image_data(query: str, max_links_to_fetch: int, wd, sleep_between_interactions: int = 5):
    """Scrape Google Images for *query* and download up to *max_links_to_fetch* images.

    Each saved image is written to ``images/<query>/<sha1-prefix>.jpg`` and a
    metadata record is collected for it.

    Parameters
    ----------
    query : str
        Search term for Google Images.
    max_links_to_fetch : int
        Stop after this many images have been downloaded successfully.
    wd
        An already-initialized Selenium WebDriver (e.g. ``webdriver.Edge``).
    sleep_between_interactions : int
        Seconds to wait after each scroll/click so the page can load.

    Returns
    -------
    list[dict]
        One ``{"url", "title", "page_url", "Id"}`` dict per saved image.
    """

    def scroll_to_end(driver):
        # Scroll to the bottom so Google lazy-loads another batch of thumbnails.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_data_list = []
    results_start = 0
    while len(image_data_list) < max_links_to_fetch:
        scroll_to_end(wd)

        # All thumbnail elements currently loaded on the results page.
        thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        # Bug fix: the original looped forever when scrolling yielded no new
        # thumbnails but the target count had not been reached. Bail out.
        if number_results <= results_start:
            print("No new results after scrolling; stopping early.")
            break

        done = False
        for img in thumbnail_results[results_start:number_results]:
            try:
                # Clicking a thumbnail opens the preview panel with the full image.
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue  # stale/obscured element — skip this thumbnail

            # Full-size image candidates inside the preview panel.
            actual_images = wd.find_elements(By.CLASS_NAME, 'pT0Scc')
            for actual_image in actual_images:
                print("ACTUAL IMAGE: ", actual_image)
                image_url = actual_image.get_attribute('src')
                # Skip data: URIs and placeholders without a real URL.
                if not image_url or 'http' not in image_url:
                    continue

                try:
                    # Bug fix: the original call had no timeout and no error
                    # handling — one stalled server hung the entire scrape.
                    response = requests.get(image_url, timeout=30)
                except requests.RequestException:
                    continue  # network failure affects this image only
                if response.status_code != 200:
                    continue

                image_title = actual_image.get_attribute('alt')
                # The enclosing <a> tag links to the page hosting the image;
                # tolerate its absence instead of crashing mid-scrape.
                try:
                    parent_a_tag = actual_image.find_element(By.XPATH, './ancestor::a')
                    image_page_url = parent_a_tag.get_attribute('href')
                except Exception:
                    image_page_url = None

                # One folder per query; exist_ok avoids a check-then-create race.
                query_folder = os.path.join('images', query)
                os.makedirs(query_folder, exist_ok=True)

                # Unique, stable file name derived from the image URL.
                file_name = hashlib.sha1(image_url.encode()).hexdigest()[:10]
                file_path = os.path.join(query_folder, f"{file_name}.jpg")
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print(f"SUCCESS - saved {image_url} - as {file_path}")

                image_data_list.append({
                    "url": image_url,
                    "title": image_title,
                    "page_url": image_page_url,
                    "Id": file_name,
                })
                if len(image_data_list) >= max_links_to_fetch:
                    print(f"Found: {len(image_data_list)} images, done!")
                    done = True
                    break
            if done:
                break

        # Next pass only inspects thumbnails not processed yet.
        results_start = number_results
    return image_data_list
if __name__ == '__main__':
    # Pick a random user agent so the session looks less like automation.
    selected_user_agent = random.choice(user_agents)

    # Configure and launch the Edge driver with the chosen user agent.
    options = webdriver.EdgeOptions()
    options.add_argument(f'user-agent={selected_user_agent}')
    wd = webdriver.Edge(options=options)

    queries = ["Elon Musk", "Barack Obama", "Taylor Swift", "Bill Gates", "Eminem"]  # change your set of queries here
    try:
        for query in queries:
            num_of_images = 20
            wd.get('https://google.com')
            search_box = wd.find_element(By.NAME, 'q')
            search_box.send_keys(query)

            image_data_list = fetch_image_data(query, num_of_images, wd)

            # Persist the per-query metadata as pretty-printed JSON.
            query_image_data = {
                "query": query,
                "images": image_data_list,
            }
            json_filename = f"{query}.json"
            # Explicit UTF-8 so titles with non-ASCII characters round-trip.
            with open(json_filename, 'w', encoding='utf-8') as json_file:
                json.dump(query_image_data, json_file, indent=4, ensure_ascii=False)
    finally:
        # Bug fix: the original only reached wd.quit() on full success, leaking
        # the browser process whenever any query raised.
        wd.quit()