# shoolinicv / scraper.py
# Repository page header: visharxd, "First commit", 6e3c6f3
import pandas as pd
import requests
import os
import logging
from requests.exceptions import RequestException
def _cell_to_text(value):
    """Render one spreadsheet cell as text.

    pandas promotes an integer column to float as soon as a single cell is
    blank, which would turn the id 123 into the text "123.0". Whole-number
    floats are therefore rendered without the fractional part. Surrounding
    whitespace is removed because it would otherwise leak into the URL.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def _filename_component(text):
    """Return *text* with characters that are unsafe in a file name replaced.

    Path separators and the characters Windows forbids in file names
    (< > : " / \\ | ? *) as well as control characters become "_", so a
    student name can neither break ``open()`` nor escape the output folder.
    """
    forbidden = '<>:"/\\|?*'
    cleaned = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in text
    )
    # Windows does not allow names ending in a dot or a space.
    return cleaned.strip(" .") or "unknown"


def download_images(excel_file, output_folder):
    """Download one face image per spreadsheet row into *output_folder*.

    The Excel file must contain the columns ``Registration Id`` and
    ``Student Name``. For every row the image is fetched from the S3 URL
    built from the registration id and stored as
    ``<Student Name>_<Registration Id>.png``. Failures for a single row
    (HTTP errors, timeouts, empty bodies, file errors) are logged and the
    loop continues with the next row.

    Args:
        excel_file: Path of the Excel workbook to read.
        output_folder: Directory that receives the images; created if missing.

    Raises:
        KeyError: If a required column is missing from the workbook.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Read Excel file
    df = pd.read_excel(excel_file)

    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Base URL template
    base_url = "https://myshoolini.s3.ap-south-1.amazonaws.com/face_detect_data/{}/img/1.png"

    # Iterate through roll numbers
    for index, row in df.iterrows():
        raw_roll = row['Registration Id']  # Adjust column name as needed
        raw_name = row['Student Name']  # Adjust column name as needed

        # A blank id cell would otherwise request ".../nan/img/1.png".
        if pd.isna(raw_roll):
            logger.warning(f"Skipping row {index}: missing Registration Id")
            continue

        roll_number = _cell_to_text(raw_roll)
        name = "unknown" if pd.isna(raw_name) else _cell_to_text(raw_name)

        try:
            # Construct full URL
            url = base_url.format(roll_number)

            # Log the URL being accessed
            logger.info(f"Attempting to access URL: {url}")

            # Send GET request with timeout
            response = requests.get(url, timeout=10)
            status = response.status_code

            # Report failures first so the success path stays flat.
            if status == 404:
                logger.error(f"Image not found for {roll_number} - {name}")
                continue
            if status == 403:
                logger.error(f"Access forbidden for {roll_number} - {name}")
                continue
            if status != 200:
                logger.error(f"HTTP {status} error for {roll_number} - {name}")
                continue

            logger.info(f"Successfully accessed {url}")

            if not response.content:
                logger.error(f"Empty response received for {roll_number} - {name}")
                continue

            # Save image with name as filename; both parts are sanitized so
            # the result is always a plain file inside output_folder.
            filename = os.path.join(
                output_folder,
                f"{_filename_component(name)}_{_filename_component(roll_number)}.png",
            )
            with open(filename, 'wb') as file:
                file.write(response.content)
            logger.info(f"Successfully downloaded image for {roll_number} - {name}")
        except requests.Timeout:
            logger.error(f"Request timed out for {roll_number} - {name}")
        except requests.ConnectionError:
            logger.error(f"Connection error for {roll_number} - {name}")
        except RequestException as e:
            logger.error(f"Request failed for {roll_number} - {name}: {str(e)}")
        except Exception as e:
            # Last-resort boundary: one bad row must not abort the whole batch.
            logger.error(f"Unexpected error for {roll_number} - {name}: {str(e)}")
# Example usage:
#   python scraper.py                      -> uses the defaults below
#   python scraper.py students.xlsx ./out  -> custom workbook and folder
if __name__ == "__main__":
    # Imported here because only the command-line entry point needs it.
    import argparse

    parser = argparse.ArgumentParser(
        description="Download student face images listed in an Excel file."
    )
    parser.add_argument(
        "excel_file",
        nargs="?",
        default="list6.xlsx",  # Replace with your Excel file path
        help="Excel workbook with 'Registration Id' and 'Student Name' columns",
    )
    parser.add_argument(
        "output_folder",
        nargs="?",
        default=r"C:\Users\kashy\Downloads\source-code-face-recognition\source code\images",  # Output folder for images
        help="directory that receives the downloaded images",
    )
    args = parser.parse_args()

    download_images(args.excel_file, args.output_folder)