import csv
import os
import time
from typing import Any, Dict, List

from visa_scraper import IndonesianVisaScraper

# --- Configuration ---
OUTPUT_CSV_FILE = 'indonesian_visa_data_all.csv'
# Add a delay between requests to avoid overwhelming the server (in seconds)
REQUEST_DELAY = 0.5

# --- Test Mode Settings ---
# Set TEST_MODE to True to run on a small sample.
# Set it to False to run on all countries.
TEST_MODE = True
TEST_LIMIT = 5  # Number of countries to test if TEST_MODE is True


def save_to_csv(data: List[Dict[str, Any]], filename: str) -> None:
    """Append a list of dict rows to *filename* as CSV.

    Writes a header row only when the file does not already exist.
    Column order comes from the first row's keys, so all rows are
    expected to share the same keys. No-op when *data* is empty.
    """
    if not data:
        return

    # Check if the file already exists to decide whether to write a header
    file_exists = os.path.isfile(filename)

    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()  # Write header only if the file is new
        writer.writerows(data)


def main():
    """Orchestrate the scraping and saving process.

    Iterates country x main purpose x sub-activity x visa type, fetching
    full details for each visa, and appends rows to OUTPUT_CSV_FILE after
    every sub-activity so partial progress survives an interruption.
    """
    scraper = IndonesianVisaScraper()

    # Define the CSV headers
    csv_headers = [
        'country', 'main_purpose', 'sub_activity_name', 'visa_name',
        'visa_code', 'duration', 'stay_summary', 'cost_summary',
        'is_multiple_entry', 'is_visa_on_arrival', 'is_guarantor_required',
        'passport_validity', 'full_description', 'detailed_info_html',
        'visa_id', 'sub_activity_id'
    ]

    # Create an empty file with headers first to ensure it's clean
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_headers)
        writer.writeheader()

    total_purposes = len(scraper.PARENT_ACTIVITY_MAPPING)
    countries_to_scrape = list(scraper.COUNTRY_MAPPING.keys())

    if TEST_MODE:
        print("--- 🧪 TEST MODE ENABLED ---")
        print(f"Running for the first {TEST_LIMIT} countries only.")
        countries_to_scrape = countries_to_scrape[:TEST_LIMIT]
    else:
        print("--- 🚀 FULL SCRAPE MODE ---")

    total_countries = len(countries_to_scrape)

    print(f"Target file: {OUTPUT_CSV_FILE}")
    print(f"Scraping for {total_countries} countries and {total_purposes} main purposes.")
    print("-" * 50)

    # 1. Iterate through each country
    for i, country_name in enumerate(countries_to_scrape, 1):
        country_id = scraper.get_country_id(country_name)

        # 2. Iterate through each main purpose (parent activity)
        for j, parent_activity_name in enumerate(scraper.PARENT_ACTIVITY_MAPPING.keys(), 1):
            parent_activity_id = scraper.get_parent_activity_id(parent_activity_name)
            print(f"({i}/{total_countries}) {country_name} | ({j}/{total_purposes}) {parent_activity_name}")

            # 3. Get all sub-activities for the main purpose
            time.sleep(REQUEST_DELAY)  # Respectful delay
            sub_activities = scraper.get_sub_activities(parent_activity_id)
            if not sub_activities:
                print(" -> No sub-activities found. Skipping.")
                continue

            # 4. Iterate through each sub-activity
            for sub_activity in sub_activities:
                sub_activity_id = sub_activity['id']
                sub_activity_name = sub_activity['name']
                print(f" -> Sub-activity: {sub_activity_name}")

                # 5. Get available visa types for the sub-activity and country
                time.sleep(REQUEST_DELAY)  # Respectful delay
                visa_types_data = scraper.get_visa_types(sub_activity_id, country_id)

                rows_to_write = []
                if not visa_types_data or not visa_types_data.get('data'):
                    message = "No specific visa found"
                    if visa_types_data and visa_types_data.get('status') == 'empty':
                        message = visa_types_data.get('message', "Guarantor likely required")
                    print(f" -> {message}")
                    # Add a row indicating why there's no visa data
                    row = {field: '' for field in csv_headers}
                    row.update({
                        'country': country_name,
                        'main_purpose': parent_activity_name,
                        'sub_activity_name': sub_activity_name,
                        'visa_name': message,
                    })
                    rows_to_write.append(row)
                else:
                    visa_list = visa_types_data.get('data', [])
                    print(f" -> Found {len(visa_list)} potential visa type(s). Fetching details...")

                    # 6. Iterate through each visa type and get full details
                    for visa_type in visa_list:
                        visa_id = visa_type['id']
                        time.sleep(REQUEST_DELAY)  # Respectful delay
                        details_response = scraper.get_visa_full_details(visa_id)

                        # Use .get() so a malformed response without a
                        # 'success' key counts as a failure instead of
                        # raising KeyError mid-scrape.
                        if details_response and details_response.get('success'):
                            details = details_response['data']
                            # 7. Prepare a row with all collected data
                            row = {
                                'country': country_name,
                                'main_purpose': parent_activity_name,
                                'sub_activity_name': sub_activity_name,
                                'visa_name': details.get('name', visa_type.get('name')),
                                'visa_code': details.get('code', 'N/A'),
                                'duration': details.get('duration_time', 'N/A'),
                                'stay_summary': visa_type.get('stay_summary', 'N/A'),
                                'cost_summary': visa_type.get('cost_summary', 'N/A'),
                                'is_multiple_entry': details.get('is_multiple_entry', False),
                                'is_visa_on_arrival': details.get('is_arrival', False),
                                'is_guarantor_required': details.get('is_guarantor', False),
                                'passport_validity': f"{details.get('passport_value', 'N/A')} {details.get('passport_unit', '')}".strip(),
                                'full_description': details.get('description', 'N/A'),
                                'detailed_info_html': details.get('info_html', 'N/A'),
                                'visa_id': visa_id,
                                'sub_activity_id': sub_activity_id,
                            }
                            rows_to_write.append(row)
                            print(f" - Fetched details for: {row['visa_name']}")
                        else:
                            print(f" - FAILED to fetch details for visa ID {visa_id}")

                # 8. Append the collected rows to the CSV file
                if rows_to_write:
                    save_to_csv(rows_to_write, OUTPUT_CSV_FILE)

    print("-" * 50)
    print(f"✅ Scraping complete! Data saved to {OUTPUT_CSV_FILE}")


if __name__ == "__main__":
    main()