Spaces:
Running
Running
| import csv | |
| import time | |
| import os | |
| from visa_scraper import IndonesianVisaScraper | |
| from typing import List, Dict, Any | |
# --- Configuration ---
# Output file; created fresh by main() and appended to by save_to_csv().
OUTPUT_CSV_FILE = 'indonesian_visa_data_all.csv'
# Add a delay between requests to avoid overwhelming the server (in seconds)
REQUEST_DELAY = 0.5

# --- Test Mode Settings ---
# Set TEST_MODE to True to run on a small sample.
# Set it to False to run on all countries.
TEST_MODE = True
TEST_LIMIT = 5  # Number of countries to test if TEST_MODE is True
def save_to_csv(data: List[Dict[str, Any]], filename: str):
    """
    Append a list of row dictionaries to a CSV file.

    A header row is written first when the target file does not exist yet
    *or* exists but is empty; otherwise rows are appended under the
    existing header. Field names are taken from the first row's keys.

    Args:
        data: Rows to write; all dicts are expected to share the keys of
            ``data[0]`` (extra/missing keys would raise ``ValueError``
            from ``csv.DictWriter``).
        filename: Path of the CSV file to create or append to.
    """
    if not data:
        return  # Nothing to write; avoid touching the file at all.
    # Decide on the header up front. Checking the size — not just existence —
    # fixes the case of a pre-created zero-byte file, which would otherwise
    # receive data rows with no header line.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(data[0].keys()))
        if needs_header:
            writer.writeheader()  # Write header only if the file is new/empty
        writer.writerows(data)
def main() -> None:
    """
    Orchestrate the scraping and saving process.

    Walks every (country, main purpose, sub-activity) combination exposed
    by IndonesianVisaScraper, fetches the available visa types and their
    full details, and appends one CSV row per visa (or one placeholder row
    when no visa data is returned) to OUTPUT_CSV_FILE.
    """
    scraper = IndonesianVisaScraper()

    # Define the CSV headers — this fixes the column order of the output file;
    # rows built below use exactly these keys.
    csv_headers = [
        'country', 'main_purpose', 'sub_activity_name', 'visa_name',
        'visa_code', 'duration', 'stay_summary', 'cost_summary',
        'is_multiple_entry', 'is_visa_on_arrival', 'is_guarantor_required',
        'passport_validity', 'full_description', 'detailed_info_html',
        'visa_id', 'sub_activity_id'
    ]

    # Create an empty file with headers first to ensure it's clean;
    # save_to_csv() later opens this same file in append mode.
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_headers)
        writer.writeheader()

    total_purposes = len(scraper.PARENT_ACTIVITY_MAPPING)
    countries_to_scrape = list(scraper.COUNTRY_MAPPING.keys())
    if TEST_MODE:
        print("--- π§ͺ TEST MODE ENABLED ---")
        print(f"Running for the first {TEST_LIMIT} countries only.")
        countries_to_scrape = countries_to_scrape[:TEST_LIMIT]
    else:
        print("--- π FULL SCRAPE MODE ---")
    total_countries = len(countries_to_scrape)

    print(f"Target file: {OUTPUT_CSV_FILE}")
    print(f"Scraping for {total_countries} countries and {total_purposes} main purposes.")
    print("-" * 50)

    # 1. Iterate through each country
    for i, country_name in enumerate(countries_to_scrape, 1):
        country_id = scraper.get_country_id(country_name)
        # 2. Iterate through each main purpose (parent activity)
        for j, parent_activity_name in enumerate(scraper.PARENT_ACTIVITY_MAPPING.keys(), 1):
            parent_activity_id = scraper.get_parent_activity_id(parent_activity_name)
            print(f"({i}/{total_countries}) {country_name} | ({j}/{total_purposes}) {parent_activity_name}")
            # 3. Get all sub-activities for the main purpose
            time.sleep(REQUEST_DELAY)  # Respectful delay
            sub_activities = scraper.get_sub_activities(parent_activity_id)
            if not sub_activities:
                print(" -> No sub-activities found. Skipping.")
                continue
            # 4. Iterate through each sub-activity
            for sub_activity in sub_activities:
                # NOTE(review): assumes each sub-activity dict carries 'id'
                # and 'name' keys — confirm against IndonesianVisaScraper.
                sub_activity_id = sub_activity['id']
                sub_activity_name = sub_activity['name']
                print(f" -> Sub-activity: {sub_activity_name}")
                # 5. Get available visa types for the sub-activity and country
                time.sleep(REQUEST_DELAY)  # Respectful delay
                visa_types_data = scraper.get_visa_types(sub_activity_id, country_id)
                rows_to_write = []
                if not visa_types_data or not visa_types_data.get('data'):
                    # No visa list returned: still emit one placeholder row so
                    # this (country, purpose, sub-activity) combination is
                    # represented in the CSV with the reason in 'visa_name'.
                    message = "No specific visa found"
                    if visa_types_data and visa_types_data.get('status') == 'empty':
                        message = visa_types_data.get('message', "Guarantor likely required")
                    print(f" -> {message}")
                    # Add a row indicating why there's no visa data
                    row = {field: '' for field in csv_headers}
                    row.update({
                        'country': country_name,
                        'main_purpose': parent_activity_name,
                        'sub_activity_name': sub_activity_name,
                        'visa_name': message,
                    })
                    rows_to_write.append(row)
                else:
                    visa_list = visa_types_data.get('data', [])
                    print(f" -> Found {len(visa_list)} potential visa type(s). Fetching details...")
                    # 6. Iterate through each visa type and get full details
                    for visa_type in visa_list:
                        visa_id = visa_type['id']
                        time.sleep(REQUEST_DELAY)  # Respectful delay
                        details_response = scraper.get_visa_full_details(visa_id)
                        # NOTE(review): ['success'] raises KeyError if absent;
                        # .get('success') would be safer — confirm the
                        # response always includes this key.
                        if details_response and details_response['success']:
                            details = details_response['data']
                            # 7. Prepare a row with all collected data; .get()
                            # with defaults keeps one malformed response from
                            # aborting the whole scrape.
                            row = {
                                'country': country_name,
                                'main_purpose': parent_activity_name,
                                'sub_activity_name': sub_activity_name,
                                'visa_name': details.get('name', visa_type.get('name')),
                                'visa_code': details.get('code', 'N/A'),
                                'duration': details.get('duration_time', 'N/A'),
                                'stay_summary': visa_type.get('stay_summary', 'N/A'),
                                'cost_summary': visa_type.get('cost_summary', 'N/A'),
                                'is_multiple_entry': details.get('is_multiple_entry', False),
                                'is_visa_on_arrival': details.get('is_arrival', False),
                                'is_guarantor_required': details.get('is_guarantor', False),
                                'passport_validity': f"{details.get('passport_value', 'N/A')} {details.get('passport_unit', '')}".strip(),
                                'full_description': details.get('description', 'N/A'),
                                'detailed_info_html': details.get('info_html', 'N/A'),
                                'visa_id': visa_id,
                                'sub_activity_id': sub_activity_id,
                            }
                            rows_to_write.append(row)
                            print(f" - Fetched details for: {row['visa_name']}")
                        else:
                            print(f" - FAILED to fetch details for visa ID {visa_id}")
                # 8. Append the collected rows to the CSV file after each
                # sub-activity, so partial progress survives an interruption.
                if rows_to_write:
                    save_to_csv(rows_to_write, OUTPUT_CSV_FILE)

    print("-" * 50)
    print(f"β Scraping complete! Data saved to {OUTPUT_CSV_FILE}")
| if __name__ == "__main__": | |
| main() |