import csv
import time
import os
from visa_scraper import IndonesianVisaScraper
from typing import List, Dict, Any
# --- Configuration ---
OUTPUT_CSV_FILE = 'indonesian_visa_data_all.csv'
# Add a delay between requests to avoid overwhelming the server (in seconds)
REQUEST_DELAY = 0.5
# --- Test Mode Settings ---
# Set TEST_MODE to True to run on a small sample.
# Set it to False to run on all countries.
TEST_MODE = True
TEST_LIMIT = 5 # Number of countries to test if TEST_MODE is True
def save_to_csv(data: List[Dict[str, Any]], filename: str, fieldnames=None):
    """
    Append a list of row dictionaries to a CSV file.

    A header row is written only when `filename` does not exist yet, so this
    can be called repeatedly to build the file incrementally.

    Args:
        data: Rows to write; each dict maps column name -> value. An empty
            list is a no-op (the file is not created or touched).
        filename: Path of the target CSV file (opened in append mode).
        fieldnames: Optional explicit column order. When omitted, columns are
            the union of keys across all rows in first-seen order. (Previously
            only the first row's keys were used, which made DictWriter raise
            ValueError if any later row carried an extra key.)
    """
    if not data:
        return
    if fieldnames is None:
        # Union of keys over all rows, preserving first-seen order, so a row
        # with an extra key cannot crash DictWriter (extrasaction='raise').
        fieldnames = list(dict.fromkeys(key for row in data for key in row))
    # Check if the file already exists to decide whether to write a header.
    file_exists = os.path.isfile(filename)
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()  # Write header only if the file is new.
        writer.writerows(data)
def main():
    """
    Main function to orchestrate the scraping and saving process.

    Walks every (country, main purpose, sub-activity) combination exposed by
    IndonesianVisaScraper, fetches the available visa types plus their full
    details, and appends the results to OUTPUT_CSV_FILE one sub-activity batch
    at a time. Progress is printed to stdout. Honors TEST_MODE/TEST_LIMIT and
    sleeps REQUEST_DELAY before each HTTP request.
    """
    scraper = IndonesianVisaScraper()
    # Define the CSV headers (must stay in sync with the row dicts built below).
    csv_headers = [
        'country', 'main_purpose', 'sub_activity_name', 'visa_name',
        'visa_code', 'duration', 'stay_summary', 'cost_summary',
        'is_multiple_entry', 'is_visa_on_arrival', 'is_guarantor_required',
        'passport_validity', 'full_description', 'detailed_info_html',
        'visa_id', 'sub_activity_id'
    ]
    # Create an empty file with headers first to ensure it's clean:
    # opening with 'w' truncates any previous run's output before the
    # append-only save_to_csv calls begin.
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_headers)
        writer.writeheader()
    total_purposes = len(scraper.PARENT_ACTIVITY_MAPPING)
    countries_to_scrape = list(scraper.COUNTRY_MAPPING.keys())
    if TEST_MODE:
        print("--- 🧪 TEST MODE ENABLED ---")
        print(f"Running for the first {TEST_LIMIT} countries only.")
        countries_to_scrape = countries_to_scrape[:TEST_LIMIT]
    else:
        print("--- 🚀 FULL SCRAPE MODE ---")
    total_countries = len(countries_to_scrape)
    print(f"Target file: {OUTPUT_CSV_FILE}")
    print(f"Scraping for {total_countries} countries and {total_purposes} main purposes.")
    print("-" * 50)
    # 1. Iterate through each country
    for i, country_name in enumerate(countries_to_scrape, 1):
        country_id = scraper.get_country_id(country_name)
        # 2. Iterate through each main purpose (parent activity)
        for j, parent_activity_name in enumerate(scraper.PARENT_ACTIVITY_MAPPING.keys(), 1):
            parent_activity_id = scraper.get_parent_activity_id(parent_activity_name)
            print(f"({i}/{total_countries}) {country_name} | ({j}/{total_purposes}) {parent_activity_name}")
            # 3. Get all sub-activities for the main purpose
            time.sleep(REQUEST_DELAY)  # Respectful delay before each request
            sub_activities = scraper.get_sub_activities(parent_activity_id)
            if not sub_activities:
                print(" -> No sub-activities found. Skipping.")
                continue
            # 4. Iterate through each sub-activity
            for sub_activity in sub_activities:
                # NOTE(review): assumes each sub_activity dict carries 'id'
                # and 'name' keys — confirm against IndonesianVisaScraper.
                sub_activity_id = sub_activity['id']
                sub_activity_name = sub_activity['name']
                print(f" -> Sub-activity: {sub_activity_name}")
                # 5. Get available visa types for the sub-activity and country
                time.sleep(REQUEST_DELAY)  # Respectful delay
                visa_types_data = scraper.get_visa_types(sub_activity_id, country_id)
                rows_to_write = []
                if not visa_types_data or not visa_types_data.get('data'):
                    # No visa data; distinguish the 'empty' status response
                    # (server-supplied message) from a plain missing result.
                    message = "No specific visa found"
                    if visa_types_data and visa_types_data.get('status') == 'empty':
                        message = visa_types_data.get('message', "Guarantor likely required")
                    print(f" -> {message}")
                    # Add a row indicating why there's no visa data, with all
                    # other columns left blank so the CSV schema is preserved.
                    row = {field: '' for field in csv_headers}
                    row.update({
                        'country': country_name,
                        'main_purpose': parent_activity_name,
                        'sub_activity_name': sub_activity_name,
                        'visa_name': message,
                    })
                    rows_to_write.append(row)
                else:
                    visa_list = visa_types_data.get('data', [])
                    print(f" -> Found {len(visa_list)} potential visa type(s). Fetching details...")
                    # 6. Iterate through each visa type and get full details
                    for visa_type in visa_list:
                        visa_id = visa_type['id']
                        time.sleep(REQUEST_DELAY)  # Respectful delay
                        details_response = scraper.get_visa_full_details(visa_id)
                        # NOTE(review): indexing ['success'] assumes the key is
                        # always present in a truthy response — verify scraper
                        # contract, else this can raise KeyError.
                        if details_response and details_response['success']:
                            details = details_response['data']
                            # 7. Prepare a row with all collected data; .get()
                            # fallbacks keep the row complete when the detail
                            # payload omits optional fields.
                            row = {
                                'country': country_name,
                                'main_purpose': parent_activity_name,
                                'sub_activity_name': sub_activity_name,
                                'visa_name': details.get('name', visa_type.get('name')),
                                'visa_code': details.get('code', 'N/A'),
                                'duration': details.get('duration_time', 'N/A'),
                                'stay_summary': visa_type.get('stay_summary', 'N/A'),
                                'cost_summary': visa_type.get('cost_summary', 'N/A'),
                                'is_multiple_entry': details.get('is_multiple_entry', False),
                                'is_visa_on_arrival': details.get('is_arrival', False),
                                'is_guarantor_required': details.get('is_guarantor', False),
                                'passport_validity': f"{details.get('passport_value', 'N/A')} {details.get('passport_unit', '')}".strip(),
                                'full_description': details.get('description', 'N/A'),
                                'detailed_info_html': details.get('info_html', 'N/A'),
                                'visa_id': visa_id,
                                'sub_activity_id': sub_activity_id,
                            }
                            rows_to_write.append(row)
                            print(f" - Fetched details for: {row['visa_name']}")
                        else:
                            print(f" - FAILED to fetch details for visa ID {visa_id}")
                # 8. Append the collected rows to the CSV file (one batch per
                # sub-activity, so partial progress survives a crash).
                if rows_to_write:
                    save_to_csv(rows_to_write, OUTPUT_CSV_FILE)
    print("-" * 50)
    print(f"✅ Scraping complete! Data saved to {OUTPUT_CSV_FILE}")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()