# File size: 7,584 Bytes
# 8750431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import csv
import time
import os
from visa_scraper import IndonesianVisaScraper
from typing import List, Dict, Any

# --- Configuration ---
# Destination CSV; main() truncates it at start-up and appends rows as it goes.
OUTPUT_CSV_FILE = 'indonesian_visa_data_all.csv'
# Pause, in seconds, inserted before each scraper request so the remote
# server is not flooded.
REQUEST_DELAY = 0.5

# --- Test Mode Settings ---
# TEST_MODE = True  -> scrape only the first TEST_LIMIT countries (quick sample).
# TEST_MODE = False -> scrape every country the scraper knows about.
TEST_MODE = True
# How many countries are scraped while TEST_MODE is on.
TEST_LIMIT = 5

def save_to_csv(data: List[Dict[str, Any]], filename: str) -> None:
    """Append rows to a CSV file, writing a header first if the file is new or empty.

    Args:
        data: Rows to write, one dict per row. When empty, nothing is written
            and no file is created.
        filename: Path of the CSV file. It is opened in append mode as UTF-8.

    The column set is the union of the keys of all rows in ``data``, in
    first-seen order; a key missing from a row is written as an empty cell.

    NOTE: when appending to a file that already has a header, the existing
    header is not re-read, so callers must pass rows whose keys line up with
    it.
    """
    if not data:
        return

    # A header is needed when the file is missing *or* exists but is empty
    # (e.g. it was pre-created or truncated); checking only for existence
    # would leave such a file without column names.
    try:
        needs_header = os.path.getsize(filename) == 0
    except OSError:
        needs_header = True

    # Collect keys across every row, not just the first one, so a later row
    # carrying an extra key does not make DictWriter raise ValueError.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(data)

# Column order of the output CSV; every row main() writes uses exactly these keys.
_CSV_HEADERS = (
    'country', 'main_purpose', 'sub_activity_name', 'visa_name',
    'visa_code', 'duration', 'stay_summary', 'cost_summary',
    'is_multiple_entry', 'is_visa_on_arrival', 'is_guarantor_required',
    'passport_validity', 'full_description', 'detailed_info_html',
    'visa_id', 'sub_activity_id',
)


def _no_visa_row(country_name: str, parent_activity_name: str,
                 sub_activity_name: str, message: str) -> Dict[str, Any]:
    """Return a placeholder row recording why a combination yielded no visa data."""
    row: Dict[str, Any] = {field: '' for field in _CSV_HEADERS}
    row.update({
        'country': country_name,
        'main_purpose': parent_activity_name,
        'sub_activity_name': sub_activity_name,
        'visa_name': message,
    })
    return row


def _visa_row(country_name: str, parent_activity_name: str,
              sub_activity_name: str, sub_activity_id: Any,
              visa_type: Dict[str, Any], details: Dict[str, Any]) -> Dict[str, Any]:
    """Return one CSV row merging a visa-type listing entry with its detail record."""
    passport_validity = (
        f"{details.get('passport_value', 'N/A')} {details.get('passport_unit', '')}"
    ).strip()
    return {
        'country': country_name,
        'main_purpose': parent_activity_name,
        'sub_activity_name': sub_activity_name,
        'visa_name': details.get('name', visa_type.get('name')),
        'visa_code': details.get('code', 'N/A'),
        'duration': details.get('duration_time', 'N/A'),
        'stay_summary': visa_type.get('stay_summary', 'N/A'),
        'cost_summary': visa_type.get('cost_summary', 'N/A'),
        'is_multiple_entry': details.get('is_multiple_entry', False),
        'is_visa_on_arrival': details.get('is_arrival', False),
        'is_guarantor_required': details.get('is_guarantor', False),
        'passport_validity': passport_validity,
        'full_description': details.get('description', 'N/A'),
        'detailed_info_html': details.get('info_html', 'N/A'),
        'visa_id': visa_type['id'],
        'sub_activity_id': sub_activity_id,
    }


def _collect_rows(scraper: Any, country_name: str, country_id: Any,
                  parent_activity_name: str,
                  sub_activity: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Query visa types for one sub-activity/country pair and return its CSV rows."""
    sub_activity_id = sub_activity['id']
    sub_activity_name = sub_activity['name']
    print(f"  -> Sub-activity: {sub_activity_name}")

    time.sleep(REQUEST_DELAY)  # Respectful delay
    visa_types_data = scraper.get_visa_types(sub_activity_id, country_id)

    visa_list = visa_types_data.get('data') if visa_types_data else None
    if not visa_list:
        message = "No specific visa found"
        if visa_types_data and visa_types_data.get('status') == 'empty':
            message = visa_types_data.get('message', "Guarantor likely required")
        print(f"    -> {message}")
        # Still emit a row so the CSV shows the combination was checked.
        return [_no_visa_row(country_name, parent_activity_name,
                             sub_activity_name, message)]

    print(f"    -> Found {len(visa_list)} potential visa type(s). Fetching details...")
    rows: List[Dict[str, Any]] = []
    for visa_type in visa_list:
        visa_id = visa_type['id']
        time.sleep(REQUEST_DELAY)  # Respectful delay
        details_response = scraper.get_visa_full_details(visa_id)

        # Use .get() so a response lacking 'success' or 'data' is reported as
        # a failed fetch instead of aborting the whole run with
        # KeyError/AttributeError.
        details = None
        if details_response and details_response.get('success'):
            details = details_response.get('data')
        if details is None:
            print(f"      - FAILED to fetch details for visa ID {visa_id}")
            continue

        row = _visa_row(country_name, parent_activity_name, sub_activity_name,
                        sub_activity_id, visa_type, details)
        rows.append(row)
        print(f"      - Fetched details for: {row['visa_name']}")
    return rows


def main():
    """Scrape visa data for every country/purpose combination into OUTPUT_CSV_FILE.

    The output file is first truncated and given a header row. Then, for each
    country (only the first TEST_LIMIT when TEST_MODE is on) and each parent
    activity, the sub-activities are fetched; for each sub-activity the
    available visa types and their full details are fetched and appended to
    the CSV straight away, so partial results survive an interrupted run.
    A REQUEST_DELAY pause precedes every scraper request.
    """
    scraper = IndonesianVisaScraper()

    # Start from a clean file containing only the header row.
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        csv.DictWriter(f, fieldnames=_CSV_HEADERS).writeheader()

    purposes = list(scraper.PARENT_ACTIVITY_MAPPING)
    total_purposes = len(purposes)

    countries_to_scrape = list(scraper.COUNTRY_MAPPING)

    if TEST_MODE:
        print("--- 🧪 TEST MODE ENABLED ---")
        print(f"Running for the first {TEST_LIMIT} countries only.")
        countries_to_scrape = countries_to_scrape[:TEST_LIMIT]
    else:
        print("--- 🚀 FULL SCRAPE MODE ---")

    total_countries = len(countries_to_scrape)

    print(f"Target file: {OUTPUT_CSV_FILE}")
    print(f"Scraping for {total_countries} countries and {total_purposes} main purposes.")
    print("-" * 50)

    for i, country_name in enumerate(countries_to_scrape, 1):
        country_id = scraper.get_country_id(country_name)

        for j, parent_activity_name in enumerate(purposes, 1):
            parent_activity_id = scraper.get_parent_activity_id(parent_activity_name)

            print(f"({i}/{total_countries}) {country_name} | ({j}/{total_purposes}) {parent_activity_name}")

            # NOTE(review): this call takes only the parent activity id, so the
            # result looks country-independent and could be fetched once per
            # purpose — confirm the scraper is stateless before caching.
            time.sleep(REQUEST_DELAY)  # Respectful delay
            sub_activities = scraper.get_sub_activities(parent_activity_id)

            if not sub_activities:
                print("  -> No sub-activities found. Skipping.")
                continue

            for sub_activity in sub_activities:
                rows_to_write = _collect_rows(
                    scraper, country_name, country_id,
                    parent_activity_name, sub_activity,
                )
                # Flush after every sub-activity rather than once at the end.
                if rows_to_write:
                    save_to_csv(rows_to_write, OUTPUT_CSV_FILE)

    print("-" * 50)
    print(f"✅ Scraping complete! Data saved to {OUTPUT_CSV_FILE}")

# Run the scrape only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()