import csv
import time
import os
from visa_scraper import IndonesianVisaScraper
from typing import List, Dict, Any
# --- Configuration ---
OUTPUT_CSV_FILE = 'indonesian_visa_data_all.csv'
# Add a delay between requests to avoid overwhelming the server (in seconds)
REQUEST_DELAY = 0.5
# --- Test Mode Settings ---
# Set TEST_MODE to True to run on a small sample.
# Set it to False to run on all countries.
TEST_MODE = True
TEST_LIMIT = 5 # Number of countries to test if TEST_MODE is True
def save_to_csv(data: List[Dict[str, Any]], filename: str, fieldnames=None):
    """
    Append a list of row dictionaries to a CSV file.

    A header row is written only when `filename` does not exist yet, so this
    can be called repeatedly to build the file incrementally.

    Args:
        data: Rows to write; each dict maps column name -> value. An empty
            list is a no-op (the file is not created or touched).
        filename: Path of the target CSV file (opened in append mode).
        fieldnames: Optional explicit column order. When omitted, columns are
            the union of keys across all rows in first-seen order. (Previously
            only the first row's keys were used, which made DictWriter raise
            ValueError if any later row carried an extra key.)
    """
    if not data:
        return
    if fieldnames is None:
        # Union of keys over all rows, preserving first-seen order, so a row
        # with an extra key cannot crash DictWriter (extrasaction='raise').
        fieldnames = list(dict.fromkeys(key for row in data for key in row))
    # Check if the file already exists to decide whether to write a header.
    file_exists = os.path.isfile(filename)
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()  # Write header only if the file is new.
        writer.writerows(data)
def main():
    """
    Main function to orchestrate the scraping and saving process.

    Walks every (country, main purpose, sub-activity) combination exposed by
    IndonesianVisaScraper, fetches the available visa types plus their full
    details, and appends the results to OUTPUT_CSV_FILE one sub-activity batch
    at a time. Progress is printed to stdout. Honors TEST_MODE/TEST_LIMIT and
    sleeps REQUEST_DELAY before each HTTP request.
    """
    scraper = IndonesianVisaScraper()
    # Define the CSV headers (must stay in sync with the row dicts built below).
    csv_headers = [
        'country', 'main_purpose', 'sub_activity_name', 'visa_name',
        'visa_code', 'duration', 'stay_summary', 'cost_summary',
        'is_multiple_entry', 'is_visa_on_arrival', 'is_guarantor_required',
        'passport_validity', 'full_description', 'detailed_info_html',
        'visa_id', 'sub_activity_id'
    ]
    # Create an empty file with headers first to ensure it's clean:
    # opening with 'w' truncates any previous run's output before the
    # append-only save_to_csv calls begin.
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_headers)
        writer.writeheader()
    total_purposes = len(scraper.PARENT_ACTIVITY_MAPPING)
    countries_to_scrape = list(scraper.COUNTRY_MAPPING.keys())
    if TEST_MODE:
        print("--- 🧪 TEST MODE ENABLED ---")
        print(f"Running for the first {TEST_LIMIT} countries only.")
        countries_to_scrape = countries_to_scrape[:TEST_LIMIT]
    else:
        print("--- 🚀 FULL SCRAPE MODE ---")
    total_countries = len(countries_to_scrape)
    print(f"Target file: {OUTPUT_CSV_FILE}")
    print(f"Scraping for {total_countries} countries and {total_purposes} main purposes.")
    print("-" * 50)
    # 1. Iterate through each country
    for i, country_name in enumerate(countries_to_scrape, 1):
        country_id = scraper.get_country_id(country_name)
        # 2. Iterate through each main purpose (parent activity)
        for j, parent_activity_name in enumerate(scraper.PARENT_ACTIVITY_MAPPING.keys(), 1):
            parent_activity_id = scraper.get_parent_activity_id(parent_activity_name)
            print(f"({i}/{total_countries}) {country_name} | ({j}/{total_purposes}) {parent_activity_name}")
            # 3. Get all sub-activities for the main purpose
            time.sleep(REQUEST_DELAY)  # Respectful delay before each request
            sub_activities = scraper.get_sub_activities(parent_activity_id)
            if not sub_activities:
                print(" -> No sub-activities found. Skipping.")
                continue
            # 4. Iterate through each sub-activity
            for sub_activity in sub_activities:
                # NOTE(review): assumes each sub_activity dict carries 'id'
                # and 'name' keys — confirm against IndonesianVisaScraper.
                sub_activity_id = sub_activity['id']
                sub_activity_name = sub_activity['name']
                print(f" -> Sub-activity: {sub_activity_name}")
                # 5. Get available visa types for the sub-activity and country
                time.sleep(REQUEST_DELAY)  # Respectful delay
                visa_types_data = scraper.get_visa_types(sub_activity_id, country_id)
                rows_to_write = []
                if not visa_types_data or not visa_types_data.get('data'):
                    # No visa data; distinguish the 'empty' status response
                    # (server-supplied message) from a plain missing result.
                    message = "No specific visa found"
                    if visa_types_data and visa_types_data.get('status') == 'empty':
                        message = visa_types_data.get('message', "Guarantor likely required")
                    print(f" -> {message}")
                    # Add a row indicating why there's no visa data, with all
                    # other columns left blank so the CSV schema is preserved.
                    row = {field: '' for field in csv_headers}
                    row.update({
                        'country': country_name,
                        'main_purpose': parent_activity_name,
                        'sub_activity_name': sub_activity_name,
                        'visa_name': message,
                    })
                    rows_to_write.append(row)
                else:
                    visa_list = visa_types_data.get('data', [])
                    print(f" -> Found {len(visa_list)} potential visa type(s). Fetching details...")
                    # 6. Iterate through each visa type and get full details
                    for visa_type in visa_list:
                        visa_id = visa_type['id']
                        time.sleep(REQUEST_DELAY)  # Respectful delay
                        details_response = scraper.get_visa_full_details(visa_id)
                        # NOTE(review): indexing ['success'] assumes the key is
                        # always present in a truthy response — verify scraper
                        # contract, else this can raise KeyError.
                        if details_response and details_response['success']:
                            details = details_response['data']
                            # 7. Prepare a row with all collected data; .get()
                            # fallbacks keep the row complete when the detail
                            # payload omits optional fields.
                            row = {
                                'country': country_name,
                                'main_purpose': parent_activity_name,
                                'sub_activity_name': sub_activity_name,
                                'visa_name': details.get('name', visa_type.get('name')),
                                'visa_code': details.get('code', 'N/A'),
                                'duration': details.get('duration_time', 'N/A'),
                                'stay_summary': visa_type.get('stay_summary', 'N/A'),
                                'cost_summary': visa_type.get('cost_summary', 'N/A'),
                                'is_multiple_entry': details.get('is_multiple_entry', False),
                                'is_visa_on_arrival': details.get('is_arrival', False),
                                'is_guarantor_required': details.get('is_guarantor', False),
                                'passport_validity': f"{details.get('passport_value', 'N/A')} {details.get('passport_unit', '')}".strip(),
                                'full_description': details.get('description', 'N/A'),
                                'detailed_info_html': details.get('info_html', 'N/A'),
                                'visa_id': visa_id,
                                'sub_activity_id': sub_activity_id,
                            }
                            rows_to_write.append(row)
                            print(f" - Fetched details for: {row['visa_name']}")
                        else:
                            print(f" - FAILED to fetch details for visa ID {visa_id}")
                # 8. Append the collected rows to the CSV file (one batch per
                # sub-activity, so partial progress survives a crash).
                if rows_to_write:
                    save_to_csv(rows_to_write, OUTPUT_CSV_FILE)
    print("-" * 50)
    print(f"✅ Scraping complete! Data saved to {OUTPUT_CSV_FILE}")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()