# streamlit-web-crawler/src/save_all_to_csv.py
import csv
import os
import time
from typing import Any, Dict, List

from visa_scraper import IndonesianVisaScraper
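
# Scraper interface assumed by this script, inferred from the calls below
# (see visa_scraper.py for the authoritative implementation; this is a
# summary, not a definitive signature reference):
#   COUNTRY_MAPPING          - dict mapping country name -> country id
#   PARENT_ACTIVITY_MAPPING  - dict mapping main purpose name -> activity id
#   get_country_id(name), get_parent_activity_id(name)
#   get_sub_activities(parent_activity_id)      -> list of {'id', 'name'} dicts
#   get_visa_types(sub_activity_id, country_id) -> dict with 'data'/'status'/'message'
#   get_visa_full_details(visa_id)              -> dict with 'success' and 'data'
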
# --- Configuration ---
OUTPUT_CSV_FILE = 'indonesian_visa_data_all.csv'
# Add a delay between requests to avoid overwhelming the server (in seconds)
REQUEST_DELAY = 0.5
# --- Test Mode Settings ---
# Set TEST_MODE to True to run on a small sample.
# Set it to False to run on all countries.
TEST_MODE = True
TEST_LIMIT = 5 # Number of countries to test if TEST_MODE is True
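
# Rough cost model for a full run (reasoning from the loops in main() below):
# the script sleeps REQUEST_DELAY seconds before every request, and it makes
# one request per (country, purpose) pair for sub-activities, one per
# (country, sub-activity) for visa types, and one more per visa for details.
# Runtime therefore scales roughly linearly with REQUEST_DELAY. Illustrative
# numbers (not from the scraper): 5 countries x 4 purposes x ~3 sub-activities
# x ~2 visas is 20 + 60 + 120 = 200 requests, i.e. ~100 s of sleep alone at
# 0.5 s per request.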


def save_to_csv(data: List[Dict[str, Any]], filename: str) -> None:
    """
    Saves a list of dictionaries to a CSV file. Appends if the file exists,
    otherwise creates a new file and writes the header.
    """
    if not data:
        return
    # Check whether the file already exists to decide whether to write a header.
    file_exists = os.path.isfile(filename)
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()  # Write the header only if the file is new.
        writer.writerows(data)
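
# Illustrative usage of the append semantics above (hypothetical filename and
# rows, not scraper output):
#   save_to_csv([{'a': 1}], 'demo.csv')  # new file: writes header + one row
#   save_to_csv([{'a': 2}], 'demo.csv')  # existing file: appends one row, no header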


def main():
    """
    Main function to orchestrate the scraping and saving process.
    """
    scraper = IndonesianVisaScraper()
    # Define the CSV headers.
    csv_headers = [
        'country', 'main_purpose', 'sub_activity_name', 'visa_name',
        'visa_code', 'duration', 'stay_summary', 'cost_summary',
        'is_multiple_entry', 'is_visa_on_arrival', 'is_guarantor_required',
        'passport_validity', 'full_description', 'detailed_info_html',
        'visa_id', 'sub_activity_id'
    ]
    # Create an empty file with headers first to ensure it's clean.
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_headers)
        writer.writeheader()
    total_purposes = len(scraper.PARENT_ACTIVITY_MAPPING)
    countries_to_scrape = list(scraper.COUNTRY_MAPPING.keys())
    if TEST_MODE:
        print("--- πŸ§ͺ TEST MODE ENABLED ---")
        print(f"Running for the first {TEST_LIMIT} countries only.")
        countries_to_scrape = countries_to_scrape[:TEST_LIMIT]
    else:
        print("--- πŸš€ FULL SCRAPE MODE ---")
    total_countries = len(countries_to_scrape)
    print(f"Target file: {OUTPUT_CSV_FILE}")
    print(f"Scraping for {total_countries} countries and {total_purposes} main purposes.")
    print("-" * 50)
    # 1. Iterate through each country.
    for i, country_name in enumerate(countries_to_scrape, 1):
        country_id = scraper.get_country_id(country_name)
        # 2. Iterate through each main purpose (parent activity).
        for j, parent_activity_name in enumerate(scraper.PARENT_ACTIVITY_MAPPING.keys(), 1):
            parent_activity_id = scraper.get_parent_activity_id(parent_activity_name)
            print(f"({i}/{total_countries}) {country_name} | ({j}/{total_purposes}) {parent_activity_name}")
            # 3. Get all sub-activities for the main purpose.
            time.sleep(REQUEST_DELAY)  # Respectful delay between requests.
            sub_activities = scraper.get_sub_activities(parent_activity_id)
            if not sub_activities:
                print(" -> No sub-activities found. Skipping.")
                continue
            # 4. Iterate through each sub-activity.
            for sub_activity in sub_activities:
                sub_activity_id = sub_activity['id']
                sub_activity_name = sub_activity['name']
                print(f" -> Sub-activity: {sub_activity_name}")
                # 5. Get available visa types for the sub-activity and country.
                time.sleep(REQUEST_DELAY)  # Respectful delay between requests.
                visa_types_data = scraper.get_visa_types(sub_activity_id, country_id)
                rows_to_write = []
                if not visa_types_data or not visa_types_data.get('data'):
                    message = "No specific visa found"
                    if visa_types_data and visa_types_data.get('status') == 'empty':
                        message = visa_types_data.get('message', "Guarantor likely required")
                    print(f" -> {message}")
                    # Add a row indicating why there's no visa data.
                    row = {field: '' for field in csv_headers}
                    row.update({
                        'country': country_name,
                        'main_purpose': parent_activity_name,
                        'sub_activity_name': sub_activity_name,
                        'visa_name': message,
                    })
                    rows_to_write.append(row)
                else:
                    visa_list = visa_types_data.get('data', [])
                    print(f" -> Found {len(visa_list)} potential visa type(s). Fetching details...")
                    # 6. Iterate through each visa type and get full details.
                    for visa_type in visa_list:
                        visa_id = visa_type['id']
                        time.sleep(REQUEST_DELAY)  # Respectful delay between requests.
                        details_response = scraper.get_visa_full_details(visa_id)
                        # Use .get() so a response without a 'success' key cannot
                        # raise a KeyError and abort the whole scrape.
                        if details_response and details_response.get('success'):
                            details = details_response['data']
                            # 7. Prepare a row with all collected data.
                            row = {
                                'country': country_name,
                                'main_purpose': parent_activity_name,
                                'sub_activity_name': sub_activity_name,
                                'visa_name': details.get('name', visa_type.get('name')),
                                'visa_code': details.get('code', 'N/A'),
                                'duration': details.get('duration_time', 'N/A'),
                                'stay_summary': visa_type.get('stay_summary', 'N/A'),
                                'cost_summary': visa_type.get('cost_summary', 'N/A'),
                                'is_multiple_entry': details.get('is_multiple_entry', False),
                                'is_visa_on_arrival': details.get('is_arrival', False),
                                'is_guarantor_required': details.get('is_guarantor', False),
                                'passport_validity': f"{details.get('passport_value', 'N/A')} {details.get('passport_unit', '')}".strip(),
                                'full_description': details.get('description', 'N/A'),
                                'detailed_info_html': details.get('info_html', 'N/A'),
                                'visa_id': visa_id,
                                'sub_activity_id': sub_activity_id,
                            }
                            rows_to_write.append(row)
                            print(f" - Fetched details for: {row['visa_name']}")
                        else:
                            print(f" - FAILED to fetch details for visa ID {visa_id}")
                # 8. Append the collected rows to the CSV file.
                if rows_to_write:
                    save_to_csv(rows_to_write, OUTPUT_CSV_FILE)
print("-" * 50)
print(f"βœ… Scraping complete! Data saved to {OUTPUT_CSV_FILE}")
if __name__ == "__main__":
main()
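
# To run (illustrative; assumes visa_scraper.py is importable, e.g. by
# launching from within src/):
#   python save_all_to_csv.py
# Flip TEST_MODE to False above before starting a full scrape.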