# streamlit-web-crawler/src/save_all_to_csv.py
import csv
import os
import time
from typing import Any, Dict, List

from visa_scraper import IndonesianVisaScraper
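
# Scraper interface assumed by this script, inferred from the calls below
# (see visa_scraper.py for the authoritative implementation; this is a
# summary, not a definitive signature reference):
#   COUNTRY_MAPPING          - dict mapping country name -> country id
#   PARENT_ACTIVITY_MAPPING  - dict mapping main purpose name -> activity id
#   get_country_id(name), get_parent_activity_id(name)
#   get_sub_activities(parent_activity_id)      -> list of {'id', 'name'} dicts
#   get_visa_types(sub_activity_id, country_id) -> dict with 'data'/'status'/'message'
#   get_visa_full_details(visa_id)              -> dict with 'success' and 'data'
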
# --- Configuration ---
OUTPUT_CSV_FILE = 'indonesian_visa_data_all.csv'
# Add a delay between requests to avoid overwhelming the server (in seconds)
REQUEST_DELAY = 0.5
# --- Test Mode Settings ---
# Set TEST_MODE to True to run on a small sample.
# Set it to False to run on all countries.
TEST_MODE = True
TEST_LIMIT = 5 # Number of countries to test if TEST_MODE is True
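
# Rough cost model for a full run (reasoning from the loops in main() below):
# the script sleeps REQUEST_DELAY seconds before every request, and it makes
# one request per (country, purpose) pair for sub-activities, one per
# (country, sub-activity) for visa types, and one more per visa for details.
# Runtime therefore scales roughly linearly with REQUEST_DELAY. Illustrative
# numbers (not from the scraper): 5 countries x 4 purposes x ~3 sub-activities
# x ~2 visas is 20 + 60 + 120 = 200 requests, i.e. ~100 s of sleep alone at
# 0.5 s per request.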


def save_to_csv(data: List[Dict[str, Any]], filename: str) -> None:
    """
    Saves a list of dictionaries to a CSV file. Appends if the file exists,
    otherwise creates a new file and writes the header.
    """
    if not data:
        return
    # Check whether the file already exists to decide whether to write a header.
    file_exists = os.path.isfile(filename)
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()  # Write the header only if the file is new.
        writer.writerows(data)
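
# Illustrative usage of the append semantics above (hypothetical filename and
# rows, not scraper output):
#   save_to_csv([{'a': 1}], 'demo.csv')  # new file: writes header + one row
#   save_to_csv([{'a': 2}], 'demo.csv')  # existing file: appends one row, no header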


def main():
    """
    Main function to orchestrate the scraping and saving process.
    """
    scraper = IndonesianVisaScraper()
    # Define the CSV headers.
    csv_headers = [
        'country', 'main_purpose', 'sub_activity_name', 'visa_name',
        'visa_code', 'duration', 'stay_summary', 'cost_summary',
        'is_multiple_entry', 'is_visa_on_arrival', 'is_guarantor_required',
        'passport_validity', 'full_description', 'detailed_info_html',
        'visa_id', 'sub_activity_id'
    ]
    # Create an empty file with headers first to ensure it's clean.
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=csv_headers)
        writer.writeheader()
    total_purposes = len(scraper.PARENT_ACTIVITY_MAPPING)
    countries_to_scrape = list(scraper.COUNTRY_MAPPING.keys())
    if TEST_MODE:
        print("--- πŸ§ͺ TEST MODE ENABLED ---")
        print(f"Running for the first {TEST_LIMIT} countries only.")
        countries_to_scrape = countries_to_scrape[:TEST_LIMIT]
    else:
        print("--- πŸš€ FULL SCRAPE MODE ---")
    total_countries = len(countries_to_scrape)
    print(f"Target file: {OUTPUT_CSV_FILE}")
    print(f"Scraping for {total_countries} countries and {total_purposes} main purposes.")
    print("-" * 50)
    # 1. Iterate through each country.
    for i, country_name in enumerate(countries_to_scrape, 1):
        country_id = scraper.get_country_id(country_name)
        # 2. Iterate through each main purpose (parent activity).
        for j, parent_activity_name in enumerate(scraper.PARENT_ACTIVITY_MAPPING.keys(), 1):
            parent_activity_id = scraper.get_parent_activity_id(parent_activity_name)
            print(f"({i}/{total_countries}) {country_name} | ({j}/{total_purposes}) {parent_activity_name}")
            # 3. Get all sub-activities for the main purpose.
            time.sleep(REQUEST_DELAY)  # Respectful delay between requests.
            sub_activities = scraper.get_sub_activities(parent_activity_id)
            if not sub_activities:
                print(" -> No sub-activities found. Skipping.")
                continue
            # 4. Iterate through each sub-activity.
            for sub_activity in sub_activities:
                sub_activity_id = sub_activity['id']
                sub_activity_name = sub_activity['name']
                print(f" -> Sub-activity: {sub_activity_name}")
                # 5. Get available visa types for the sub-activity and country.
                time.sleep(REQUEST_DELAY)  # Respectful delay between requests.
                visa_types_data = scraper.get_visa_types(sub_activity_id, country_id)
                rows_to_write = []
                if not visa_types_data or not visa_types_data.get('data'):
                    message = "No specific visa found"
                    if visa_types_data and visa_types_data.get('status') == 'empty':
                        message = visa_types_data.get('message', "Guarantor likely required")
                    print(f" -> {message}")
                    # Add a row indicating why there's no visa data.
                    row = {field: '' for field in csv_headers}
                    row.update({
                        'country': country_name,
                        'main_purpose': parent_activity_name,
                        'sub_activity_name': sub_activity_name,
                        'visa_name': message,
                    })
                    rows_to_write.append(row)
                else:
                    visa_list = visa_types_data.get('data', [])
                    print(f" -> Found {len(visa_list)} potential visa type(s). Fetching details...")
                    # 6. Iterate through each visa type and get full details.
                    for visa_type in visa_list:
                        visa_id = visa_type['id']
                        time.sleep(REQUEST_DELAY)  # Respectful delay between requests.
                        details_response = scraper.get_visa_full_details(visa_id)
                        # Use .get() so a response without a 'success' key cannot
                        # raise a KeyError and abort the whole scrape.
                        if details_response and details_response.get('success'):
                            details = details_response['data']
                            # 7. Prepare a row with all collected data.
                            row = {
                                'country': country_name,
                                'main_purpose': parent_activity_name,
                                'sub_activity_name': sub_activity_name,
                                'visa_name': details.get('name', visa_type.get('name')),
                                'visa_code': details.get('code', 'N/A'),
                                'duration': details.get('duration_time', 'N/A'),
                                'stay_summary': visa_type.get('stay_summary', 'N/A'),
                                'cost_summary': visa_type.get('cost_summary', 'N/A'),
                                'is_multiple_entry': details.get('is_multiple_entry', False),
                                'is_visa_on_arrival': details.get('is_arrival', False),
                                'is_guarantor_required': details.get('is_guarantor', False),
                                'passport_validity': f"{details.get('passport_value', 'N/A')} {details.get('passport_unit', '')}".strip(),
                                'full_description': details.get('description', 'N/A'),
                                'detailed_info_html': details.get('info_html', 'N/A'),
                                'visa_id': visa_id,
                                'sub_activity_id': sub_activity_id,
                            }
                            rows_to_write.append(row)
                            print(f" - Fetched details for: {row['visa_name']}")
                        else:
                            print(f" - FAILED to fetch details for visa ID {visa_id}")
                # 8. Append the collected rows to the CSV file.
                if rows_to_write:
                    save_to_csv(rows_to_write, OUTPUT_CSV_FILE)
print("-" * 50)
print(f"βœ… Scraping complete! Data saved to {OUTPUT_CSV_FILE}")
if __name__ == "__main__":
main()
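
# To run (illustrative; assumes visa_scraper.py is importable, e.g. by
# launching from within src/):
#   python save_all_to_csv.py
# Flip TEST_MODE to False above before starting a full scrape.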