#!/usr/bin/env python3
"""Download AI/ML papers from OpenAlex by driving the ``openalex`` CLI.

The script loads environment variables from a ``.env`` file, requires
``OPENALEX_API_KEY`` to be set, builds a filter from a fixed list of topic
IDs, a publication-year range and a minimum citation count, and then runs
``openalex download`` while echoing the tool's output to the terminal.

Exit status is 0 on success and non-zero when the API key is missing, the
download fails, the user interrupts the run, or an unexpected error occurs.
"""
import os
import subprocess
import sys

from dotenv import load_dotenv


def _stream_command(command):
    """Run *command*, echoing its merged stdout/stderr line by line.

    Args:
        command: Argument list passed to ``subprocess.Popen`` (no shell).

    Returns:
        The child process's return code.
    """
    # The context manager closes the pipe and waits for the child even when
    # the loop is left early (e.g. on KeyboardInterrupt).
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave errors with normal output
        text=True,
        bufsize=1,  # line-buffered so progress appears as it is produced
    ) as process:
        for line in process.stdout:
            print(line, end="")
    return process.returncode


def main():
    """Validate configuration, run the OpenAlex download and report the result.

    Calls ``sys.exit`` with a non-zero status when the API key is missing,
    the download command fails, the user interrupts the run, or any other
    exception is raised while running the command.
    """
    # Load environment variables from .env file
    load_dotenv()

    # Configuration
    output_dir = "data"

    # Top AI/ML topics identified via API search
    topic_ids = [
        "T12072",  # Machine Learning and Algorithms
        "T11948",  # Deep Learning and Neural Networks
    ]
    topic_filter = "|".join(topic_ids)
    pub_year = "2018-2024"
    min_citations = "20"

    # Check for API key
    api_key = os.environ.get("OPENALEX_API_KEY")
    if not api_key:
        print("Error: OPENALEX_API_KEY environment variable is not set.")
        print("Please get your API key from https://openalex.org/settings/api and set it:")
        print("export OPENALEX_API_KEY='your-key-here'")
        sys.exit(1)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    filter_str = (
        f"topics.id:{topic_filter},"
        f"publication_year:{pub_year},"
        f"cited_by_count:>{min_citations}"
    )

    print("🚀 Starting OpenAlex download for AI/ML papers...")
    print(f"📂 Output directory: {output_dir}")
    print(f"🔍 Filter: {filter_str}")

    # Build the command.
    # NOTE(review): the API key is passed as a command-line argument and is
    # therefore visible in the process list; switch to an environment-based
    # option if the openalex CLI offers one (to be confirmed).
    command = [
        "openalex", "download",
        "--api-key", api_key,
        "--output", output_dir,
        "--filter", filter_str,
        "--resume",
        "--workers", "10",
    ]

    try:
        # Run the command and echo its output to the terminal as it arrives
        return_code = _stream_command(command)
    except KeyboardInterrupt:
        print("\n🛑 Download interrupted by user. Run again to resume.")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected (e.g. the openalex
        # executable is not installed) and exit with a failure status.
        print(f"\n💥 An error occurred: {e}")
        sys.exit(1)

    if return_code == 0:
        print("\n✅ Download completed successfully.")
        return

    print(f"\n❌ Download failed with return code {return_code}.")
    print("You can run this script again to resume from the last checkpoint.")
    # Propagate the failure so shells/CI see a non-zero status; a negative
    # return code (child killed by a signal) is mapped to the generic 1.
    sys.exit(return_code if return_code > 0 else 1)


if __name__ == "__main__":
    main()