TigerGraph-Hack / utils /download_aiml_data.py
Meshyboi's picture
Upload 27 files
90645a4 verified
Raw
History Blame Contribute Delete
2.33 kB
#!/usr/bin/env python3
import os
import subprocess
import sys
from dotenv import load_dotenv
def main():
    """Download AI/ML papers from OpenAlex by running the ``openalex`` CLI.

    Loads variables from a ``.env`` file, reads the API key from the
    ``OPENALEX_API_KEY`` environment variable, makes sure the output
    directory exists, and then runs ``openalex download`` with a filter on
    topic IDs, publication year and citation count. The tool's combined
    stdout/stderr is echoed line by line as it arrives.

    Exits with status 1 when the API key is missing, the command cannot be
    started, the run is interrupted with Ctrl-C, or the command finishes
    with a non-zero return code.
    """
    # Load environment variables from .env file
    load_dotenv()

    # Configuration
    output_dir = "data"
    # Top AI/ML topics identified via API search
    topic_ids = [
        "T12072",  # Machine Learning and Algorithms
        "T11948",  # Deep Learning and Neural Networks
    ]
    topic_filter = "|".join(topic_ids)
    pub_year = "2018-2024"
    min_citations = "20"

    # Check for API key
    api_key = os.environ.get("OPENALEX_API_KEY")
    if not api_key:
        print("Error: OPENALEX_API_KEY environment variable is not set.")
        print("Please get your API key from https://openalex.org/settings/api and set it:")
        print("export OPENALEX_API_KEY='your-key-here'")
        sys.exit(1)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    filter_str = (
        f"topics.id:{topic_filter},"
        f"publication_year:{pub_year},"
        f"cited_by_count:>{min_citations}"
    )

    print("🚀 Starting OpenAlex download for AI/ML papers...")
    print(f"📂 Output directory: {output_dir}")
    print(f"🔍 Filter: {filter_str}")

    # Build the command as an argument list (no shell involved).
    # NOTE(review): the API key is passed as a command-line argument, which
    # makes it visible in process listings — confirm whether the CLI can read
    # it from the environment instead.
    command = [
        "openalex", "download",
        "--api-key", api_key,
        "--output", output_dir,
        "--filter", filter_str,
        "--resume",
        "--workers", "10",
    ]

    try:
        # The context manager closes the pipe and reaps the child on every
        # exit path, including exceptions raised while streaming.
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr into the same stream
            text=True,
            bufsize=1,  # line-buffered so progress shows up promptly
        ) as process:
            # Print progress in real-time
            for line in process.stdout:
                print(line, end="")
            return_code = process.wait()
    except KeyboardInterrupt:
        print("\n🛑 Download interrupted by user. Run again to resume.")
        sys.exit(1)
    except FileNotFoundError as e:
        # Raised by Popen when the executable itself cannot be located.
        print(f"\n💥 The 'openalex' command was not found on PATH: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"\n💥 An error occurred: {e}")
        sys.exit(1)

    if return_code != 0:
        print(f"\n❌ Download failed with return code {return_code}.")
        print("You can run this script again to resume from the last checkpoint.")
        # Propagate the failure so shells and schedulers can detect it.
        sys.exit(1)

    print("\n✅ Download completed successfully.")
# Start the download only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()