vuln-code-analysis / scripts /fetch_rosetta_code.py
lilbool's picture
Upload 212 files
497f2f3 verified
import os
import re
import requests
from bs4 import BeautifulSoup
def sanitize_filename(filename):
    """Return *filename* with characters that are invalid in file names replaced.

    Every occurrence of <, >, :, ", /, backslash, |, ? or * is swapped for an
    underscore; all other characters are left untouched.
    """
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    return invalid_chars.sub('_', filename)
def fetch_task_links(category_url, timeout=10):
    """Fetch all task links from Rosetta Code's category page.

    Only the single page at ``category_url`` is requested; no follow-up
    pages are fetched.

    Args:
        category_url: URL of the category page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(task_url, task_name)`` tuples, one per anchor matched by
        the ``.mw-category-group ul li a`` selector that carries an ``href``.
        An empty list is returned when the request fails or the response
        status is not 200.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled server.
        response = requests.get(category_url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat connection-level failures like a bad status: report and
        # return nothing instead of crashing the caller.
        print(f"[ERROR] Failed to fetch {category_url}: {exc}")
        return []
    if response.status_code != 200:
        print(f"[ERROR] Failed to fetch {category_url}. Status code: {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.select('.mw-category-group ul li a')
    # Anchors without an href cannot be turned into a task URL; skip them
    # rather than raising KeyError.
    return [
        ("https://rosettacode.org" + link['href'], link.text)
        for link in links
        if link.get('href')
    ]
def fetch_code_from_task(task_url, timeout=10):
    """Fetch code snippets from a specific task on Rosetta Code.

    Args:
        task_url: URL of the task page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the text content of every ``<pre>`` element on the page,
        in document order. An empty list is returned when the request fails
        or the response status is not 200.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled server.
        response = requests.get(task_url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat connection-level failures like a bad status: report and
        # return nothing instead of crashing the caller.
        print(f"[ERROR] Failed to fetch {task_url}: {exc}")
        return []
    if response.status_code != 200:
        print(f"[ERROR] Failed to fetch {task_url}. Status code: {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    code_blocks = soup.find_all('pre')
    return [code.text for code in code_blocks]
def save_safe_codes(task_name, codes, save_dir):
    """Write each code snippet to its own numbered text file.

    Args:
        task_name: Task title; passed through ``sanitize_filename`` before
            being used as the file name prefix.
        codes: Iterable of snippet strings to write.
        save_dir: Target directory; created if it does not exist.

    Files are named ``<task_name>_<n>.txt`` with ``n`` starting at 1 and are
    written as UTF-8. A failure on one file is reported and does not stop the
    remaining files from being written.
    """
    os.makedirs(save_dir, exist_ok=True)
    safe_name = sanitize_filename(task_name)  # Sanitize the task name
    for index, snippet in enumerate(codes, start=1):
        filepath = os.path.join(save_dir, f"{safe_name}_{index}.txt")
        try:
            with open(filepath, 'w', encoding='utf-8') as out_file:
                out_file.write(snippet)
            print(f"[SUCCESS] Saved: {filepath}")
        except Exception as e:
            print(f"[ERROR] Could not save file {filepath}: {e}")
def main():
    """Download snippets for the first ten Rosetta Code programming tasks.

    Lists the tasks on the Programming_Tasks category page, then fetches each
    selected task page and stores its snippets under
    ``safe-code-analyzer/safe_codes``.
    """
    category_url = "https://rosettacode.org/wiki/Category:Programming_Tasks"
    save_directory = "safe-code-analyzer/safe_codes"

    # Fetch tasks
    tasks = fetch_task_links(category_url)
    print(f"[INFO] Found {len(tasks)} tasks on Rosetta Code.")

    # Fetch and save codes. Adjust the slice to change how many tasks are
    # processed.
    selected_tasks = tasks[:10]
    for task_url, task_name in selected_tasks:
        print(f"[INFO] Fetching codes for task: {task_name}")
        snippets = fetch_code_from_task(task_url)
        save_safe_codes(task_name, snippets, save_directory)


if __name__ == "__main__":
    main()