"""Download code snippets from Rosetta Code task pages and save them as text files."""

import os
import re
from typing import List, Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://rosettacode.org"

# Seconds to wait for the server; without a timeout requests.get can block forever.
REQUEST_TIMEOUT = 30

# Number of tasks processed when the module is run as a script.
MAX_TASKS = 10

# Characters replaced by sanitize_filename; compiled once because it is applied per task.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')


def sanitize_filename(filename: str) -> str:
    """Replace each of the characters ``< > : " / \\ | ? *`` in *filename* with ``_``."""
    return _INVALID_FILENAME_CHARS.sub('_', filename)


def _fetch_soup(url: str, timeout: float = REQUEST_TIMEOUT) -> Optional[BeautifulSoup]:
    """Download *url* and return its parsed HTML, or ``None`` on any failure.

    Failures (connection errors, timeouts, non-200 responses) are reported on
    stdout rather than raised, so one bad page does not abort the whole run.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"[ERROR] Failed to fetch {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"[ERROR] Failed to fetch {url}. Status code: {response.status_code}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def fetch_task_links(category_url: str, timeout: float = REQUEST_TIMEOUT) -> List[Tuple[str, str]]:
    """Fetch all task links from Rosetta Code's category page.

    Args:
        category_url: URL of the category page to scan.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of ``(absolute_url, link_text)`` tuples, one per anchor found
        under ``.mw-category-group ul li``. Empty if the page could not be
        fetched.
    """
    soup = _fetch_soup(category_url, timeout)
    if soup is None:
        return []
    # Selecting a[href] skips anchors without a target, which would otherwise
    # raise KeyError on link['href'].
    anchors = soup.select('.mw-category-group ul li a[href]')
    return [(urljoin(BASE_URL, anchor['href']), anchor.text) for anchor in anchors]


def fetch_code_from_task(task_url: str, timeout: float = REQUEST_TIMEOUT) -> List[str]:
    """Fetch code snippets from a specific task on Rosetta Code.

    Args:
        task_url: URL of the task page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The text of every ``<pre>`` element on the page, in document order.
        Empty if the page could not be fetched.
    """
    soup = _fetch_soup(task_url, timeout)
    if soup is None:
        return []
    return [block.text for block in soup.find_all('pre')]


def save_safe_codes(task_name: str, codes: List[str], save_dir: str) -> None:
    """Save each snippet in *codes* to its own UTF-8 text file in *save_dir*.

    Files are named ``<sanitized task name>_<n>.txt`` with ``n`` starting at 1.
    A failure to write one file is reported on stdout and does not stop the
    remaining files from being written.

    Args:
        task_name: Task title; sanitized before being used in file names.
        codes: Snippets to write.
        save_dir: Target directory, created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)
    safe_name = sanitize_filename(task_name)

    for index, code in enumerate(codes, start=1):
        filepath = os.path.join(save_dir, f"{safe_name}_{index}.txt")
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(code)
        except OSError as e:
            print(f"[ERROR] Could not save file {filepath}: {e}")
        else:
            print(f"[SUCCESS] Saved: {filepath}")


def main() -> None:
    """Fetch the task list and save the snippets of the first MAX_TASKS tasks."""
    category_url = "https://rosettacode.org/wiki/Category:Programming_Tasks"
    save_directory = "safe-code-analyzer/safe_codes"

    # Fetch tasks
    tasks = fetch_task_links(category_url)
    print(f"[INFO] Found {len(tasks)} tasks on Rosetta Code.")

    # Fetch and save codes; adjust MAX_TASKS to change how many tasks are processed.
    for task_url, task_name in tasks[:MAX_TASKS]:
        print(f"[INFO] Fetching codes for task: {task_name}")
        codes = fetch_code_from_task(task_url)
        save_safe_codes(task_name, codes, save_directory)


if __name__ == "__main__":
    main()