"""Gradio app that asks an LLM agent to review one GitHub file for vulnerabilities.

The agent is given the tools exposed by a remote MCP server plus a local
``visit_webpage`` tool, and is prompted to produce a Markdown security report.
"""

import re

import gradio as gr
import requests
from markdownify import markdownify
from requests.exceptions import RequestException
from smolagents import (
    CodeAgent,
    ToolCallingAgent,
    InferenceClientModel,
    WebSearchTool,
    MCPClient,
    tool,
)

# MCP Server URL for GitHub tools
MCP_SERVER_URL = "https://baction-vulnerability-scanner-server.hf.space/gradio_api/mcp/"

# Compiled once at import time; both patterns only accept https://github.com URLs.
_GITHUB_REPO_RE = re.compile(r'https://github\.com/([^/]+)/([^/]+)/?$')
# NOTE: the ref segment is matched with [^/]+, so branch names containing "/"
# are not supported by this pattern.
_GITHUB_FILE_RE = re.compile(r'https://github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+)$')


@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its content as a markdown string.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The content of the webpage converted to Markdown, or an error message if the request fails.
    """
    try:
        # Add user agent to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Send a GET request to the URL
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Convert the HTML content to Markdown
        markdown_content = markdownify(response.text).strip()

        # Collapse runs of three or more newlines into a single blank line
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)

        # Limit content length to avoid overwhelming the AI
        if len(markdown_content) > 5000:
            markdown_content = markdown_content[:5000] + "\n\n[Content truncated due to length...]"

        return markdown_content

    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"An unexpected error occurred: {str(e)}"


def parse_github_url(url):
    """Parse a GitHub URL into its owner, repository and file path.

    Args:
        url: A ``https://github.com/<owner>/<repo>`` repository URL or a
            ``https://github.com/<owner>/<repo>/blob/<ref>/<path>`` file URL.
            Surrounding whitespace, a query string (``?plain=1``) and a
            fragment (``#L10-L20``) are ignored.

    Returns:
        A ``(owner, repo, file_path)`` tuple. ``file_path`` is ``None`` for a
        repository URL. All three are ``None`` when the URL matches neither
        form or is not a string.
    """
    if not isinstance(url, str):
        return None, None, None

    # Drop the query string / fragment so that e.g. "#L10" copied from the
    # browser does not end up inside the extracted file path.
    cleaned = re.split(r'[?#]', url.strip(), maxsplit=1)[0]

    # Handle repository URLs
    repo_match = _GITHUB_REPO_RE.match(cleaned)
    if repo_match:
        return repo_match.group(1), repo_match.group(2), None

    # Handle file URLs
    file_match = _GITHUB_FILE_RE.match(cleaned)
    if file_match:
        return file_match.group(1), file_match.group(2), file_match.group(3)

    return None, None, None


def _build_analysis_prompt(message, owner, repo, file_path):
    """Return the instruction prompt given to the agent for one file."""
    # This is an f-string: literal braces must be doubled and backslashes
    # escaped, otherwise the regex shown to the agent is rendered as
    # "CVE-\d4-\d+" instead of "CVE-\d{4}-\d+".
    # NOTE(review): step 3 names NVD while the critical instructions name
    # CVE Details; the wording is kept as written -- confirm which is intended.
    return f"""
You are a cybersecurity expert. Analyze this GitHub file for security vulnerabilities.

GitHub URL: {message}
Repository: {owner}/{repo}
File Path: {file_path}

**ANALYSIS STEPS:**

1. **Get File Data**:
   - Use get_repository_info with owner="{owner}", repo="{repo}"
   - Use get_file_content with owner="{owner}", repo="{repo}", path="{file_path}"

2. **Find Vulnerabilities**: Analyze code for:
   - SQL injection patterns
   - Command injection (eval, exec, os.system)
   - XSS vulnerabilities
   - Path traversal
   - Hardcoded secrets
   - Input validation issues

3. **CVE Research**:
   - Search for CVEs: simple_cve_search("SQL injection", 3)
   - Extract CVE IDs from the string result using regex
   - Visit NVD for the first CVE: visit_webpage("https://nvd.nist.gov/vuln/detail/CVE-XXXX-XXXX")
   - Include the full NVD webpage content in your report

4. **Generate Report**:

# 🛡️ Security Analysis Report

## 🔍 File Overview
- **Path**: {file_path}
- **Repository**: {owner}/{repo}

## 🚨 Vulnerabilities Found
[List vulnerabilities with line numbers]

## 📊 CVE Research
**Top Related CVE**: [First CVE ID from regex extraction]

**CVE Details Webpage Content**:
[Complete content from visit_webpage call]

**Key Details from CVE Details**:
[CVSS score, attack vector, impact extracted from webpage]

## ⚠️ Other Possible CVEs
[Show other 2 CVE IDs from search]

## 🛠️ Remediation
[Specific fixes]

## ⚠️ Disclaimer
AI analysis may not be 100% accurate. Manual security review recommended.

**REMEMBER**: Always call visit_webpage for the first CVE ID to get detailed CVE information!

**CRITICAL INSTRUCTIONS**:
- simple_cve_search returns a STRING with CVE IDs and descriptions
- Extract CVE IDs using: re.findall(r'CVE-\\d{{4}}-\\d+', cve_search_string)
- TRY to visit CVE Details webpage for the first CVE ID found (more reliable than NVD)
- Use this exact pattern:
  1. Call simple_cve_search("SQL injection", 3)
  2. Extract CVE IDs with regex from the returned string
  3. Take the first CVE ID from the list
  4. Call visit_webpage("https://www.cvedetails.com/cve/CVE-YYYY-NNNNN/") with the EXACT CVE ID (keep hyphens)
  5. If webpage fails (403 error), continue with analysis using CVE search results only
- Keep variable names simple and avoid complex operations
- ALWAYS use keyword arguments for MCP tools (e.g., owner="user", repo="repo", path="file.py")
- NOTE: CVE format is standard CVE-YYYY-NNNNN (like CVE-2024-54762)
- Example: If you get "CVE-2024-54762", visit "https://www.cvedetails.com/cve/CVE-2024-54762/"
- DO NOT remove hyphens from CVE IDs when visiting CVE Details URLs
- If CVE Details access fails, use the CVE descriptions from simple_cve_search results
"""


def analyze_vulnerabilities_multiagent(message, history, hf_token):
    """Run the vulnerability analysis agent on one GitHub file URL.

    Despite the name, a single ``CodeAgent`` is used; it receives the tools
    returned by the MCP server plus ``visit_webpage``.

    Args:
        message: The chat message, expected to be a GitHub file URL.
        history: Chat history supplied by the chat interface (unused).
        hf_token: Hugging Face API token passed to ``InferenceClientModel``.

    Returns:
        The agent's result as a string, or a user-facing error message. This
        function does not raise: failures are reported in the returned text.
    """
    # Validate HF token input (also covers None, not just blank strings)
    if not hf_token or not hf_token.strip():
        return "❌ Please provide a Hugging Face API key. Get one from [Hugging Face](https://huggingface.co/settings/tokens)"

    try:
        # Parse the GitHub URL
        owner, repo, file_path = parse_github_url(message)

        if not owner or not repo:
            return "❌ Invalid GitHub URL. Please provide a valid GitHub repository or file URL."

        if not file_path:
            return "❌ Please provide a specific file URL for analysis. Repository-wide analysis is not supported in multi-agent mode."

        # Connect to MCP server for GitHub tools
        mcp_client = MCPClient({
            "url": MCP_SERVER_URL,
            "timeout": 120,
        })

        try:
            github_tools = mcp_client.get_tools()

            # Initialize AI model
            model = InferenceClientModel(token=hf_token.strip())

            # Single agent with all tools instead of multi-agent, to avoid
            # tool_choice issues
            agent = CodeAgent(
                tools=github_tools + [visit_webpage],
                model=model,
                additional_authorized_imports=["re", "requests"],
                max_steps=12,
            )

            # Run the agent analysis
            result = agent.run(
                _build_analysis_prompt(message, owner, repo, file_path)
            )
        finally:
            # Always release the MCP connection, including when the agent fails.
            mcp_client.disconnect()

        return str(result)

    except Exception as e:
        # UI boundary: surface any failure as a chat message.
        return f"❌ Error in multi-agent analysis: {str(e)}\n\nPlease ensure:\n• Valid GitHub file URL (not repository URL)\n• Hugging Face token is correct\n• File is accessible"


# Gradio UI
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("## 🛡️ Enhanced GitHub Vulnerability Scanner")

    gr.Markdown("""
    **Advanced Security Analysis with Web Scraping**

    This intelligent vulnerability scanner uses AI agents with web scraping capabilities to perform comprehensive security analysis of GitHub files.

    **Key Features:**
    - **🤖 AI Agent System**: Single agent with multiple tools for efficient analysis
    - **🌐 Web Scraping**: Automatically visits NVD webpages to get detailed CVE information
    - **📊 CVE Database Integration**: Searches CVE knowledge base and gets top 3 matches
    - **🔍 Smart Analysis**: AI-generated vulnerability descriptions (not hardcoded)
    - **📋 Detailed Reports**: Comprehensive reports with NVD data and remediation advice
    - **⚠️ Accuracy Disclaimer**: Shows alternative CVEs and warns about AI limitations

    **Project Links:**
    - 📂 **Source Code**: [GitHub Repository](https://github.com/banno-0720/vulnerability-scanner)
    - 🔧 **MCP Server**: [Hugging Face Space](https://huggingface.co/spaces/HimanshuGoyal2004/github-mcp-server)

    ⚠️ **Important Notice**: This tool is designed for legitimate security research and vulnerability assessment purposes only. Do not use this scanner for malicious activities, unauthorized access, or any illegal purposes. Always ensure you have proper authorization before scanning repositories that don't belong to you.
    """)

    gr.Markdown("---")

    # API Configuration Section
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🔑 API Configuration")
            hf_token_box = gr.Textbox(
                label="🤗 Hugging Face API Key",
                placeholder="Enter your Hugging Face API key for AI model access",
                type="password",
                info="🔗 Get your free key: https://huggingface.co/settings/tokens",
            )

    gr.Markdown("---")

    gr.Markdown("### 💬 Enhanced Security Analysis")
    gr.Markdown("Paste a GitHub **FILE URL** (not repository URL) below to start the enhanced security analysis.")

    # Chatbot Interface: the handler is called with the message, the history
    # and the value of each additional input (here, the token box).
    chatbot = gr.ChatInterface(
        fn=analyze_vulnerabilities_multiagent,
        additional_inputs=[hf_token_box],
        type="messages",
        examples=[
            ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/database/schema.sql", ""],
            ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/python/database.py", ""],
            ["https://github.com/banno-0720/documentation-agent/blob/main/code.py", ""],
        ],
    )

if __name__ == "__main__":
    demo.launch(server_port=7860)  # Different port to avoid conflict with server