Spaces:

AdityaDevx
/

vulnerability-scanner-api

Running

App Files Files Community

AdityaDevx committed 3 days ago

Commit

dda1f70

1 Parent(s): 5fd1110

Add repo scanning + better logging

Browse files

Files changed (3) hide show

Dockerfile +1 -1
README.md +66 -3
api.py +186 -10

Dockerfile CHANGED Viewed

@@ -9,4 +9,4 @@ COPY api.py .
 EXPOSE 7860
-CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]


9
10	EXPOSE 7860
11
12	+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "info"]

README.md CHANGED Viewed

@@ -1,11 +1,74 @@
 ---
 title: Vulnerability Scanner Api
-emoji: 🏆
 colorFrom: blue
-colorTo: blue
 sdk: docker
 pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Vulnerability Scanner Api
+emoji: 🔒
 colorFrom: blue
+colorTo: red
 sdk: docker
 pinned: false
 license: mit
 ---
+# Vulnerability Scanner API
+AI-powered security vulnerability scanner for GitHub repositories and files.
+## Features
+- 🔍 Single file vulnerability scanning
+- 📦 Full repository scanning (up to 15 files)
+- 🤖 Powered by Groq LLaMA 3.3 70B
+- 🚀 Fast and accurate security analysis
+- 📊 Detailed vulnerability reports with severity levels
+## API Endpoints
+### Health Check
+```bash
+GET /api/health
+```
+### Scan GitHub File or Repository
+```bash
+POST /api/scan
+Content-Type: application/json
+{
+  "url": "https://github.com/owner/repo/blob/main/file.py"
+}
+```
+Or scan an entire repository:
+```bash
+{
+  "url": "https://github.com/owner/repo"
+}
+```
+## Environment Variables
+- `GROQ_API_KEY` - Required for AI analysis
+- `GITHUB_TOKEN` - Optional, for private repos and higher rate limits
+## Usage
+Test the API:
+```bash
+curl -X POST https://adityadevx-vulnerability-scanner-api.hf.space/api/scan \
+  -H "Content-Type: application/json" \
+  -d '{"url": "https://github.com/owner/repo/blob/main/file.py"}'
+```
+## Response Format
+```json
+{
+  "result": "# Security Analysis Report\n\n## Vulnerabilities Found\n..."
+}
+```
+Or, if the scan fails:
+```json
+{
+  "error": "Error message"
+}
+```

api.py CHANGED Viewed

@@ -2,12 +2,30 @@
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import re, os, requests, asyncio, concurrent.futures
 from dotenv import load_dotenv
 from groq import Groq
 load_dotenv()
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -17,10 +35,16 @@ app.add_middleware(
 )
 def parse_github_url(url):
-    m = re.match(r"https://github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+)", url.strip())
     if m:
-        return m.group(1), m.group(2), m.group(3)
-    return None, None, None
 def get_file_content(owner, repo, path):
     token = os.getenv("GITHUB_TOKEN", "")
@@ -34,7 +58,139 @@ def get_file_content(owner, repo, path):
     content = base64.b64decode(data["content"]).decode("utf-8", errors="replace")
     return content, None
 def run_scan(owner, repo, file_path):
     code, err = get_file_content(owner, repo, file_path)
     if err:
         return {"error": err}
@@ -100,21 +256,41 @@ class ScanRequest(BaseModel):
 @app.post("/api/scan")
 async def scan(req: ScanRequest):
-    owner, repo, file_path = parse_github_url(req.url)
-    if not owner or not repo or not file_path:
-        return {"error": "Invalid GitHub file URL. Must contain /blob/."}
-    loop = asyncio.get_running_loop()
-    with concurrent.futures.ThreadPoolExecutor() as pool:
-        result = await loop.run_in_executor(pool, run_scan, owner, repo, file_path)
     return result
 @app.get("/api/health")
 def health():
     return {"status": "ok", "groq_key_set": bool(os.getenv("GROQ_API_KEY"))}
 @app.get("/")
 def root():
     return {"status": "Vulnerability Scanner API running"}

 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import re, os, requests, asyncio, concurrent.futures
+from datetime import datetime
 from dotenv import load_dotenv
 from groq import Groq
+import logging
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
 load_dotenv()
 app = FastAPI()
@app.on_event("startup")
async def startup_event():
    """Log a startup banner and whether each API credential is configured.

    Only the presence of each environment variable is reported (as a
    boolean); the secret values themselves are never written to the log.
    """
    logger.info("===== Application Startup =====")
    for secret_name in ("GROQ_API_KEY", "GITHUB_TOKEN"):
        logger.info(f"{secret_name} set: {bool(os.getenv(secret_name))}")
    logger.info("API is ready to accept requests")

    # Plain stdout banner carrying a wall-clock timestamp, emitted in
    # addition to the log records above.
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"===== Application Startup at {started_at} =====")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
 )
 def parse_github_url(url):
+    url = url.strip().rstrip('/')
+    # File URL: https://github.com/owner/repo/blob/branch/path
+    m = re.match(r"https://github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+)", url)
+    if m:
+        return m.group(1), m.group(2), m.group(3), False
+    # Repo URL: https://github.com/owner/repo
+    m = re.match(r"https://github\.com/([^/]+)/([^/]+)/?$", url)
     if m:
+        return m.group(1), m.group(2), None, True
+    return None, None, None, False
 def get_file_content(owner, repo, path):
     token = os.getenv("GITHUB_TOKEN", "")
     content = base64.b64decode(data["content"]).decode("utf-8", errors="replace")
     return content, None
# File extensions (lower-case, without the dot) that are treated as code.
_CODE_EXTENSIONS = frozenset({
    'py', 'js', 'jsx', 'ts', 'tsx', 'java', 'cpp', 'c',
    'php', 'rb', 'go', 'rs', 'sql', 'sh',
})

# Directory names that are never descended into.
_SKIPPED_DIRS = frozenset({'.git', 'node_modules', '__pycache__', 'dist', 'build'})


def get_repo_files(owner, repo, path="", max_files=20):
    """Recursively collect the paths of code files in a GitHub repository.

    Lists ``path`` through the GitHub "contents" REST endpoint and walks
    sub-directories depth-first, in the order the API returns them, until
    ``max_files`` paths have been gathered.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        path: Directory inside the repository to list; ``""`` is the root.
        max_files: Upper bound on the number of paths returned.

    Returns:
        A list of repository-relative file paths whose extension is in
        ``_CODE_EXTENSIONS``. An empty list is returned when the listing
        fails (non-200 response, network error, unexpected payload); the
        failure is logged rather than raised so a bad sub-directory does not
        abort the whole walk.
    """
    if max_files <= 0:
        return []

    token = os.getenv("GITHUB_TOKEN", "")
    # Ignore the placeholder value shipped in example .env files.
    has_real_token = bool(token) and token != "your_github_personal_access_token_here"
    headers = {"Authorization": f"token {token}"} if has_real_token else {}
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"

    try:
        r = requests.get(url, headers=headers, timeout=15)
        if r.status_code != 200:
            logger.warning(
                "GitHub listing failed for %s/%s at %r: HTTP %s",
                owner, repo, path, r.status_code,
            )
            return []
        items = r.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        logger.warning("GitHub listing failed for %s/%s at %r: %s", owner, repo, path, exc)
        return []

    # The endpoint returns a JSON object (not a list) when `path` is a file.
    if not isinstance(items, list):
        return []

    files = []
    for item in items:
        if len(files) >= max_files:
            break
        if not isinstance(item, dict):
            continue
        name = item.get('name') or ""
        item_path = item.get('path')
        if not item_path:
            continue

        kind = item.get('type')
        if kind == 'file':
            # splitext yields "" for names without a dot, so an
            # extension-less file called "go" or "sh" is not mistaken for code.
            ext = os.path.splitext(name)[1].lstrip('.').lower()
            if ext in _CODE_EXTENSIONS:
                files.append(item_path)
        elif kind == 'dir' and name not in _SKIPPED_DIRS:
            # Only ask the sub-tree for as many files as are still needed.
            files.extend(get_repo_files(owner, repo, item_path, max_files - len(files)))
    return files
def run_repo_scan(owner, repo, max_files=15):
    """Scan a whole repository and build a Markdown security report.

    Lists up to ``max_files`` code files, sends each one (truncated to 4000
    characters) to the Groq chat-completions API, and aggregates the
    per-file findings into a single report.

    Args:
        owner: Repository owner.
        repo: Repository name.
        max_files: Maximum number of files to analyse; kept small so the
            request finishes before the caller times out.

    Returns:
        ``{"result": <markdown report>}`` on success, or ``{"error": <msg>}``
        when no code files are found, the Groq key is missing, or not a
        single file could be analysed. The last case is deliberately an
        error: reporting "no vulnerabilities" after analysing nothing would
        be a false all-clear.
    """
    max_code_chars = 4000

    logger.info(f"Starting repo scan for {owner}/{repo}")
    # Get list of code files
    files = get_repo_files(owner, repo, max_files=max_files)
    if not files:
        return {"error": "No code files found or repo is private"}
    logger.info(f"Found {len(files)} files to scan")

    api_key = os.getenv("GROQ_API_KEY", "")
    if not api_key:
        logger.error("GROQ_API_KEY is not set; repository scan aborted")
        return {"error": "GROQ_API_KEY is not set; cannot analyze repository"}

    # One client is reused for every file instead of being rebuilt per call.
    client = Groq(api_key=api_key)
    targets = files[:max_files]

    # Scan each file and collect results
    all_vulnerabilities = []
    scanned_count = 0
    for index, file_path in enumerate(targets, start=1):
        logger.info(f"Scanning file {index}/{len(targets)}: {file_path}")
        code, err = get_file_content(owner, repo, file_path)
        if err:
            logger.warning(f"Skipping {file_path}: {err}")
            continue
        # Truncate large files
        if len(code) > max_code_chars:
            code = code[:max_code_chars] + "\n... [truncated]"
        # Quick scan for this file
        prompt = f"""Analyze this code file for security vulnerabilities. Be concise.
File: {file_path}
```
{code}
```
List only CRITICAL and HIGH severity vulnerabilities found. Format:
- **[Vulnerability]** in `{file_path}` line X: [brief issue]
If no critical/high issues, respond: "No critical issues found."
"""
        try:
            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=800,
            )
            result = response.choices[0].message.content.strip()
        except Exception as e:
            # Best effort: one failing file must not abort the whole scan.
            logger.error(f"Error scanning {file_path}: {str(e)}")
            continue
        if "no critical issues" not in result.lower():
            all_vulnerabilities.append(f"### {file_path}\n{result}\n")
        scanned_count += 1

    if scanned_count == 0:
        logger.error(f"Repo scan failed: none of {len(targets)} files could be analyzed")
        return {"error": "Could not analyze any files (file download or AI analysis failed)"}

    # Generate final report
    if not all_vulnerabilities:
        report = f"""# Repository Security Scan Report
## Repository Overview
- Repository: {owner}/{repo}
- Files Scanned: {scanned_count}
- Status: ✅ No critical vulnerabilities detected
## Summary
All scanned files passed security checks. No CRITICAL or HIGH severity issues found.
## Risk Summary
- Critical: 0 | High: 0 | Medium: 0 | Low: 0
- **Overall Risk**: LOW
"""
    else:
        # Heuristic: these count the *files* whose findings mention the
        # keyword, not individual vulnerabilities.
        critical_files = sum(1 for v in all_vulnerabilities if 'CRITICAL' in v)
        high_files = sum(1 for v in all_vulnerabilities if 'HIGH' in v)
        overall_risk = 'CRITICAL' if critical_files else 'HIGH'
        report = f"""# Repository Security Scan Report
## Repository Overview
- Repository: {owner}/{repo}
- Files Scanned: {scanned_count}
- Vulnerabilities Found: {len(all_vulnerabilities)} files with issues
## Vulnerabilities by File
{''.join(all_vulnerabilities)}
## Recommendations
1. Review and fix all CRITICAL and HIGH severity issues immediately
2. Implement input validation and sanitization
3. Use parameterized queries for database operations
4. Keep dependencies updated
## Risk Summary
- Critical: {critical_files} | High: {high_files} | Medium: 0 | Low: 0
- **Overall Risk**: {overall_risk}
"""
    logger.info(f"Repo scan completed: {scanned_count} files scanned")
    return {"result": report}
 def run_scan(owner, repo, file_path):
+    """Scan single file"""
     code, err = get_file_content(owner, repo, file_path)
     if err:
         return {"error": err}
 @app.post("/api/scan")
 async def scan(req: ScanRequest):
+    logger.info(f"Received scan request for URL: {req.url}")
+    owner, repo, file_path, is_repo = parse_github_url(req.url)
+    if not owner or not repo:
+        logger.warning(f"Invalid URL format: {req.url}")
+        return {"error": "Invalid GitHub URL. Use github.com/owner/repo or .../blob/branch/file"}
+    if is_repo:
+        # Full repository scan
+        logger.info(f"Starting repository scan: {owner}/{repo}")
+        loop = asyncio.get_running_loop()
+        with concurrent.futures.ThreadPoolExecutor() as pool:
+            result = await loop.run_in_executor(pool, run_repo_scan, owner, repo)
+    else:
+        # Single file scan
+        logger.info(f"Scanning file: {owner}/{repo}/{file_path}")
+        loop = asyncio.get_running_loop()
+        with concurrent.futures.ThreadPoolExecutor() as pool:
+            result = await loop.run_in_executor(pool, run_scan, owner, repo, file_path)
+    if "error" in result:
+        logger.error(f"Scan failed: {result['error']}")
+    else:
+        logger.info("Scan completed successfully")
     return result
 @app.get("/api/health")
 def health():
+    logger.info("Health check requested")
     return {"status": "ok", "groq_key_set": bool(os.getenv("GROQ_API_KEY"))}
 @app.get("/")
 def root():
+    logger.info("Root endpoint accessed")
     return {"status": "Vulnerability Scanner API running"}