AdityaDevx committed on
Commit
dda1f70
·
1 Parent(s): 5fd1110

Add repo scanning + better logging

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. README.md +66 -3
  3. api.py +186 -10
Dockerfile CHANGED
@@ -9,4 +9,4 @@ COPY api.py .
9
 
10
  EXPOSE 7860
11
 
12
- CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
 
9
 
10
  EXPOSE 7860
11
 
12
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "info"]
README.md CHANGED
@@ -1,11 +1,74 @@
1
  ---
2
  title: Vulnerability Scanner Api
3
- emoji: 🏆
4
  colorFrom: blue
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Vulnerability Scanner Api
3
+ emoji: 🔒
4
  colorFrom: blue
5
+ colorTo: red
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
  ---
10
 
11
+ # Vulnerability Scanner API
12
+
13
+ AI-powered security vulnerability scanner for GitHub repositories and files.
14
+
15
+ ## Features
16
+
17
+ - 🔍 Single file vulnerability scanning
18
+ - 📦 Full repository scanning (up to 15 files)
19
+ - 🤖 Powered by Groq LLaMA 3.3 70B
20
+ - 🚀 Fast and accurate security analysis
21
+ - 📊 Detailed vulnerability reports with severity levels
22
+
23
+ ## API Endpoints
24
+
25
+ ### Health Check
26
+ ```bash
27
+ GET /api/health
28
+ ```
29
+
30
+ ### Scan GitHub File or Repository
31
+ ```bash
32
+ POST /api/scan
33
+ Content-Type: application/json
34
+
35
+ {
36
+ "url": "https://github.com/owner/repo/blob/main/file.py"
37
+ }
38
+ ```
39
+
40
+ Or scan entire repository:
41
+ ```bash
42
+ {
43
+ "url": "https://github.com/owner/repo"
44
+ }
45
+ ```
46
+
47
+ ## Environment Variables
48
+
49
+ - `GROQ_API_KEY` - Required for AI analysis
50
+ - `GITHUB_TOKEN` - Optional, for private repos and higher rate limits
51
+
52
+ ## Usage
53
+
54
+ Test the API:
55
+ ```bash
56
+ curl -X POST https://adityadevx-vulnerability-scanner-api.hf.space/api/scan \
57
+ -H "Content-Type: application/json" \
58
+ -d '{"url": "https://github.com/owner/repo/blob/main/file.py"}'
59
+ ```
60
+
61
+ ## Response Format
62
+
63
+ ```json
64
+ {
65
+ "result": "# Security Analysis Report\n\n## Vulnerabilities Found\n..."
66
+ }
67
+ ```
68
+
69
+ Or in case of error:
70
+ ```json
71
+ {
72
+ "error": "Error message"
73
+ }
74
+ ```
api.py CHANGED
@@ -2,12 +2,30 @@
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  import re, os, requests, asyncio, concurrent.futures
 
5
  from dotenv import load_dotenv
6
  from groq import Groq
 
 
 
 
 
 
 
 
7
 
8
  load_dotenv()
9
 
10
  app = FastAPI()
 
 
 
 
 
 
 
 
 
11
  app.add_middleware(
12
  CORSMiddleware,
13
  allow_origins=["*"],
@@ -17,10 +35,16 @@ app.add_middleware(
17
  )
18
 
19
  def parse_github_url(url):
20
- m = re.match(r"https://github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+)", url.strip())
 
 
 
 
 
 
21
  if m:
22
- return m.group(1), m.group(2), m.group(3)
23
- return None, None, None
24
 
25
  def get_file_content(owner, repo, path):
26
  token = os.getenv("GITHUB_TOKEN", "")
@@ -34,7 +58,139 @@ def get_file_content(owner, repo, path):
34
  content = base64.b64decode(data["content"]).decode("utf-8", errors="replace")
35
  return content, None
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def run_scan(owner, repo, file_path):
 
38
  code, err = get_file_content(owner, repo, file_path)
39
  if err:
40
  return {"error": err}
@@ -100,21 +256,41 @@ class ScanRequest(BaseModel):
100
 
101
  @app.post("/api/scan")
102
  async def scan(req: ScanRequest):
103
- owner, repo, file_path = parse_github_url(req.url)
104
- if not owner or not repo or not file_path:
105
- return {"error": "Invalid GitHub file URL. Must contain /blob/."}
106
- loop = asyncio.get_running_loop()
107
- with concurrent.futures.ThreadPoolExecutor() as pool:
108
- result = await loop.run_in_executor(pool, run_scan, owner, repo, file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  return result
110
 
111
 
112
  @app.get("/api/health")
113
  def health():
 
114
  return {"status": "ok", "groq_key_set": bool(os.getenv("GROQ_API_KEY"))}
115
 
116
 
117
  @app.get("/")
118
  def root():
 
119
  return {"status": "Vulnerability Scanner API running"}
120
-
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  import re, os, requests, asyncio, concurrent.futures
5
+ from datetime import datetime
6
  from dotenv import load_dotenv
7
  from groq import Groq
8
+ import logging
9
+
10
+ # Configure logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(levelname)s - %(message)s'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
 
17
  load_dotenv()
18
 
19
  app = FastAPI()
20
+
21
@app.on_event("startup")
async def startup_event():
    """Emit the startup banner and report which credentials are configured.

    Only the presence of each environment variable is logged (as a boolean),
    never its value.
    """
    logger.info("===== Application Startup =====")
    for env_name in ("GROQ_API_KEY", "GITHUB_TOKEN"):
        logger.info(f"{env_name} set: {bool(os.getenv(env_name))}")
    logger.info("API is ready to accept requests")

    # The banner is also printed directly, with a wall-clock timestamp.
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"===== Application Startup at {timestamp} =====")
28
+
29
  app.add_middleware(
30
  CORSMiddleware,
31
  allow_origins=["*"],
 
35
  )
36
 
37
  def parse_github_url(url):
38
+ url = url.strip().rstrip('/')
39
+ # File URL: https://github.com/owner/repo/blob/branch/path
40
+ m = re.match(r"https://github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+)", url)
41
+ if m:
42
+ return m.group(1), m.group(2), m.group(3), False
43
+ # Repo URL: https://github.com/owner/repo
44
+ m = re.match(r"https://github\.com/([^/]+)/([^/]+)/?$", url)
45
  if m:
46
+ return m.group(1), m.group(2), None, True
47
+ return None, None, None, False
48
 
49
  def get_file_content(owner, repo, path):
50
  token = os.getenv("GITHUB_TOKEN", "")
 
58
  content = base64.b64decode(data["content"]).decode("utf-8", errors="replace")
59
  return content, None
60
 
61
# File extensions (lower-case, including the dot) that count as scannable code.
_CODE_EXTENSIONS = frozenset({
    '.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.cpp', '.c',
    '.php', '.rb', '.go', '.rs', '.sql', '.sh',
})

# Directory names that are never descended into.
_SKIPPED_DIRS = frozenset({'.git', 'node_modules', '__pycache__', 'dist', 'build'})


def get_repo_files(owner, repo, path="", max_files=20):
    """Recursively collect paths of code files via the GitHub contents API.

    Args:
        owner: Repository owner (user or organization name).
        repo: Repository name.
        path: Directory inside the repository to list; "" means the root.
        max_files: Upper bound on the number of paths returned.

    Returns:
        A list of at most ``max_files`` repository-relative file paths whose
        extension is in ``_CODE_EXTENSIONS``, in depth-first listing order.
        An empty list is returned when the listing cannot be fetched (network
        error, non-200 status, unexpected payload); the reason is logged.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    if max_files <= 0:
        return []

    token = os.getenv("GITHUB_TOKEN", "")
    # The placeholder value from the example configuration is treated as "no token".
    headers = {"Authorization": f"token {token}"} if (token and token != "your_github_personal_access_token_here") else {}
    # Escape the path so characters such as '#', '?' or spaces cannot be
    # misread as URL syntax ('/' is left intact by quote()'s default).
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{quote(path)}"

    try:
        r = requests.get(url, headers=headers, timeout=15)
        if r.status_code != 200:
            logger.warning(f"GitHub listing failed for {owner}/{repo}/{path}: HTTP {r.status_code}")
            return []
        items = r.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure or a body that is not valid JSON.
        logger.warning(f"GitHub listing failed for {owner}/{repo}/{path}: {e}")
        return []

    # A directory listing is a JSON array; a single object means `path`
    # pointed at a file (or something else that cannot be iterated as entries).
    if not isinstance(items, list):
        return []

    files = []
    for item in items:
        if len(files) >= max_files:
            break
        if not isinstance(item, dict):
            continue

        item_type = item.get('type')
        name = item.get('name') or ''
        item_path = item.get('path')
        if not item_path:
            continue

        if item_type == 'file':
            # splitext() yields '' for extensionless names and dotfiles, so a
            # file literally called "go" or ".sh" is not mistaken for code.
            if os.path.splitext(name)[1].lower() in _CODE_EXTENSIONS:
                files.append(item_path)
        elif item_type == 'dir' and name not in _SKIPPED_DIRS:
            # Hand the remaining budget down so the total never exceeds max_files.
            files.extend(get_repo_files(owner, repo, item_path, max_files - len(files)))

    return files
91
+
92
def run_repo_scan(owner, repo):
    """Scan a repository's code files and build a Markdown security report.

    Up to 15 code files are listed with ``get_repo_files``; each is fetched,
    truncated to 4000 characters, and sent to the Groq chat model with a
    prompt asking for CRITICAL/HIGH issues only. Files that cannot be fetched
    or analyzed are logged and skipped.

    Args:
        owner: Repository owner.
        repo: Repository name.

    Returns:
        ``{"result": <markdown report>}`` when at least one file was analyzed,
        otherwise ``{"error": <message>}`` (no files found, or every file
        failed), so a total failure is never reported as a clean scan.
    """
    logger.info(f"Starting repo scan for {owner}/{repo}")

    # Limit to 15 files to avoid timeout; asking for exactly that many also
    # avoids listing files that would never be scanned.
    max_scan_files = 15
    files = get_repo_files(owner, repo, max_files=max_scan_files)[:max_scan_files]
    if not files:
        return {"error": "No code files found or repo is private"}

    logger.info(f"Found {len(files)} files to scan")

    # One client is enough for the whole scan.
    client = Groq(api_key=os.getenv("GROQ_API_KEY", ""))

    all_vulnerabilities = []
    scanned_count = 0

    for index, file_path in enumerate(files, start=1):
        logger.info(f"Scanning file {index}/{len(files)}: {file_path}")
        code, err = get_file_content(owner, repo, file_path)

        if err:
            logger.warning(f"Skipping {file_path}: {err}")
            continue

        # Truncate large files
        if len(code) > 4000:
            code = code[:4000] + "\n... [truncated]"

        prompt = f"""Analyze this code file for security vulnerabilities. Be concise.

File: {file_path}

```
{code}
```

List only CRITICAL and HIGH severity vulnerabilities found. Format:
- **[Vulnerability]** in `{file_path}` line X: [brief issue]

If no critical/high issues, respond: "No critical issues found."
"""

        try:
            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=800,
            )
            result = response.choices[0].message.content.strip()
        except Exception as e:
            # Best effort: one failing file must not abort the whole scan.
            logger.error(f"Error scanning {file_path}: {str(e)}")
            continue

        if "no critical issues" not in result.lower():
            all_vulnerabilities.append(f"### {file_path}\n{result}\n")
        scanned_count += 1

    # Nothing was actually analyzed: reporting "no vulnerabilities" here would
    # be a false all-clear, so surface the failure instead.
    if scanned_count == 0:
        return {"error": "Could not analyze any files: fetching or AI analysis failed for every file"}

    # Generate final report
    if not all_vulnerabilities:
        report = f"""# Repository Security Scan Report

## Repository Overview
- Repository: {owner}/{repo}
- Files Scanned: {scanned_count}
- Status: ✅ No critical vulnerabilities detected

## Summary
All scanned files passed security checks. No CRITICAL or HIGH severity issues found.

## Risk Summary
- Critical: 0 | High: 0 | Medium: 0 | Low: 0
- **Overall Risk**: LOW
"""
    else:
        # These are counts of per-file findings sections that mention the
        # severity keyword, not counts of individual vulnerabilities.
        critical_count = sum(1 for v in all_vulnerabilities if 'CRITICAL' in v)
        high_count = sum(1 for v in all_vulnerabilities if 'HIGH' in v)
        overall_risk = 'CRITICAL' if critical_count else 'HIGH'
        findings = ''.join(all_vulnerabilities)

        report = f"""# Repository Security Scan Report

## Repository Overview
- Repository: {owner}/{repo}
- Files Scanned: {scanned_count}
- Vulnerabilities Found: {len(all_vulnerabilities)} files with issues

## Vulnerabilities by File

{findings}

## Recommendations
1. Review and fix all CRITICAL and HIGH severity issues immediately
2. Implement input validation and sanitization
3. Use parameterized queries for database operations
4. Keep dependencies updated

## Risk Summary
- Critical: {critical_count} | High: {high_count} | Medium: 0 | Low: 0
- **Overall Risk**: {overall_risk}
"""

    logger.info(f"Repo scan completed: {scanned_count} files scanned")
    return {"result": report}
191
+
192
  def run_scan(owner, repo, file_path):
193
+ """Scan single file"""
194
  code, err = get_file_content(owner, repo, file_path)
195
  if err:
196
  return {"error": err}
 
256
 
257
  @app.post("/api/scan")
258
  async def scan(req: ScanRequest):
259
+ logger.info(f"Received scan request for URL: {req.url}")
260
+ owner, repo, file_path, is_repo = parse_github_url(req.url)
261
+
262
+ if not owner or not repo:
263
+ logger.warning(f"Invalid URL format: {req.url}")
264
+ return {"error": "Invalid GitHub URL. Use github.com/owner/repo or .../blob/branch/file"}
265
+
266
+ if is_repo:
267
+ # Full repository scan
268
+ logger.info(f"Starting repository scan: {owner}/{repo}")
269
+ loop = asyncio.get_running_loop()
270
+ with concurrent.futures.ThreadPoolExecutor() as pool:
271
+ result = await loop.run_in_executor(pool, run_repo_scan, owner, repo)
272
+ else:
273
+ # Single file scan
274
+ logger.info(f"Scanning file: {owner}/{repo}/{file_path}")
275
+ loop = asyncio.get_running_loop()
276
+ with concurrent.futures.ThreadPoolExecutor() as pool:
277
+ result = await loop.run_in_executor(pool, run_scan, owner, repo, file_path)
278
+
279
+ if "error" in result:
280
+ logger.error(f"Scan failed: {result['error']}")
281
+ else:
282
+ logger.info("Scan completed successfully")
283
+
284
  return result
285
 
286
 
287
  @app.get("/api/health")
288
  def health():
289
+ logger.info("Health check requested")
290
  return {"status": "ok", "groq_key_set": bool(os.getenv("GROQ_API_KEY"))}
291
 
292
 
293
  @app.get("/")
294
  def root():
295
+ logger.info("Root endpoint accessed")
296
  return {"status": "Vulnerability Scanner API running"}