Improve security on the upload-url route
Browse files- backend/pyproject.toml +1 -0
- backend/routes/upload.py +85 -10
backend/pyproject.toml
CHANGED
|
@@ -27,6 +27,7 @@ dependencies = [
|
|
| 27 |
"beautifulsoup4>=4.12.0",
|
| 28 |
"evaluate>=0.4.0",
|
| 29 |
"requests>=2.31.0",
|
|
|
|
| 30 |
]
|
| 31 |
|
| 32 |
[build-system]
|
|
|
|
| 27 |
"beautifulsoup4>=4.12.0",
|
| 28 |
"evaluate>=0.4.0",
|
| 29 |
"requests>=2.31.0",
|
| 30 |
+
"validators>=0.34.0",
|
| 31 |
]
|
| 32 |
|
| 33 |
[build-system]
|
backend/routes/upload.py
CHANGED
|
@@ -6,7 +6,11 @@ from bs4 import BeautifulSoup
|
|
| 6 |
from PyPDF2 import PdfReader
|
| 7 |
import requests
|
| 8 |
from fastapi import Form
|
| 9 |
-
from typing import Optional
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
router = APIRouter(tags=["files"])
|
| 12 |
|
|
@@ -20,6 +24,14 @@ os.makedirs(UPLOAD_ROOT, exist_ok=True)
|
|
| 20 |
# Minimum length for any file (in characters)
|
| 21 |
MIN_FILE_LENGTH = 500
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def validate_pdf(file_path: str) -> bool:
|
| 24 |
"""Validate if file is a valid PDF."""
|
| 25 |
try:
|
|
@@ -218,16 +230,76 @@ async def upload_url(url: str = Form(...)):
|
|
| 218 |
Dictionary with status and session_id
|
| 219 |
"""
|
| 220 |
try:
|
| 221 |
-
#
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
#
|
| 229 |
-
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
# Extract the text
|
| 233 |
text = soup.get_text()
|
|
@@ -237,7 +309,10 @@ async def upload_url(url: str = Form(...)):
|
|
| 237 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
| 238 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
| 239 |
|
| 240 |
-
#
|
|
|
|
|
|
|
|
|
|
| 241 |
if len(text) > 25000:
|
| 242 |
text = text[:25000]
|
| 243 |
|
|
|
|
| 6 |
from PyPDF2 import PdfReader
|
| 7 |
import requests
|
| 8 |
from fastapi import Form
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
import re
|
| 11 |
+
from urllib.parse import urlparse
|
| 12 |
+
import html
|
| 13 |
+
import validators
|
| 14 |
|
| 15 |
router = APIRouter(tags=["files"])
|
| 16 |
|
|
|
|
| 24 |
# Minimum length for any file (in characters)
|
| 25 |
MIN_FILE_LENGTH = 500
|
| 26 |
|
| 27 |
+
# Security limits configuration
|
| 28 |
+
MAX_CONTENT_SIZE = 5 * 1024 * 1024 # 5 MB max pour le contenu téléchargé
|
| 29 |
+
REQUEST_TIMEOUT = 10 # Timeout pour les requêtes HTTP
|
| 30 |
+
# Allowed-domain list (empty = all domains allowed; populate this in production)
|
| 31 |
+
ALLOWED_DOMAINS: List[str] = []
|
| 32 |
+
# File extensions to block in URLs
|
| 33 |
+
BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
|
| 34 |
+
|
| 35 |
def validate_pdf(file_path: str) -> bool:
|
| 36 |
"""Validate if file is a valid PDF."""
|
| 37 |
try:
|
|
|
|
| 230 |
Dictionary with status and session_id
|
| 231 |
"""
|
| 232 |
try:
|
| 233 |
+
# Validate that the URL is well-formed
|
| 234 |
+
if not validators.url(url):
|
| 235 |
+
raise HTTPException(status_code=400, detail="Invalid URL format")
|
| 236 |
+
|
| 237 |
+
# Check whether the URL has a blocked file extension
|
| 238 |
+
parsed_url = urlparse(url)
|
| 239 |
+
path = parsed_url.path.lower()
|
| 240 |
+
if any(path.endswith(ext) for ext in BLOCKED_EXTENSIONS):
|
| 241 |
+
raise HTTPException(status_code=400, detail="This file type is not allowed")
|
| 242 |
+
|
| 243 |
+
# Check whether the domain is allowed (only when the list is non-empty)
|
| 244 |
+
domain = parsed_url.netloc
|
| 245 |
+
if ALLOWED_DOMAINS and domain not in ALLOWED_DOMAINS:
|
| 246 |
+
raise HTTPException(status_code=403, detail="This domain is not in the allowed list")
|
| 247 |
+
|
| 248 |
+
# Retrieve the content from the URL with proper headers to mimic a browser
|
| 249 |
+
headers = {
|
| 250 |
+
'User-Agent': 'Mozilla/5.0 (compatible; YourBenchBot/1.0; +https://yourbench.example.com)',
|
| 251 |
+
'Accept': 'text/html,application/xhtml+xml',
|
| 252 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 253 |
+
}
|
| 254 |
|
| 255 |
+
response = requests.get(
|
| 256 |
+
url,
|
| 257 |
+
timeout=REQUEST_TIMEOUT,
|
| 258 |
+
headers=headers,
|
| 259 |
+
stream=True # Pour vérifier la taille avant de télécharger tout le contenu
|
| 260 |
+
)
|
| 261 |
+
response.raise_for_status()
|
| 262 |
|
| 263 |
+
# Check the Content-Type header
|
| 264 |
+
content_type = response.headers.get('Content-Type', '')
|
| 265 |
+
if not content_type.startswith(('text/html', 'text/plain', 'application/xhtml+xml')):
|
| 266 |
+
raise HTTPException(
|
| 267 |
+
status_code=400,
|
| 268 |
+
detail=f"Unsupported content type: {content_type}. Only HTML and text formats are supported."
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
# Check the declared content size
|
| 272 |
+
content_length = int(response.headers.get('Content-Length', 0))
|
| 273 |
+
if content_length > MAX_CONTENT_SIZE:
|
| 274 |
+
raise HTTPException(
|
| 275 |
+
status_code=400,
|
| 276 |
+
detail=f"Content too large ({content_length} bytes). Maximum size: {MAX_CONTENT_SIZE} bytes."
|
| 277 |
+
)
|
| 278 |
+
|
| 279 |
+
# Read the content with a size limit
|
| 280 |
+
content = ""
|
| 281 |
+
bytes_read = 0
|
| 282 |
+
for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
|
| 283 |
+
bytes_read += len(chunk.encode('utf-8') if isinstance(chunk, str) else chunk)
|
| 284 |
+
if bytes_read > MAX_CONTENT_SIZE:
|
| 285 |
+
raise HTTPException(
|
| 286 |
+
status_code=400,
|
| 287 |
+
detail=f"Content exceeded maximum allowed size of {MAX_CONTENT_SIZE} bytes"
|
| 288 |
+
)
|
| 289 |
+
content += chunk if isinstance(chunk, str) else chunk.decode('utf-8', errors='replace')
|
| 290 |
+
|
| 291 |
+
# Extract text from HTML with BeautifulSoup using the lxml parser for better security
|
| 292 |
+
soup = BeautifulSoup(content, 'html.parser')
|
| 293 |
+
|
| 294 |
+
# Remove potentially dangerous elements
|
| 295 |
+
for element in soup(['script', 'style', 'iframe', 'object', 'embed', 'noscript']):
|
| 296 |
+
element.extract()
|
| 297 |
+
|
| 298 |
+
# Remove on* attributes (event handlers) from all tags
|
| 299 |
+
for tag in soup.find_all(True):
|
| 300 |
+
for attr in list(tag.attrs):
|
| 301 |
+
if attr.startswith('on'):
|
| 302 |
+
del tag[attr]
|
| 303 |
|
| 304 |
# Extract the text
|
| 305 |
text = soup.get_text()
|
|
|
|
| 309 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
| 310 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
| 311 |
|
| 312 |
+
# Sanitize the text to prevent any potential stored XSS
|
| 313 |
+
text = html.escape(text)
|
| 314 |
+
|
| 315 |
+
# Limit to 25000 characters if necessary
|
| 316 |
if len(text) > 25000:
|
| 317 |
text = text[:25000]
|
| 318 |
|