Improve security on the upload-url route
Browse files- backend/pyproject.toml +1 -0
- backend/routes/upload.py +85 -10
backend/pyproject.toml
CHANGED
|
@@ -27,6 +27,7 @@ dependencies = [
|
|
| 27 |
"beautifulsoup4>=4.12.0",
|
| 28 |
"evaluate>=0.4.0",
|
| 29 |
"requests>=2.31.0",
|
|
|
|
| 30 |
]
|
| 31 |
|
| 32 |
[build-system]
|
|
|
|
| 27 |
"beautifulsoup4>=4.12.0",
|
| 28 |
"evaluate>=0.4.0",
|
| 29 |
"requests>=2.31.0",
|
| 30 |
+
"validators>=0.34.0",
|
| 31 |
]
|
| 32 |
|
| 33 |
[build-system]
|
backend/routes/upload.py
CHANGED
|
@@ -6,7 +6,11 @@ from bs4 import BeautifulSoup
|
|
| 6 |
from PyPDF2 import PdfReader
|
| 7 |
import requests
|
| 8 |
from fastapi import Form
|
| 9 |
-
from typing import Optional
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
router = APIRouter(tags=["files"])
|
| 12 |
|
|
@@ -20,6 +24,14 @@ os.makedirs(UPLOAD_ROOT, exist_ok=True)
|
|
| 20 |
# Minimum length for any file (in characters)
|
| 21 |
MIN_FILE_LENGTH = 500
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def validate_pdf(file_path: str) -> bool:
|
| 24 |
"""Validate if file is a valid PDF."""
|
| 25 |
try:
|
|
@@ -218,16 +230,76 @@ async def upload_url(url: str = Form(...)):
|
|
| 218 |
Dictionary with status and session_id
|
| 219 |
"""
|
| 220 |
try:
|
| 221 |
-
#
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
#
|
| 229 |
-
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
# Extract the text
|
| 233 |
text = soup.get_text()
|
|
@@ -237,7 +309,10 @@ async def upload_url(url: str = Form(...)):
|
|
| 237 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
| 238 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
| 239 |
|
| 240 |
-
#
|
|
|
|
|
|
|
|
|
|
| 241 |
if len(text) > 25000:
|
| 242 |
text = text[:25000]
|
| 243 |
|
|
|
|
| 6 |
from PyPDF2 import PdfReader
|
| 7 |
import requests
|
| 8 |
from fastapi import Form
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
import re
|
| 11 |
+
from urllib.parse import urlparse
|
| 12 |
+
import html
|
| 13 |
+
import validators
|
| 14 |
|
| 15 |
router = APIRouter(tags=["files"])
|
| 16 |
|
|
|
|
| 24 |
# Minimum length for any file (in characters)
|
| 25 |
MIN_FILE_LENGTH = 500
|
| 26 |
|
| 27 |
+
# Security limits configuration
|
| 28 |
+
MAX_CONTENT_SIZE = 5 * 1024 * 1024 # 5 MB max pour le contenu téléchargé
|
| 29 |
+
REQUEST_TIMEOUT = 10 # Timeout pour les requêtes HTTP
|
| 30 |
+
# Allowed-domain list (empty = all domains allowed; populate this in production)
|
| 31 |
+
ALLOWED_DOMAINS: List[str] = []
|
| 32 |
+
# File extensions to block in URLs
|
| 33 |
+
BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
|
| 34 |
+
|
| 35 |
def validate_pdf(file_path: str) -> bool:
|
| 36 |
"""Validate if file is a valid PDF."""
|
| 37 |
try:
|
|
|
|
| 230 |
Dictionary with status and session_id
|
| 231 |
"""
|
| 232 |
try:
|
| 233 |
+
# Validate that the URL is well-formed
|
| 234 |
+
if not validators.url(url):
|
| 235 |
+
raise HTTPException(status_code=400, detail="Invalid URL format")
|
| 236 |
+
|
| 237 |
+
# Check whether the URL has a blocked file extension
|
| 238 |
+
parsed_url = urlparse(url)
|
| 239 |
+
path = parsed_url.path.lower()
|
| 240 |
+
if any(path.endswith(ext) for ext in BLOCKED_EXTENSIONS):
|
| 241 |
+
raise HTTPException(status_code=400, detail="This file type is not allowed")
|
| 242 |
+
|
| 243 |
+
# Check whether the domain is allowed (only when the list is non-empty)
|
| 244 |
+
domain = parsed_url.netloc
|
| 245 |
+
if ALLOWED_DOMAINS and domain not in ALLOWED_DOMAINS:
|
| 246 |
+
raise HTTPException(status_code=403, detail="This domain is not in the allowed list")
|
| 247 |
+
|
| 248 |
+
# Retrieve the content from the URL with proper headers to mimic a browser
|
| 249 |
+
headers = {
|
| 250 |
+
'User-Agent': 'Mozilla/5.0 (compatible; YourBenchBot/1.0; +https://yourbench.example.com)',
|
| 251 |
+
'Accept': 'text/html,application/xhtml+xml',
|
| 252 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 253 |
+
}
|
| 254 |
|
| 255 |
+
response = requests.get(
|
| 256 |
+
url,
|
| 257 |
+
timeout=REQUEST_TIMEOUT,
|
| 258 |
+
headers=headers,
|
| 259 |
+
stream=True # Pour vérifier la taille avant de télécharger tout le contenu
|
| 260 |
+
)
|
| 261 |
+
response.raise_for_status()
|
| 262 |
|
| 263 |
+
# Check the Content-Type header
|
| 264 |
+
content_type = response.headers.get('Content-Type', '')
|
| 265 |
+
if not content_type.startswith(('text/html', 'text/plain', 'application/xhtml+xml')):
|
| 266 |
+
raise HTTPException(
|
| 267 |
+
status_code=400,
|
| 268 |
+
detail=f"Unsupported content type: {content_type}. Only HTML and text formats are supported."
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
# Check the declared content size
|
| 272 |
+
content_length = int(response.headers.get('Content-Length', 0))
|
| 273 |
+
if content_length > MAX_CONTENT_SIZE:
|
| 274 |
+
raise HTTPException(
|
| 275 |
+
status_code=400,
|
| 276 |
+
detail=f"Content too large ({content_length} bytes). Maximum size: {MAX_CONTENT_SIZE} bytes."
|
| 277 |
+
)
|
| 278 |
+
|
| 279 |
+
# Read the content with a size limit
|
| 280 |
+
content = ""
|
| 281 |
+
bytes_read = 0
|
| 282 |
+
for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
|
| 283 |
+
bytes_read += len(chunk.encode('utf-8') if isinstance(chunk, str) else chunk)
|
| 284 |
+
if bytes_read > MAX_CONTENT_SIZE:
|
| 285 |
+
raise HTTPException(
|
| 286 |
+
status_code=400,
|
| 287 |
+
detail=f"Content exceeded maximum allowed size of {MAX_CONTENT_SIZE} bytes"
|
| 288 |
+
)
|
| 289 |
+
content += chunk if isinstance(chunk, str) else chunk.decode('utf-8', errors='replace')
|
| 290 |
+
|
| 291 |
+
# Extract text from HTML with BeautifulSoup using the lxml parser for better security
|
| 292 |
+
soup = BeautifulSoup(content, 'html.parser')
|
| 293 |
+
|
| 294 |
+
# Remove potentially dangerous elements
|
| 295 |
+
for element in soup(['script', 'style', 'iframe', 'object', 'embed', 'noscript']):
|
| 296 |
+
element.extract()
|
| 297 |
+
|
| 298 |
+
# Remove on* attributes (event handlers) from all tags
|
| 299 |
+
for tag in soup.find_all(True):
|
| 300 |
+
for attr in list(tag.attrs):
|
| 301 |
+
if attr.startswith('on'):
|
| 302 |
+
del tag[attr]
|
| 303 |
|
| 304 |
# Extract the text
|
| 305 |
text = soup.get_text()
|
|
|
|
| 309 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
| 310 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
| 311 |
|
| 312 |
+
# Sanitize the text to prevent any potential stored XSS
|
| 313 |
+
text = html.escape(text)
|
| 314 |
+
|
| 315 |
+
# Limit to 25000 characters if necessary
|
| 316 |
if len(text) > 25000:
|
| 317 |
text = text[:25000]
|
| 318 |
|