Align 500MB upload limits and stream large files
Browse files
- Dockerfile +1 -0
- backend/api/routes/datasets.py +23 -6
- backend/api/routes/predict.py +11 -2
- start.sh +2 -0
Dockerfile
CHANGED
|
@@ -33,6 +33,7 @@ RUN mkdir -p backend/runs backend/tmp
|
|
| 33 |
RUN chmod +x start.sh
|
| 34 |
|
| 35 |
ENV PYTHONPATH=$HOME/app/backend:$PYTHONPATH
|
|
|
|
| 36 |
|
| 37 |
# Tell your start.sh script to boot Streamlit on 7860 (the only port HF exposes)
|
| 38 |
ENV PORT=7860
|
|
|
|
| 33 |
RUN chmod +x start.sh
|
| 34 |
|
| 35 |
ENV PYTHONPATH=$HOME/app/backend:$PYTHONPATH
|
| 36 |
+
ENV MAX_UPLOAD_MB=500
|
| 37 |
|
| 38 |
# Tell your start.sh script to boot Streamlit on 7860 (the only port HF exposes)
|
| 39 |
ENV PORT=7860
|
backend/api/routes/datasets.py
CHANGED
|
@@ -36,6 +36,28 @@ def _save_uploaded_file(dataset_id: str, filename: str, payload: bytes) -> str:
|
|
| 36 |
return file_path
|
| 37 |
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def _persist_dataframe_as_csv(dataset_id: str, df):
|
| 40 |
csv_path = os.path.join("tmp", f"{dataset_id}.csv")
|
| 41 |
df.to_csv(csv_path, index=False)
|
|
@@ -236,12 +258,7 @@ async def upload_dataset(
|
|
| 236 |
filename = file.filename or "upload.csv"
|
| 237 |
|
| 238 |
try:
|
| 239 |
-
|
| 240 |
-
except Exception as e:
|
| 241 |
-
return {"error": f"Failed to save upload: {e}"}
|
| 242 |
-
|
| 243 |
-
try:
|
| 244 |
-
uploaded_path = _save_uploaded_file(dataset_id, filename, raw_bytes)
|
| 245 |
except Exception as e:
|
| 246 |
return {"error": f"Failed to save upload: {e}"}
|
| 247 |
|
|
|
|
| 36 |
return file_path
|
| 37 |
|
| 38 |
|
| 39 |
+
async def _stream_upload_to_file(dataset_id: str, filename: str, upload: UploadFile) -> str:
    """Stream an upload to ``tmp/<dataset_id><ext>`` without buffering it all in memory.

    The payload is copied in 1 MiB chunks, so peak memory stays bounded
    regardless of the upload size.

    Args:
        dataset_id: Stem of the destination file name inside ``tmp``.
        filename: Client-supplied name; only its extension is used
            (lower-cased, falling back to ``.csv`` when missing).
        upload: Object exposing awaitable ``read(size)`` and ``close()``.

    Returns:
        The relative path of the written file.

    Raises:
        Whatever ``open``/``read``/``write`` raise. The partially written
        file is removed before the error propagates.
    """
    ext = os.path.splitext(filename or "")[1].lower() or ".csv"
    # NOTE(review): dataset_id is interpolated into the path as-is; callers are
    # presumably passing a trusted identifier -- confirm at the call sites.
    file_path = os.path.join("tmp", f"{dataset_id}{ext}")
    os.makedirs("tmp", exist_ok=True)

    try:
        with open(file_path, "wb") as buffer:
            while True:
                chunk = await upload.read(1024 * 1024)
                if not chunk:
                    break
                buffer.write(chunk)
    except BaseException:
        # BaseException (not Exception) so that request cancellation
        # (asyncio.CancelledError) also discards the truncated file.
        try:
            os.remove(file_path)
        except OSError:
            # Best-effort cleanup: the file may never have been created, and a
            # cleanup failure must not replace the original error.
            pass
        raise
    finally:
        await upload.close()

    return file_path
|
| 59 |
+
|
| 60 |
+
|
| 61 |
def _persist_dataframe_as_csv(dataset_id: str, df):
|
| 62 |
csv_path = os.path.join("tmp", f"{dataset_id}.csv")
|
| 63 |
df.to_csv(csv_path, index=False)
|
|
|
|
| 258 |
filename = file.filename or "upload.csv"
|
| 259 |
|
| 260 |
try:
|
| 261 |
+
uploaded_path = await _stream_upload_to_file(dataset_id, filename, file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
except Exception as e:
|
| 263 |
return {"error": f"Failed to save upload: {e}"}
|
| 264 |
|
backend/api/routes/predict.py
CHANGED
|
@@ -10,6 +10,7 @@ import pandas as pd
|
|
| 10 |
from infra.database import get_db, JobModel
|
| 11 |
from infra.result_contract import normalize_results
|
| 12 |
from infra.storage import get_schema_path
|
|
|
|
| 13 |
from core.file_loader import load_dataframe
|
| 14 |
|
| 15 |
router = APIRouter(prefix="/api", tags=["predict"])
|
|
@@ -184,11 +185,19 @@ async def contract_check(job_id: str, file: UploadFile = File(...)):
|
|
| 184 |
except Exception:
|
| 185 |
contract_schema = {}
|
| 186 |
|
|
|
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
except Exception as e:
|
| 191 |
raise HTTPException(status_code=422, detail=f"Could not read inference file: {e}")
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
incoming_columns = list(df.columns)
|
| 194 |
missing = sorted(set(expected_features) - set(incoming_columns))
|
|
|
|
| 10 |
from infra.database import get_db, JobModel
|
| 11 |
from infra.result_contract import normalize_results
|
| 12 |
from infra.storage import get_schema_path
|
| 13 |
+
from api.routes.datasets import _stream_upload_to_file
|
| 14 |
from core.file_loader import load_dataframe
|
| 15 |
|
| 16 |
router = APIRouter(prefix="/api", tags=["predict"])
|
|
|
|
| 185 |
except Exception:
|
| 186 |
contract_schema = {}
|
| 187 |
|
| 188 |
+
temp_path = None
|
| 189 |
try:
|
| 190 |
+
temp_path = await _stream_upload_to_file(
|
| 191 |
+
f"contract_{job_id}_{os.urandom(4).hex()}",
|
| 192 |
+
file.filename or "inference.csv",
|
| 193 |
+
file,
|
| 194 |
+
)
|
| 195 |
+
df = load_dataframe(filepath=temp_path)
|
| 196 |
except Exception as e:
|
| 197 |
raise HTTPException(status_code=422, detail=f"Could not read inference file: {e}")
|
| 198 |
+
finally:
|
| 199 |
+
if temp_path and os.path.exists(temp_path):
|
| 200 |
+
os.remove(temp_path)
|
| 201 |
|
| 202 |
incoming_columns = list(df.columns)
|
| 203 |
missing = sorted(set(expected_features) - set(incoming_columns))
|
start.sh
CHANGED
|
@@ -12,6 +12,7 @@ export REDIS_URL="${REDIS_URL:-redis://127.0.0.1:6379/0}"
|
|
| 12 |
export CELERY_BROKER_URL="${CELERY_BROKER_URL:-$REDIS_URL}"
|
| 13 |
export CELERY_RESULT_BACKEND="${CELERY_RESULT_BACKEND:-$REDIS_URL}"
|
| 14 |
export AUTOML_API_URL="${AUTOML_API_URL:-http://127.0.0.1:8000/api}"
|
|
|
|
| 15 |
|
| 16 |
mkdir -p /tmp/nginx_client_body /tmp/nginx_proxy /tmp/nginx_fastcgi /tmp/nginx_uwsgi /tmp/nginx_scgi
|
| 17 |
|
|
@@ -46,6 +47,7 @@ python -m streamlit run app.py \
|
|
| 46 |
--server.headless true \
|
| 47 |
--server.enableCORS false \
|
| 48 |
--server.enableXsrfProtection false \
|
|
|
|
| 49 |
--browser.gatherUsageStats false &
|
| 50 |
FRONTEND_PID=$!
|
| 51 |
|
|
|
|
| 12 |
export CELERY_BROKER_URL="${CELERY_BROKER_URL:-$REDIS_URL}"
|
| 13 |
export CELERY_RESULT_BACKEND="${CELERY_RESULT_BACKEND:-$REDIS_URL}"
|
| 14 |
export AUTOML_API_URL="${AUTOML_API_URL:-http://127.0.0.1:8000/api}"
|
| 15 |
+
export MAX_UPLOAD_MB="${MAX_UPLOAD_MB:-500}"
|
| 16 |
|
| 17 |
mkdir -p /tmp/nginx_client_body /tmp/nginx_proxy /tmp/nginx_fastcgi /tmp/nginx_uwsgi /tmp/nginx_scgi
|
| 18 |
|
|
|
|
| 47 |
--server.headless true \
|
| 48 |
--server.enableCORS false \
|
| 49 |
--server.enableXsrfProtection false \
|
| 50 |
+
--server.maxUploadSize "${MAX_UPLOAD_MB}" \
|
| 51 |
--browser.gatherUsageStats false &
|
| 52 |
FRONTEND_PID=$!
|
| 53 |
|