Update main.py
Browse files
main.py
CHANGED
|
@@ -45,10 +45,6 @@ OUTPUT_DIR.mkdir(exist_ok=True)
|
|
| 45 |
# Initialize redactor
|
| 46 |
redactor = PDFRedactor()
|
| 47 |
|
| 48 |
-
# ---------------- In-Memory Redaction Status Tracker ----------------
|
| 49 |
-
# request_id -> status (pending | processing | completed | failed)
|
| 50 |
-
redaction_status: Dict[str, str] = {}
|
| 51 |
-
|
| 52 |
# ---------------- Response Models ----------------
|
| 53 |
class RedactionEntity(BaseModel):
|
| 54 |
entity_type: str
|
|
@@ -69,15 +65,35 @@ class RedactionStatusResponse(BaseModel):
|
|
| 69 |
files: List[str]
|
| 70 |
message: str
|
| 71 |
|
| 72 |
-
|
| 73 |
class HealthResponse(BaseModel):
|
| 74 |
status: str
|
| 75 |
version: str
|
| 76 |
model_loaded: bool
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
# ---------------- Helper Functions ----------------
|
| 79 |
def get_public_url(bucket: str, storage_path: str) -> str:
|
| 80 |
return f"{os.getenv('SUPABASE_URL')}/storage/v1/object/public/{bucket}/{storage_path}"
|
|
|
|
| 81 |
def cleanup_files(job_id: str):
|
| 82 |
"""Clean up temporary files after a delay"""
|
| 83 |
try:
|
|
@@ -103,10 +119,8 @@ def download_file_from_supabase(bucket: str, storage_path: str, local_path: Path
|
|
| 103 |
|
| 104 |
def upload_file_to_supabase(bucket: str, storage_path: str, local_path: Path):
|
| 105 |
logger.info(f"Uploading {local_path} to {storage_path}")
|
| 106 |
-
|
| 107 |
with local_path.open("rb") as f:
|
| 108 |
content = f.read()
|
| 109 |
-
|
| 110 |
supabase.storage.from_(bucket).upload(
|
| 111 |
path=storage_path,
|
| 112 |
file=content,
|
|
@@ -117,11 +131,15 @@ def upload_file_to_supabase(bucket: str, storage_path: str, local_path: Path):
|
|
| 117 |
)
|
| 118 |
|
| 119 |
def redact_request(request_id: str, bucket: str = "doc_storage"):
|
| 120 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
try:
|
| 122 |
-
|
|
|
|
| 123 |
|
| 124 |
-
# Fetch all files for this request_id
|
| 125 |
response = (
|
| 126 |
supabase
|
| 127 |
.from_("request_files")
|
|
@@ -131,9 +149,6 @@ def redact_request(request_id: str, bucket: str = "doc_storage"):
|
|
| 131 |
)
|
| 132 |
|
| 133 |
files = response.data
|
| 134 |
-
|
| 135 |
-
if not files:
|
| 136 |
-
raise Exception(f"No files found for request {request_id}")
|
| 137 |
if not files:
|
| 138 |
raise Exception(f"No files found for request {request_id}")
|
| 139 |
|
|
@@ -142,23 +157,18 @@ def redact_request(request_id: str, bucket: str = "doc_storage"):
|
|
| 142 |
local_upload = UPLOAD_DIR / f"{uuid.uuid4()}.pdf"
|
| 143 |
local_output = OUTPUT_DIR / f"{uuid.uuid4()}_redacted.pdf"
|
| 144 |
|
| 145 |
-
# Download from Supabase
|
| 146 |
download_file_from_supabase(bucket, storage_path, local_upload)
|
| 147 |
-
|
| 148 |
-
# Redact
|
| 149 |
redactor.redact_document(pdf_path=str(local_upload), output_path=str(local_output))
|
| 150 |
-
|
| 151 |
-
# Upload redacted back to same path
|
| 152 |
upload_file_to_supabase(bucket, storage_path, local_output)
|
| 153 |
-
|
| 154 |
-
# Cleanup local files
|
| 155 |
cleanup_temp_files([local_upload, local_output])
|
| 156 |
|
| 157 |
-
|
|
|
|
| 158 |
|
| 159 |
except Exception as e:
|
| 160 |
logger.error(f"Redaction failed for {request_id}: {str(e)}")
|
| 161 |
-
|
|
|
|
| 162 |
|
| 163 |
# ----------------- Existing Endpoints -----------------
|
| 164 |
@app.get("/", response_model=HealthResponse)
|
|
@@ -266,15 +276,21 @@ async def get_stats():
|
|
| 266 |
# ----------------- NEW Endpoints -----------------
|
| 267 |
@app.post("/redact_by_request/{request_id}", response_model=RedactionStatusResponse)
|
| 268 |
async def redact_by_request(request_id: str, background_tasks: BackgroundTasks):
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
| 270 |
return RedactionStatusResponse(
|
| 271 |
request_id=request_id,
|
| 272 |
-
status="
|
| 273 |
files=[],
|
| 274 |
message="Redaction already in progress"
|
| 275 |
)
|
| 276 |
-
|
|
|
|
|
|
|
| 277 |
background_tasks.add_task(redact_request, request_id)
|
|
|
|
| 278 |
return RedactionStatusResponse(
|
| 279 |
request_id=request_id,
|
| 280 |
status="pending",
|
|
@@ -284,13 +300,11 @@ async def redact_by_request(request_id: str, background_tasks: BackgroundTasks):
|
|
| 284 |
|
| 285 |
@app.get("/redaction_status/{request_id}", response_model=RedactionStatusResponse)
|
| 286 |
async def get_redaction_status(request_id: str):
|
| 287 |
-
status =
|
| 288 |
|
| 289 |
-
# Default empty response
|
| 290 |
files: List[str] = []
|
| 291 |
|
| 292 |
-
if status == "
|
| 293 |
-
# Fetch file paths from DB
|
| 294 |
response = (
|
| 295 |
supabase
|
| 296 |
.from_("request_files")
|
|
@@ -298,35 +312,23 @@ async def get_redaction_status(request_id: str):
|
|
| 298 |
.eq("request_id", request_id)
|
| 299 |
.execute()
|
| 300 |
)
|
| 301 |
-
|
| 302 |
if response.data:
|
| 303 |
files = [
|
| 304 |
get_public_url("doc_storage", row["storage_path"])
|
| 305 |
for row in response.data
|
| 306 |
]
|
| 307 |
|
| 308 |
-
message =
|
| 309 |
-
"Redaction completed"
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
else "Request not found"
|
| 316 |
-
)
|
| 317 |
|
| 318 |
return RedactionStatusResponse(
|
| 319 |
request_id=request_id,
|
| 320 |
status=status,
|
| 321 |
files=files,
|
| 322 |
message=message
|
| 323 |
-
)
|
| 324 |
-
|
| 325 |
-
# ----------------- Run Server -----------------
|
| 326 |
-
# if __name__ == "__main__":
|
| 327 |
-
# uvicorn.run(
|
| 328 |
-
# "main:app",
|
| 329 |
-
# host="localhost",
|
| 330 |
-
# port=2700,
|
| 331 |
-
# reload=False
|
| 332 |
-
# )
|
|
|
|
| 45 |
# Initialize redactor
|
| 46 |
redactor = PDFRedactor()
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
# ---------------- Response Models ----------------
|
| 49 |
class RedactionEntity(BaseModel):
|
| 50 |
entity_type: str
|
|
|
|
| 65 |
files: List[str]
|
| 66 |
message: str
|
| 67 |
|
|
|
|
| 68 |
class HealthResponse(BaseModel):
|
| 69 |
status: str
|
| 70 |
version: str
|
| 71 |
model_loaded: bool
|
| 72 |
|
| 73 |
+
# ---------------- DB Status Helpers ----------------
|
| 74 |
+
def set_request_status(request_id: str, status: str):
|
| 75 |
+
"""Update the status column in document_requests for the given request_id."""
|
| 76 |
+
supabase.from_("document_requests").update({"status": status}).eq("id", request_id).execute()
|
| 77 |
+
logger.info(f"Request {request_id} status -> {status}")
|
| 78 |
+
|
| 79 |
+
def get_request_status(request_id: str) -> str:
|
| 80 |
+
"""Fetch current status from document_requests."""
|
| 81 |
+
response = (
|
| 82 |
+
supabase
|
| 83 |
+
.from_("document_requests")
|
| 84 |
+
.select("status")
|
| 85 |
+
.eq("id", request_id)
|
| 86 |
+
.maybe_single()
|
| 87 |
+
.execute()
|
| 88 |
+
)
|
| 89 |
+
if response.data:
|
| 90 |
+
return response.data["status"]
|
| 91 |
+
return "not_found"
|
| 92 |
+
|
| 93 |
# ---------------- Helper Functions ----------------
|
| 94 |
def get_public_url(bucket: str, storage_path: str) -> str:
|
| 95 |
return f"{os.getenv('SUPABASE_URL')}/storage/v1/object/public/{bucket}/{storage_path}"
|
| 96 |
+
|
| 97 |
def cleanup_files(job_id: str):
|
| 98 |
"""Clean up temporary files after a delay"""
|
| 99 |
try:
|
|
|
|
| 119 |
|
| 120 |
def upload_file_to_supabase(bucket: str, storage_path: str, local_path: Path):
|
| 121 |
logger.info(f"Uploading {local_path} to {storage_path}")
|
|
|
|
| 122 |
with local_path.open("rb") as f:
|
| 123 |
content = f.read()
|
|
|
|
| 124 |
supabase.storage.from_(bucket).upload(
|
| 125 |
path=storage_path,
|
| 126 |
file=content,
|
|
|
|
| 131 |
)
|
| 132 |
|
| 133 |
def redact_request(request_id: str, bucket: str = "doc_storage"):
|
| 134 |
+
"""
|
| 135 |
+
Background task: redact all files for a given request_id.
|
| 136 |
+
DB writes: 2 total — one at start (redacting), one at end (redacted | failed).
|
| 137 |
+
The 'pending' write is done by the endpoint before this task is dispatched.
|
| 138 |
+
"""
|
| 139 |
try:
|
| 140 |
+
# Write 1: mark as redacting
|
| 141 |
+
set_request_status(request_id, "redacting")
|
| 142 |
|
|
|
|
| 143 |
response = (
|
| 144 |
supabase
|
| 145 |
.from_("request_files")
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
files = response.data
|
|
|
|
|
|
|
|
|
|
| 152 |
if not files:
|
| 153 |
raise Exception(f"No files found for request {request_id}")
|
| 154 |
|
|
|
|
| 157 |
local_upload = UPLOAD_DIR / f"{uuid.uuid4()}.pdf"
|
| 158 |
local_output = OUTPUT_DIR / f"{uuid.uuid4()}_redacted.pdf"
|
| 159 |
|
|
|
|
| 160 |
download_file_from_supabase(bucket, storage_path, local_upload)
|
|
|
|
|
|
|
| 161 |
redactor.redact_document(pdf_path=str(local_upload), output_path=str(local_output))
|
|
|
|
|
|
|
| 162 |
upload_file_to_supabase(bucket, storage_path, local_output)
|
|
|
|
|
|
|
| 163 |
cleanup_temp_files([local_upload, local_output])
|
| 164 |
|
| 165 |
+
# Write 2: mark as redacted
|
| 166 |
+
set_request_status(request_id, "redacted")
|
| 167 |
|
| 168 |
except Exception as e:
|
| 169 |
logger.error(f"Redaction failed for {request_id}: {str(e)}")
|
| 170 |
+
# Write 2 (error path): mark as failed
|
| 171 |
+
set_request_status(request_id, "failed")
|
| 172 |
|
| 173 |
# ----------------- Existing Endpoints -----------------
|
| 174 |
@app.get("/", response_model=HealthResponse)
|
|
|
|
| 276 |
# ----------------- NEW Endpoints -----------------
|
| 277 |
@app.post("/redact_by_request/{request_id}", response_model=RedactionStatusResponse)
|
| 278 |
async def redact_by_request(request_id: str, background_tasks: BackgroundTasks):
|
| 279 |
+
# Check current DB status to avoid re-triggering an in-progress job
|
| 280 |
+
current_status = get_request_status(request_id)
|
| 281 |
+
|
| 282 |
+
if current_status == "redacting":
|
| 283 |
return RedactionStatusResponse(
|
| 284 |
request_id=request_id,
|
| 285 |
+
status="redacting",
|
| 286 |
files=[],
|
| 287 |
message="Redaction already in progress"
|
| 288 |
)
|
| 289 |
+
|
| 290 |
+
# Write 1: set pending before dispatching background task
|
| 291 |
+
set_request_status(request_id, "pending")
|
| 292 |
background_tasks.add_task(redact_request, request_id)
|
| 293 |
+
|
| 294 |
return RedactionStatusResponse(
|
| 295 |
request_id=request_id,
|
| 296 |
status="pending",
|
|
|
|
| 300 |
|
| 301 |
@app.get("/redaction_status/{request_id}", response_model=RedactionStatusResponse)
|
| 302 |
async def get_redaction_status(request_id: str):
|
| 303 |
+
status = get_request_status(request_id)
|
| 304 |
|
|
|
|
| 305 |
files: List[str] = []
|
| 306 |
|
| 307 |
+
if status == "redacted":
|
|
|
|
| 308 |
response = (
|
| 309 |
supabase
|
| 310 |
.from_("request_files")
|
|
|
|
| 312 |
.eq("request_id", request_id)
|
| 313 |
.execute()
|
| 314 |
)
|
|
|
|
| 315 |
if response.data:
|
| 316 |
files = [
|
| 317 |
get_public_url("doc_storage", row["storage_path"])
|
| 318 |
for row in response.data
|
| 319 |
]
|
| 320 |
|
| 321 |
+
message = {
|
| 322 |
+
"redacted": "Redaction completed",
|
| 323 |
+
"pending": "Redaction pending",
|
| 324 |
+
"redacting": "Redaction in progress",
|
| 325 |
+
"failed": "Redaction failed",
|
| 326 |
+
"not_found": "Request not found",
|
| 327 |
+
}.get(status, status)
|
|
|
|
|
|
|
| 328 |
|
| 329 |
return RedactionStatusResponse(
|
| 330 |
request_id=request_id,
|
| 331 |
status=status,
|
| 332 |
files=files,
|
| 333 |
message=message
|
| 334 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|