PII-Masker / app /api.py
abishekcodes's picture
Deployment Test
98abe61
import io
import json
import os
import logging
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import ValidationError
from .models import Entity, AnalysisResponse, RedactRequest
from .document_utils import process_pdf, process_docx, map_ner_results
from transformers import pipeline
from docx import Document
import fitz
# Module-level setup shared by the endpoints below: the router they are
# registered on, the dict the startup hook fills with the NER pipeline,
# and INFO-level logging for the process.
router = APIRouter()
model_cache: dict = {}
logging.basicConfig(level=logging.INFO)
@router.on_event("startup")
def load_model():
    """Build the NER pipeline once at startup and keep it in ``model_cache``.

    The pipeline is stored under the ``"ner_pipeline"`` key so request
    handlers can reuse it instead of constructing it per request.
    """
    ner_pipeline = pipeline(
        "ner",
        model="abishekcodes/pii_model",
        aggregation_strategy="max",
    )
    model_cache["ner_pipeline"] = ner_pipeline
@router.post("/analyze", response_model=AnalysisResponse)
async def analyze_document(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF or DOCX and run NER over it.

    Args:
        file: The uploaded document. Its type is decided from the filename
            extension, compared case-insensitively.

    Returns:
        An ``AnalysisResponse`` with the upload's filename and the entities
        produced by ``map_ner_results``; the entity list is empty when the
        document contains no non-whitespace text.

    Raises:
        HTTPException: 415 when the filename is missing or is neither
            ``.pdf`` nor ``.docx``; 503 when the NER pipeline has not been
            loaded into ``model_cache``.
    """
    # ``filename`` may be None for some multipart clients; treat that as an
    # unsupported type instead of crashing on ``None.endswith``.
    lowered_name = (file.filename or "").lower()
    if lowered_name.endswith(".pdf"):
        extract = process_pdf
    elif lowered_name.endswith(".docx"):
        extract = process_docx
    else:
        raise HTTPException(status_code=415, detail="Unsupported file type.")

    file_stream = io.BytesIO(await file.read())
    full_text, char_map = extract(file_stream)

    # Nothing to analyse: skip the model call entirely.
    if not full_text.strip():
        return AnalysisResponse(filename=file.filename, entities=[])

    ner_pipeline = model_cache.get("ner_pipeline")
    if ner_pipeline is None:
        # The startup hook has not populated the cache; report it as a
        # service problem rather than leaking a KeyError as a 500.
        raise HTTPException(status_code=503, detail="NER model is not loaded.")

    ner_results = ner_pipeline(full_text)
    entities = map_ner_results(ner_results, char_map, Entity)
    return AnalysisResponse(filename=file.filename, entities=entities)
def _attachment_header(download_name):
    """Return a Content-Disposition value that is safe for any file name."""
    from urllib.parse import quote

    encoded_name = quote(download_name)
    if encoded_name != download_name:
        # The name contains spaces, quotes, control or non-ASCII characters:
        # send it percent-encoded in the extended ``filename*`` form so the
        # header stays encodable and cannot be broken by the upload name.
        return f"attachment; filename*=UTF-8''{encoded_name}"
    return f'attachment; filename="{download_name}"'


def _redact_pdf(file_stream, entities, output_buffer):
    """Black out every listed bbox in the PDF and save it to output_buffer."""
    doc = fitz.open(stream=file_stream, filetype="pdf")
    try:
        for entity in entities:
            for loc in entity.location:
                # Locations without both keys are ignored.
                if 'page' not in loc or 'bbox' not in loc:
                    continue
                try:
                    page = doc[loc['page']]
                except (IndexError, ValueError) as e:
                    # A page that is not in the document is a client error,
                    # not a server failure.
                    raise HTTPException(
                        status_code=422,
                        detail=f"Entity location refers to a page that is not in the document: {loc['page']!r}",
                    ) from e
                page.add_redact_annot(loc['bbox'], fill=(0, 0, 0))
        for page in doc:
            page.apply_redactions()
        doc.save(output_buffer)
    finally:
        # Release the document even when redaction fails part-way.
        doc.close()


def _redact_docx(file_stream, entities, output_buffer):
    """Overwrite every listed run in the DOCX with block characters and save it."""
    doc = Document(file_stream)
    # Read the paragraph list once instead of once per location.
    paragraphs = doc.paragraphs
    redacted_runs = set()
    for entity in entities:
        for loc in entity.location:
            # Locations without both keys are ignored.
            if 'p_idx' not in loc or 'r_idx' not in loc:
                continue
            p_idx, r_idx = loc['p_idx'], loc['r_idx']
            run_id = (p_idx, r_idx)
            if run_id in redacted_runs:
                continue
            # Negative indices are rejected too: they would otherwise wrap
            # around and redact a run the client never pointed at.
            if not 0 <= p_idx < len(paragraphs):
                logging.warning(f"Skipping out-of-range paragraph index: {p_idx}")
                continue
            runs = paragraphs[p_idx].runs
            if not 0 <= r_idx < len(runs):
                logging.warning(f"Skipping out-of-range run index: {r_idx} in paragraph {p_idx}")
                continue
            run_to_redact = runs[r_idx]
            run_to_redact.text = '█' * len(run_to_redact.text)
            redacted_runs.add(run_id)
    doc.save(output_buffer)


@router.post("/redact")
async def redact_document(file: UploadFile = File(...), entities_to_redact_json: str = Form(...)):
    """Redact the given entities from an uploaded PDF or DOCX and return the file.

    Args:
        file: The uploaded document. Its type is decided from the filename
            extension, compared case-insensitively.
        entities_to_redact_json: JSON text that must decode to a list; it is
            validated by building ``RedactRequest(entities_to_redact=...)``.

    Returns:
        A ``StreamingResponse`` carrying the redacted document as an
        attachment named ``redacted_<original filename>``.

    Raises:
        HTTPException: 400 when the form field is not valid JSON or is not a
            list; 422 when the entities fail ``RedactRequest`` validation or
            a PDF location names a page that is not in the document; 415
            when the filename is missing or is neither ``.pdf`` nor ``.docx``.
    """
    logging.info(f"Received /redact request for file: {file.filename}")
    # Only the payload size is logged: the entity list may echo the very
    # text that is about to be redacted, which must not end up in logs.
    logging.info(f"Received entities JSON string of length {len(entities_to_redact_json)}")
    try:
        entities_data = json.loads(entities_to_redact_json)
        if not isinstance(entities_data, list):
            raise TypeError("Entities data is not a list.")
        request_data = RedactRequest(entities_to_redact=entities_data)
    except (json.JSONDecodeError, TypeError) as e:
        logging.error(f"JSON parsing or Type error: {e}")
        raise HTTPException(status_code=400, detail=f"Invalid JSON format: {e}") from e
    except ValidationError as e:
        # Log where validation failed, not the offending input values.
        failures = [(err.get("loc"), err.get("type")) for err in e.errors()]
        logging.error(f"Pydantic Validation Error: {failures}")
        # The client still receives the full validation details.
        raise HTTPException(
            status_code=422,
            detail={"message": "Invalid entity structure provided.", "errors": e.errors()},
        ) from e

    # ``filename`` may be None for some multipart clients.
    filename = file.filename or ""
    lowered_name = filename.lower()
    file_stream = io.BytesIO(await file.read())
    output_buffer = io.BytesIO()

    if lowered_name.endswith(".pdf"):
        _redact_pdf(file_stream, request_data.entities_to_redact, output_buffer)
        media_type = "application/pdf"
    elif lowered_name.endswith(".docx"):
        _redact_docx(file_stream, request_data.entities_to_redact, output_buffer)
        media_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    else:
        raise HTTPException(status_code=415, detail="Unsupported file type.")

    output_buffer.seek(0)
    logging.info("Redaction successful, sending file back to client.")
    return StreamingResponse(
        output_buffer,
        media_type=media_type,
        headers={"Content-Disposition": _attachment_header(f"redacted_{filename}")},
    )