"""API routes for detecting and redacting PII entities in PDF and DOCX uploads."""

import io
import json
import logging
import os
from urllib.parse import quote

import fitz
from docx import Document
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
from pydantic import ValidationError
from transformers import pipeline

from .document_utils import map_ner_results, process_docx, process_pdf
from .models import AnalysisResponse, Entity, RedactRequest

# Configure logging
logging.basicConfig(level=logging.INFO)

router = APIRouter()
model_cache = {}

_PDF_MEDIA_TYPE = "application/pdf"
_DOCX_MEDIA_TYPE = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)


@router.on_event("startup")
def load_model():
    """Build the NER pipeline once at startup and store it in ``model_cache``."""
    model_id = "abishekcodes/pii_model"
    model_cache["ner_pipeline"] = pipeline(
        "ner", model=model_id, aggregation_strategy="max"
    )


def _document_kind(filename) -> str:
    """Return ``"pdf"`` or ``"docx"`` for *filename*, or raise HTTP 415.

    The comparison is case-insensitive, and a missing filename (``None`` or
    empty) is treated as an unsupported type rather than crashing.
    """
    lowered = (filename or "").lower()
    if lowered.endswith(".pdf"):
        return "pdf"
    if lowered.endswith(".docx"):
        return "docx"
    raise HTTPException(status_code=415, detail="Unsupported file type.")


def _is_valid_index(value, length: int) -> bool:
    """Return True when *value* is a non-negative int smaller than *length*."""
    # bool is a subclass of int; reject it so True/False are not used as 1/0.
    return (
        isinstance(value, int)
        and not isinstance(value, bool)
        and 0 <= value < length
    )


def _content_disposition(filename: str) -> str:
    """Build an ``attachment`` Content-Disposition header value for *filename*.

    Names made only of unreserved URL characters are emitted unquoted (the
    same header the endpoint has always produced). Anything else gets a
    quoted ASCII fallback plus an RFC 5987 ``filename*`` parameter, so names
    with spaces, quotes or non-latin-1 characters no longer yield a malformed
    or unencodable header.
    """
    # Drop any client-supplied directory components.
    base_name = filename.replace("\\", "/").rsplit("/", 1)[-1]
    download_name = f"redacted_{base_name}"
    encoded = quote(download_name, safe="")
    if encoded == download_name:
        return f"attachment; filename={download_name}"
    fallback = "".join(
        ch if ch.isascii() and ch.isprintable() and ch not in '"\\' else "_"
        for ch in download_name
    )
    return f"attachment; filename=\"{fallback}\"; filename*=UTF-8''{encoded}"


def _redact_pdf(file_stream: io.BytesIO, entities, output_buffer: io.BytesIO) -> None:
    """Black out every ``page``/``bbox`` location of *entities* in a PDF.

    Locations that lack either key are ignored. Locations whose page index is
    not a valid page number are skipped with a warning instead of aborting
    the request. The redacted document is written to *output_buffer*.
    """
    doc = fitz.open(stream=file_stream, filetype="pdf")
    try:
        page_count = len(doc)
        for entity in entities:
            for loc in entity.location:
                if 'page' not in loc or 'bbox' not in loc:
                    continue
                page_idx = loc['page']
                if not _is_valid_index(page_idx, page_count):
                    logging.warning(
                        "Skipping PDF redaction with invalid page index: %r",
                        page_idx,
                    )
                    continue
                doc[page_idx].add_redact_annot(loc['bbox'], fill=(0, 0, 0))
        for page in doc:
            page.apply_redactions()
        doc.save(output_buffer)
    finally:
        # Release the document even when redaction or saving fails.
        doc.close()


def _redact_docx(file_stream: io.BytesIO, entities, output_buffer: io.BytesIO) -> None:
    """Mask every ``p_idx``/``r_idx`` run of *entities* in a DOCX file.

    Each targeted run has its text replaced by the same number of block
    characters; a run is masked at most once. Locations that lack either key
    are ignored, and locations with invalid indices are skipped with a
    warning. The redacted document is written to *output_buffer*.
    """
    doc = Document(file_stream)
    # Fetch the paragraph list once instead of on every location lookup.
    paragraphs = doc.paragraphs
    redacted_runs = set()
    for entity in entities:
        for loc in entity.location:
            if 'p_idx' not in loc or 'r_idx' not in loc:
                continue
            p_idx, r_idx = loc['p_idx'], loc['r_idx']
            if not _is_valid_index(p_idx, len(paragraphs)):
                logging.warning(
                    "Skipping DOCX redaction with invalid paragraph index: %r",
                    p_idx,
                )
                continue
            runs = paragraphs[p_idx].runs
            if not _is_valid_index(r_idx, len(runs)):
                logging.warning(
                    "Skipping DOCX redaction with invalid run index: %r", r_idx
                )
                continue
            run_id = (p_idx, r_idx)
            if run_id in redacted_runs:
                continue
            run_to_redact = runs[r_idx]
            run_to_redact.text = '█' * len(run_to_redact.text)
            redacted_runs.add(run_id)
    doc.save(output_buffer)


@router.post("/analyze", response_model=AnalysisResponse)
async def analyze_document(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF/DOCX and return the detected entities.

    Raises:
        HTTPException: 415 when the file is neither ``.pdf`` nor ``.docx``;
            503 when the NER pipeline has not been loaded.
    """
    # Reject unsupported uploads before reading their content into memory.
    kind = _document_kind(file.filename)
    file_stream = io.BytesIO(await file.read())

    if kind == "pdf":
        full_text, char_map = process_pdf(file_stream)
    else:
        full_text, char_map = process_docx(file_stream)

    # Nothing to analyze: skip the model entirely.
    if not full_text.strip():
        return AnalysisResponse(filename=file.filename, entities=[])

    ner_pipeline = model_cache.get("ner_pipeline")
    if ner_pipeline is None:
        raise HTTPException(status_code=503, detail="NER model is not loaded.")

    # NOTE(review): this call is synchronous inside an async handler.
    ner_results = ner_pipeline(full_text)
    entities = map_ner_results(ner_results, char_map, Entity)
    return AnalysisResponse(filename=file.filename, entities=entities)


@router.post("/redact")
async def redact_document(
    file: UploadFile = File(...),
    entities_to_redact_json: str = Form(...),
):
    """Redact the given entities from an uploaded PDF/DOCX and stream it back.

    ``entities_to_redact_json`` must be a JSON array that validates as
    ``RedactRequest.entities_to_redact``.

    Raises:
        HTTPException: 400 when the form field is not a JSON list; 422 when
            the entities fail model validation; 415 when the file is neither
            ``.pdf`` nor ``.docx``.
    """
    logging.info("Received /redact request for file: %r", file.filename)

    try:
        entities_data = json.loads(entities_to_redact_json)
        if not isinstance(entities_data, list):
            raise TypeError("Entities data is not a list.")
        request_data = RedactRequest(entities_to_redact=entities_data)
    except (json.JSONDecodeError, TypeError) as e:
        logging.error("JSON parsing or Type error: %s", e)
        raise HTTPException(
            status_code=400, detail=f"Invalid JSON format: {e}"
        ) from e
    except ValidationError as e:
        errors = e.errors()
        # Log only the error locations/messages: the raw input is the PII the
        # client wants removed and must not end up in the logs.
        logging.error(
            "Pydantic Validation Error: %s",
            [
                {"loc": err.get("loc"), "msg": err.get("msg"), "type": err.get("type")}
                for err in errors
            ],
        )
        # Provide a detailed error response to the client
        raise HTTPException(
            status_code=422,
            detail={
                "message": "Invalid entity structure provided.",
                "errors": errors,
            },
        ) from e

    entities = request_data.entities_to_redact
    logging.info("Received %d entities to redact.", len(entities))

    # Reject unsupported uploads before reading their content into memory.
    kind = _document_kind(file.filename)
    file_stream = io.BytesIO(await file.read())
    output_buffer = io.BytesIO()

    if kind == "pdf":
        _redact_pdf(file_stream, entities, output_buffer)
        media_type = _PDF_MEDIA_TYPE
    else:
        _redact_docx(file_stream, entities, output_buffer)
        media_type = _DOCX_MEDIA_TYPE

    output_buffer.seek(0)
    logging.info("Redaction successful, sending file back to client.")

    return StreamingResponse(
        output_buffer,
        media_type=media_type,
        headers={"Content-Disposition": _content_disposition(file.filename)},
    )