Spaces:
Runtime error
Runtime error
| import io | |
| import json | |
| import os | |
| import logging | |
| from fastapi import APIRouter, UploadFile, File, Form, HTTPException | |
| from fastapi.responses import StreamingResponse | |
| from pydantic import ValidationError | |
| from .models import Entity, AnalysisResponse, RedactRequest | |
| from .document_utils import process_pdf, process_docx, map_ner_results | |
| from transformers import pipeline | |
| from docx import Document | |
| import fitz | |
# Root logger emits INFO and above so the request/redaction messages below are visible.
logging.basicConfig(level=logging.INFO)

# Router object for this module's endpoints.
router = APIRouter()

# Module-level cache; load_model() stores the NER pipeline under "ner_pipeline".
model_cache: dict = {}
def load_model():
    """Create the NER pipeline and store it in ``model_cache``.

    The pipeline is built for the "abishekcodes/pii_model" checkpoint with
    ``aggregation_strategy="max"`` and saved under the "ner_pipeline" key,
    replacing any pipeline cached there previously.
    """
    ner = pipeline(
        "ner",
        model="abishekcodes/pii_model",
        aggregation_strategy="max",
    )
    model_cache["ner_pipeline"] = ner
async def analyze_document(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF or DOCX and return the entities found in it.

    Args:
        file: The uploaded document. Its type is chosen from the filename
            extension (".pdf" or ".docx", compared case-insensitively).

    Returns:
        An ``AnalysisResponse`` with the uploaded filename and the entities
        produced by ``map_ner_results``. The entity list is empty when the
        extracted text contains nothing but whitespace.

    Raises:
        HTTPException: 415 when the filename is missing or its extension is
            not supported; 503 when no NER pipeline has been stored in
            ``model_cache`` yet.
    """
    # The filename can be absent on an upload; normalise it so the extension
    # checks cannot raise and ".PDF" / ".Docx" are accepted as well.
    lowered_name = (file.filename or "").lower()
    if lowered_name.endswith(".pdf"):
        extract_text = process_pdf
    elif lowered_name.endswith(".docx"):
        extract_text = process_docx
    else:
        # Reject before reading the body: nothing below can handle this file.
        raise HTTPException(status_code=415, detail="Unsupported file type.")

    file_content = await file.read()
    full_text, char_map = extract_text(io.BytesIO(file_content))

    if not full_text.strip():
        # Nothing to run the model on.
        return AnalysisResponse(filename=file.filename, entities=[])

    ner_pipeline = model_cache.get("ner_pipeline")
    if ner_pipeline is None:
        # Report an explicit "not ready" instead of an opaque KeyError / 500.
        raise HTTPException(status_code=503, detail="NER model is not loaded.")

    ner_results = ner_pipeline(full_text)
    entities = map_ner_results(ner_results, char_map, Entity)
    return AnalysisResponse(filename=file.filename, entities=entities)
def _attachment_disposition(download_name: str) -> str:
    """Return a Content-Disposition value that is valid for any filename."""
    from urllib.parse import quote

    encoded = quote(download_name, safe="")
    if encoded == download_name:
        # Only unreserved ASCII characters: the plain form is already valid.
        return f"attachment; filename={download_name}"
    # Spaces, quotes, control or non-Latin-1 characters would corrupt the
    # header or fail to encode; send them percent-encoded (RFC 5987 form).
    return f"attachment; filename*=UTF-8''{encoded}"


async def redact_document(file: UploadFile = File(...), entities_to_redact_json: str = Form(...)):
    """Black out the requested entity locations in an uploaded PDF or DOCX.

    Args:
        file: The uploaded document. Its type is chosen from the filename
            extension (".pdf" or ".docx", compared case-insensitively).
        entities_to_redact_json: JSON text that must decode to a list; the
            list is validated by ``RedactRequest(entities_to_redact=...)``.
            Each entity's ``location`` entries are read as mappings with
            "page"/"bbox" keys (PDF) or "p_idx"/"r_idx" keys (DOCX).

    Returns:
        A ``StreamingResponse`` carrying the redacted document as an
        attachment named ``redacted_<original filename>``.

    Raises:
        HTTPException: 400 when the JSON cannot be decoded or is not a list;
            422 when the entities fail validation or a PDF location names a
            page the document does not have; 415 when the filename is missing
            or its extension is not supported.
    """
    logging.info("Received /redact request for file: %s", file.filename)
    # The payload may contain the sensitive text being redacted, so only its
    # size is logged, never its content.
    logging.info("Received entities JSON string of %d characters", len(entities_to_redact_json))

    try:
        entities_data = json.loads(entities_to_redact_json)
        if not isinstance(entities_data, list):
            raise TypeError("Entities data is not a list.")
        request_data = RedactRequest(entities_to_redact=entities_data)
    except (json.JSONDecodeError, TypeError) as e:
        logging.error("JSON parsing or Type error: %s", e)
        raise HTTPException(status_code=400, detail=f"Invalid JSON format: {e}") from e
    except ValidationError as e:
        validation_errors = e.errors()
        # Log where validation failed, not the submitted values themselves.
        logging.error(
            "Pydantic Validation Error: %d error(s) at %s",
            len(validation_errors),
            [(err.get("loc"), err.get("type")) for err in validation_errors],
        )
        # The full detail goes back to the client, who supplied the data.
        raise HTTPException(
            status_code=422,
            detail={"message": "Invalid entity structure provided.", "errors": validation_errors},
        ) from e

    # The filename can be absent on an upload; normalise it so the extension
    # checks cannot raise and upper-case extensions are accepted too.
    lowered_name = (file.filename or "").lower()

    file_content = await file.read()
    file_stream = io.BytesIO(file_content)
    output_buffer = io.BytesIO()

    if lowered_name.endswith(".pdf"):
        doc = fitz.open(stream=file_stream, filetype="pdf")
        try:
            page_total = len(doc)
            for entity in request_data.entities_to_redact:
                for loc in entity.location:
                    # Locations without both keys are not PDF locations; skip them.
                    if 'page' in loc and 'bbox' in loc:
                        page_index = loc['page']
                        if not isinstance(page_index, int) or not 0 <= page_index < page_total:
                            # Fail loudly rather than return a document that
                            # silently misses a requested redaction.
                            raise HTTPException(
                                status_code=422,
                                detail=(
                                    f"Entity location references page {page_index!r}, "
                                    f"but the document has {page_total} page(s)."
                                ),
                            )
                        doc[page_index].add_redact_annot(loc['bbox'], fill=(0, 0, 0))
            # Annotations only mark regions; applying them removes the content.
            for page in doc:
                page.apply_redactions()
            doc.save(output_buffer)
        finally:
            # Release the document even when redaction or saving fails.
            doc.close()
        media_type = "application/pdf"
    elif lowered_name.endswith(".docx"):
        doc = Document(file_stream)
        # Several entities can share a run; mask each run only once.
        redacted_runs = set()
        for entity in request_data.entities_to_redact:
            for loc in entity.location:
                if 'p_idx' in loc and 'r_idx' in loc:
                    p_idx, r_idx = loc['p_idx'], loc['r_idx']
                    run_id = (p_idx, r_idx)
                    if run_id not in redacted_runs and p_idx < len(doc.paragraphs) and r_idx < len(doc.paragraphs[p_idx].runs):
                        run_to_redact = doc.paragraphs[p_idx].runs[r_idx]
                        # Same-length mask keeps the document layout stable.
                        run_to_redact.text = '█' * len(run_to_redact.text)
                        redacted_runs.add(run_id)
        doc.save(output_buffer)
        media_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    else:
        raise HTTPException(status_code=415, detail="Unsupported file type.")

    # Rewind so the response streams from the first byte.
    output_buffer.seek(0)
    logging.info("Redaction successful, sending file back to client.")
    return StreamingResponse(
        output_buffer,
        media_type=media_type,
        headers={"Content-Disposition": _attachment_disposition(f"redacted_{file.filename}")},
    )