| from fastapi import UploadFile, HTTPException |
| import os |
| from typing import Dict |
| import logging |
| from services.regex_pii_remover import regex_pii_remover |
|
|
# Module-level logger named after this module; FileService uses it for progress and error messages.
| logger = logging.getLogger(__name__) |
|
|
|
|
class FileService:
    """Service to handle file uploads and text extraction with PII removal.

    All methods are static; the class only groups the upload limits and the
    validation / extraction helpers under one name.
    """

    # Accepted file extensions (lower-case, including the leading dot).
    ALLOWED_EXTENSIONS = {'.txt'}
    # Maximum accepted upload size in bytes (10 * 1024 * 1024).
    MAX_FILE_SIZE = 10 * 1024 * 1024
    # Minimum number of non-whitespace-trimmed characters required in the text.
    MIN_TEXT_LENGTH = 10

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """
        Validate uploaded file

        Checks that a file object was supplied and that its filename ends in
        one of ``ALLOWED_EXTENSIONS`` (case-insensitive).

        Args:
            file: Uploaded file object

        Raises:
            HTTPException: 400 if no file was provided or the extension is
                not allowed (a missing filename counts as a disallowed
                extension).
        """
        if not file:
            raise HTTPException(status_code=400, detail="No file provided")

        # The filename can be None when the client does not send one; treat
        # that as "no extension" so the caller gets a 400 instead of a
        # TypeError from os.path.splitext.
        filename = file.filename or ""
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # Sorted so the message is deterministic if more extensions are added.
            allowed = ', '.join(sorted(FileService.ALLOWED_EXTENSIONS))
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {allowed} files are allowed"
            )

    @staticmethod
    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, object]:
        """
        Extract text content from uploaded file and optionally remove PII using regex

        Args:
            file: Uploaded file object
            remove_pii: Whether to remove PII from extracted text (default: True)

        Returns:
            Dictionary with the keys ``text`` (the extracted, possibly
            sanitized text), ``filename``, ``file_size`` (bytes read),
            ``text_length`` (length of the returned text) and ``pii_info``
            (``pii_removed``, ``pii_count``, ``pii_details``).

        Raises:
            HTTPException: 400 if the file fails validation, exceeds
                ``MAX_FILE_SIZE``, is empty, or contains fewer than
                ``MIN_TEXT_LENGTH`` characters after trimming; 500 for any
                other unexpected error during processing.
        """
        try:
            FileService.validate_file(file)

            # Read at most one byte past the limit: enough to detect an
            # oversized upload without loading an arbitrarily large file
            # into memory.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
                )

            try:
                # 'utf-8-sig' decodes plain UTF-8 and additionally drops a
                # leading byte-order mark, so the BOM neither ends up in the
                # text nor counts toward the length check.
                text = content.decode('utf-8-sig')
            except UnicodeDecodeError:
                # latin-1 maps every byte value to a character, so this
                # fallback cannot fail for bytes input.
                text = content.decode('latin-1')

            stripped = text.strip()
            if not stripped:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text"
                )

            if len(stripped) < FileService.MIN_TEXT_LENGTH:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes"
                )

            logger.info("Successfully extracted %d characters from %s", len(text), file.filename)

            # Default report used when PII removal is skipped.
            pii_info = {
                "pii_removed": False,
                "pii_count": 0,
                "pii_details": []
            }

            if remove_pii:
                logger.info("Removing PII from extracted text using regex patterns...")
                pii_result = regex_pii_remover.sanitize_provider_notes(text)

                # From here on only the sanitized text is returned.
                text = pii_result["sanitized_notes"]
                pii_info = {
                    "pii_removed": pii_result["was_pii_found"],
                    "pii_count": pii_result["pii_removed_count"],
                    "pii_details": pii_result["pii_details"]
                }

                if pii_result["was_pii_found"]:
                    logger.info("Removed %s PII entities using regex", pii_result["pii_removed_count"])
                else:
                    logger.info("No PII detected in text")

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                "text_length": len(text),
                "pii_info": pii_info
            }

        except HTTPException:
            # Validation errors already carry the right status code.
            raise
        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception("Error extracting text from file: %s", e)
            # NOTE(review): the detail below exposes the internal exception
            # text to the client; consider a generic message if that is a concern.
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {str(e)}"
            ) from e
|
|
|
|
| |
# Module-level FileService instance created at import time; every FileService method is static, so it holds no state of its own.
| file_service = FileService() |