import logging
import os
from typing import Any, Dict

from fastapi import UploadFile, HTTPException

from services.regex_pii_remover import regex_pii_remover

# Module-level logger, named after this module so log records can be filtered
# per module by the application's logging configuration.
logger = logging.getLogger(__name__)

class FileService:
    """Service to handle file uploads and text extraction with PII removal.

    The class keeps no per-instance state: it groups the upload limits
    (class constants) with static validation / extraction helpers.
    """

    # Lower-case file extensions (including the leading dot) accepted for upload.
    ALLOWED_EXTENSIONS = {'.txt'}
    # Maximum accepted upload size in bytes (10 * 1024 * 1024).
    MAX_FILE_SIZE = 10 * 1024 * 1024
    # Minimum number of characters (after stripping surrounding whitespace)
    # the extracted text must contain to be accepted.
    MIN_TEXT_LENGTH = 10

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """
        Validate uploaded file

        Checks that a file with a filename was supplied and that the
        filename's extension (case-insensitive) is in ``ALLOWED_EXTENSIONS``.

        Args:
            file: Uploaded file object

        Raises:
            HTTPException: 400 if no file / no filename was provided or the
                extension is not allowed
        """
        # A part without a filename must be rejected as a client error here:
        # os.path.splitext(None) raises TypeError, which the caller's generic
        # handler would otherwise report as a 500.
        if not file or not file.filename:
            raise HTTPException(status_code=400, detail="No file provided")

        file_ext = os.path.splitext(file.filename)[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # Sorted so the message is deterministic (set order is not).
            allowed = ', '.join(sorted(FileService.ALLOWED_EXTENSIONS))
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {allowed} files are allowed",
            )

    @staticmethod
    def _decode_content(content: bytes) -> str:
        """Decode raw upload bytes as UTF-8, falling back to latin-1.

        latin-1 maps every possible byte to a code point, so the fallback
        cannot fail and this helper always returns a string.
        """
        try:
            return content.decode('utf-8')
        except UnicodeDecodeError:
            return content.decode('latin-1')

    @staticmethod
    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, Any]:
        """
        Extract text content from uploaded file and optionally remove PII using regex

        Args:
            file: Uploaded file object
            remove_pii: Whether to remove PII from extracted text (default: True)

        Returns:
            Dictionary with the keys:
                ``text``: extracted text (sanitized when ``remove_pii`` is set)
                ``filename``: name of the uploaded file
                ``file_size``: size of the raw upload in bytes
                ``text_length``: length of the returned ``text``
                ``pii_info``: dict with ``pii_removed``, ``pii_count`` and
                    ``pii_details``

        Raises:
            HTTPException: 400 for an invalid, oversized, empty or too-short
                file; 500 for any unexpected processing error
        """
        try:
            FileService.validate_file(file)

            # Read at most one byte past the limit: that is enough to detect
            # an oversized upload without pulling an arbitrarily large file
            # into memory first.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB",
                )

            text = FileService._decode_content(content)

            stripped_text = text.strip()
            if not stripped_text:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text",
                )

            if len(stripped_text) < FileService.MIN_TEXT_LENGTH:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes",
                )

            logger.info(
                "Successfully extracted %d characters from %s",
                len(text),
                file.filename,
            )

            # Defaults reported when PII removal is skipped.
            pii_info = {
                "pii_removed": False,
                "pii_count": 0,
                "pii_details": [],
            }

            if remove_pii:
                logger.info("Removing PII from extracted text using regex patterns...")
                pii_result = regex_pii_remover.sanitize_provider_notes(text)

                # From here on only the sanitized text is used and returned.
                text = pii_result["sanitized_notes"]
                pii_info = {
                    "pii_removed": pii_result["was_pii_found"],
                    "pii_count": pii_result["pii_removed_count"],
                    "pii_details": pii_result["pii_details"],
                }

                if pii_result["was_pii_found"]:
                    logger.info(
                        "Removed %s PII entities using regex",
                        pii_result["pii_removed_count"],
                    )
                else:
                    logger.info("No PII detected in text")

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                "text_length": len(text),
                "pii_info": pii_info,
            }

        except HTTPException:
            # Deliberate client-facing errors raised above pass through as-is.
            raise
        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception("Error extracting text from file: %s", e)
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {str(e)}",
            ) from e

# Shared module-level instance. FileService defines only class constants and
# static methods, so this object carries no state of its own.
file_service = FileService()