File size: 4,662 Bytes
a8f12f6 d8473b6 a8f12f6 d8473b6 a8f12f6 fd20bd2 a8f12f6 d8473b6 a8f12f6 fd20bd2 a8f12f6 fd20bd2 a8f12f6 0b51e25 fd20bd2 d8473b6 fd20bd2 0b51e25 d8473b6 fd20bd2 d8473b6 fd20bd2 d8473b6 fd20bd2 d8473b6 0b51e25 fd20bd2 0b51e25 a8f12f6 fd20bd2 a8f12f6 0b51e25 a8f12f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import logging
import os
from typing import Any, Dict

from fastapi import UploadFile, HTTPException

from services.regex_pii_remover import regex_pii_remover
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class FileService:
    """Service to handle file uploads and text extraction with PII removal."""

    # Lower-cased extensions (including the leading dot) accepted by validate_file.
    ALLOWED_EXTENSIONS = {'.txt'}
    # Largest accepted upload, in bytes.
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """
        Validate an uploaded file by its presence and filename extension.

        Args:
            file: Uploaded file object

        Raises:
            HTTPException: 400 if no file was provided, or if the filename is
                missing or its extension is not in ALLOWED_EXTENSIONS
        """
        if not file:
            raise HTTPException(status_code=400, detail="No file provided")

        # UploadFile.filename is optional; treat a missing name as having no
        # extension so it is rejected with a 400 instead of raising TypeError
        # inside os.path.splitext.
        file_ext = os.path.splitext(file.filename or "")[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # Sort so the message is stable if more extensions are added
            # (set iteration order is arbitrary).
            allowed = ', '.join(sorted(FileService.ALLOWED_EXTENSIONS))
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {allowed} files are allowed"
            )

    @staticmethod
    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, Any]:
        """
        Extract text content from an uploaded file and optionally remove PII using regex.

        Args:
            file: Uploaded file object
            remove_pii: Whether to remove PII from extracted text (default: True)

        Returns:
            Dictionary with keys:
                "text": the decoded (and, if requested, sanitized) text
                "filename": the upload's filename
                "file_size": number of bytes read from the upload
                "text_length": length of the returned text in characters
                "pii_info": dict with "pii_removed", "pii_count" and "pii_details"

        Raises:
            HTTPException: 400 if the file fails validation, exceeds
                MAX_FILE_SIZE, is empty/whitespace-only, or contains fewer
                than 10 non-padding characters; 500 for any unexpected error
        """
        try:
            # Validate file
            FileService.validate_file(file)

            # Read at most one byte past the limit: enough to detect an
            # oversized upload without loading an arbitrarily large body
            # into memory first.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            # Check file size
            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
                )

            # Decode text
            try:
                text = content.decode('utf-8')
            except UnicodeDecodeError:
                # latin-1 maps every byte value to a code point, so this
                # fallback cannot raise a decode error.
                text = content.decode('latin-1')

            # Validate extracted text
            stripped_text = text.strip()
            if not stripped_text:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text"
                )
            if len(stripped_text) < 10:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes"
                )

            logger.info(f"Successfully extracted {len(text)} characters from {file.filename}")

            # Defaults reported when PII removal is skipped
            pii_info = {
                "pii_removed": False,
                "pii_count": 0,
                "pii_details": []
            }

            # Remove PII using regex if requested
            if remove_pii:
                logger.info("Removing PII from extracted text using regex patterns...")
                pii_result = regex_pii_remover.sanitize_provider_notes(text)
                text = pii_result["sanitized_notes"]
                pii_info = {
                    "pii_removed": pii_result["was_pii_found"],
                    "pii_count": pii_result["pii_removed_count"],
                    "pii_details": pii_result["pii_details"]
                }
                if pii_result["was_pii_found"]:
                    logger.info(f"Removed {pii_result['pii_removed_count']} PII entities using regex")
                else:
                    logger.info("No PII detected in text")

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                # Measured after sanitization, so it may differ from the
                # length logged above.
                "text_length": len(text),
                "pii_info": pii_info
            }
        except HTTPException:
            # Deliberate client-facing errors pass through unchanged.
            raise
        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception(f"Error extracting text from file: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {str(e)}"
            ) from e
# Singleton instance shared by importers of this module. FileService's methods
# are static, so calling them through this instance or the class is equivalent.
file_service = FileService()