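# NOTE: the following lines appear to be file-viewer residue captured along
# with the source (uploader, commit message, commit hash, view links, size).
# They are kept as comments so the module remains valid Python.
# Distopia22's picture
# updated route
# 0b51e25
# raw
# history blame
# 4.66 kB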
import logging
import os
from typing import Any, Dict

from fastapi import HTTPException, UploadFile

from services.regex_pii_remover import regex_pii_remover
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class FileService:
    """Service to handle file uploads and text extraction with PII removal.

    Every method is a ``@staticmethod``; the class holds configuration
    constants only and keeps no per-instance state.
    """

    # Accepted file extensions: lower-case, including the leading dot.
    ALLOWED_EXTENSIONS = {'.txt'}
    # Largest accepted upload, in bytes.
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
    # Minimum number of characters (after trimming surrounding whitespace)
    # the decoded text must contain to be accepted.
    MIN_TEXT_LENGTH = 10

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """
        Validate that a file was provided and that its extension is allowed.

        The check is based on the file name only; the content is not inspected.

        Args:
            file: Uploaded file object

        Raises:
            HTTPException: 400 if ``file`` is falsy, or if the extension of
                ``file.filename`` (compared case-insensitively) is not in
                ``ALLOWED_EXTENSIONS``.
        """
        if not file:
            raise HTTPException(status_code=400, detail="No file provided")

        # UploadFile.filename is optional and may be None. Fall back to an
        # empty name so the caller gets the 400 below instead of a TypeError
        # from os.path.splitext(None).
        file_ext = os.path.splitext(file.filename or "")[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # sorted() keeps the message deterministic if more extensions
            # are ever added to the set.
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {', '.join(sorted(FileService.ALLOWED_EXTENSIONS))} files are allowed"
            )

    @staticmethod
    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, Any]:
        """
        Extract text content from an uploaded file and optionally remove PII using regex.

        Steps: validate the file name, read the content (bounded by
        ``MAX_FILE_SIZE``), decode it to text, reject empty/too-short text,
        then (if requested) pass the text through
        ``regex_pii_remover.sanitize_provider_notes``.

        Args:
            file: Uploaded file object
            remove_pii: Whether to remove PII from extracted text (default: True)

        Returns:
            Dictionary with the keys:
                ``text``: the decoded (and, if requested, sanitized) text
                ``filename``: ``file.filename``
                ``file_size``: number of bytes read from the upload
                ``text_length``: ``len(text)`` of the returned text
                ``pii_info``: dict with ``pii_removed``, ``pii_count`` and
                    ``pii_details`` (defaults ``False`` / ``0`` / ``[]`` when
                    ``remove_pii`` is False)

        Raises:
            HTTPException: 400 for an invalid, oversized, empty or too-short
                file; 500 for any other unexpected error.
        """
        try:
            # Validate file
            FileService.validate_file(file)

            # Read at most one byte beyond the limit: enough to detect an
            # oversized upload without loading an arbitrarily large file
            # into memory first.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            # Check file size
            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
                )

            # Decode text
            try:
                # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading
                # byte-order mark, which would otherwise stay in the text as
                # '\ufeff' (and is not removed by str.strip()).
                text = content.decode('utf-8-sig')
            except UnicodeDecodeError:
                # latin-1 maps every possible byte to a code point, so this
                # fallback cannot raise; no further error handling is needed.
                text = content.decode('latin-1')

            # Validate extracted text
            stripped_text = text.strip()
            if not stripped_text:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text"
                )
            if len(stripped_text) < FileService.MIN_TEXT_LENGTH:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes"
                )

            logger.info("Successfully extracted %d characters from %s", len(text), file.filename)

            # Remove PII using regex if requested
            pii_info = {
                "pii_removed": False,
                "pii_count": 0,
                "pii_details": []
            }
            if remove_pii:
                logger.info("Removing PII from extracted text using regex patterns...")
                pii_result = regex_pii_remover.sanitize_provider_notes(text)
                text = pii_result["sanitized_notes"]
                pii_info = {
                    "pii_removed": pii_result["was_pii_found"],
                    "pii_count": pii_result["pii_removed_count"],
                    "pii_details": pii_result["pii_details"]
                }
                if pii_result["was_pii_found"]:
                    logger.info("Removed %s PII entities using regex", pii_result["pii_removed_count"])
                else:
                    logger.info("No PII detected in text")

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                # Length of the returned text, i.e. after PII removal when enabled.
                "text_length": len(text),
                "pii_info": pii_info
            }
        except HTTPException:
            # Already a well-formed HTTP error raised above; pass it through.
            raise
        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception("Error extracting text from file: %s", e)
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {str(e)}"
            ) from e
# Singleton instance, created once at import time. FileService's methods are
# all static, so this object carries no state of its own; presumably it exists
# so callers can import `file_service` instead of the class (verify at call sites).
file_service = FileService()