import logging
import os
from typing import Any, Dict

from fastapi import HTTPException, UploadFile

from services.pii_detector import pii_detector

logger = logging.getLogger(__name__)


class FileService:
    """Service to handle file uploads and text extraction."""

    # Lower-cased file extensions (including the leading dot) that are accepted.
    ALLOWED_EXTENSIONS = {'.txt'}
    # Upper bound on the raw upload size, in bytes.
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
    # Minimum number of non-whitespace-trimmed characters required in the text.
    MIN_TEXT_LENGTH = 10

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """
        Validate uploaded file

        Args:
            file: Uploaded file object

        Raises:
            HTTPException: 400 if no file was provided or its extension is
                not listed in ``ALLOWED_EXTENSIONS``.
        """
        # Check if file exists
        if not file:
            raise HTTPException(status_code=400, detail="No file provided")

        # Check file extension. ``filename`` may be None for some uploads;
        # treat that as "no extension" so the caller gets a 400 instead of a
        # TypeError from os.path.splitext.
        file_ext = os.path.splitext(file.filename or "")[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # Sort so the message is deterministic if more extensions are added.
            allowed = ', '.join(sorted(FileService.ALLOWED_EXTENSIONS))
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {allowed} files are allowed"
            )

    @staticmethod
    def _decode_content(content: bytes) -> str:
        """Decode raw upload bytes as UTF-8, falling back to latin-1."""
        try:
            return content.decode('utf-8')
        except UnicodeDecodeError:
            # latin-1 maps every byte value to a code point, so this fallback
            # cannot fail and no further error handling is needed here.
            # NOTE(review): this also means arbitrary binary data renamed to
            # an allowed extension is accepted as text.
            return content.decode('latin-1')

    @staticmethod
    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, Any]:
        """
        Extract text content from uploaded file and optionally remove PII

        Args:
            file: Uploaded file object
            remove_pii: Whether to remove PII from extracted text (default: True)

        Returns:
            Dictionary with the keys ``text`` (the possibly sanitized text),
            ``filename``, ``file_size`` (bytes read), ``text_length``
            (length of the returned text) and ``pii_info`` (a dict with
            ``pii_removed``, ``pii_count`` and ``pii_details``).

        Raises:
            HTTPException: 400 if the file fails validation, exceeds
                ``MAX_FILE_SIZE``, is empty, or is shorter than
                ``MIN_TEXT_LENGTH`` characters after stripping; 500 for any
                other unexpected error.
        """
        try:
            # Validate file
            FileService.validate_file(file)

            # Read at most one byte past the limit: that is enough to detect
            # an oversized upload without loading all of it into memory.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            # Check file size
            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
                )

            # Decode text
            text = FileService._decode_content(content)

            # Validate extracted text
            stripped = text.strip()
            if not stripped:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text"
                )

            if len(stripped) < FileService.MIN_TEXT_LENGTH:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes"
                )

            logger.info(f"✅ Successfully extracted {len(text)} characters from {file.filename}")

            # Remove PII if requested; these defaults are returned unchanged
            # when remove_pii is False.
            pii_info = {
                "pii_removed": False,
                "pii_count": 0,
                "pii_details": []
            }

            if remove_pii:
                logger.info("🔒 Removing PII from extracted text...")
                pii_result = pii_detector.remove_pii(text)

                # From here on only the sanitized text is used and returned.
                text = pii_result["sanitized_text"]
                pii_info = {
                    "pii_removed": pii_result["was_pii_removed"],
                    "pii_count": pii_result["pii_count"],
                    "pii_details": pii_result["pii_detected"]
                }

                if pii_result["was_pii_removed"]:
                    logger.info(f"✅ Removed {pii_result['pii_count']} PII entities")
                else:
                    logger.info("✅ No PII detected in text")

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                "text_length": len(text),
                "pii_info": pii_info
            }

        except HTTPException:
            # Already a well-formed client error; pass it through untouched.
            raise
        except Exception as e:
            # logger.exception records the traceback along with the message.
            # NOTE(review): the exception text is echoed to the client in the
            # response detail — confirm that is acceptable for this service.
            logger.exception(f"❌ Error extracting text from file: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {str(e)}"
            ) from e


# Singleton instance
file_service = FileService()