File size: 4,662 Bytes
a8f12f6
 
 
 
d8473b6
a8f12f6
 
 
 
 
d8473b6
a8f12f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd20bd2
a8f12f6
d8473b6
a8f12f6
 
 
fd20bd2
a8f12f6
 
fd20bd2
a8f12f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b51e25
fd20bd2
d8473b6
fd20bd2
 
 
 
 
 
 
0b51e25
d8473b6
fd20bd2
d8473b6
fd20bd2
d8473b6
 
 
fd20bd2
 
d8473b6
0b51e25
fd20bd2
0b51e25
a8f12f6
 
 
 
 
fd20bd2
 
a8f12f6
 
 
 
 
0b51e25
a8f12f6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import logging
import os
from typing import Any, Dict

from fastapi import HTTPException, UploadFile

from services.regex_pii_remover import regex_pii_remover

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class FileService:
    """Service to handle file uploads and text extraction with PII removal.

    Every method is a static method; the class groups the upload helpers
    together with the limits they enforce.
    """

    # Accepted file extensions (lower-case, including the leading dot).
    ALLOWED_EXTENSIONS = {'.txt'}
    # Maximum accepted upload size, in bytes.
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
    # Minimum number of non-whitespace-trimmed characters required in the text.
    MIN_TEXT_LENGTH = 10

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """
        Validate uploaded file

        Args:
            file: Uploaded file object

        Raises:
            HTTPException: 400 if the file or its filename is missing, or if
                the filename's extension is not in ALLOWED_EXTENSIONS
        """
        # A missing filename would make os.path.splitext raise TypeError,
        # so treat it the same as a missing file.
        if not file or not file.filename:
            raise HTTPException(status_code=400, detail="No file provided")

        file_ext = os.path.splitext(file.filename)[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # Sorted so the message is stable regardless of set iteration order.
            allowed = ', '.join(sorted(FileService.ALLOWED_EXTENSIONS))
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {allowed} files are allowed"
            )

    @staticmethod
    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, Any]:
        """
        Extract text content from uploaded file and optionally remove PII using regex

        Args:
            file: Uploaded file object
            remove_pii: Whether to remove PII from extracted text (default: True)

        Returns:
            Dictionary with the keys:
                "text": decoded text (sanitized when remove_pii is True)
                "filename": the upload's filename
                "file_size": size of the raw upload in bytes
                "text_length": number of characters in the returned text
                "pii_info": dict with "pii_removed", "pii_count", "pii_details"

        Raises:
            HTTPException: 400 for an invalid, oversized, empty or too-short
                file; 500 for any unexpected error during processing
        """
        try:
            # Validate file
            FileService.validate_file(file)

            # Read at most one byte past the limit: enough to detect an
            # oversized upload without loading all of it into memory.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            # Check file size
            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
                )

            # Decode text. 'utf-8-sig' behaves like UTF-8 but drops a leading
            # byte-order mark, which str.strip() would not remove.
            try:
                text = content.decode('utf-8-sig')
            except UnicodeDecodeError:
                # latin-1 maps every byte value to a code point, so this
                # fallback cannot raise.
                text = content.decode('latin-1')

            # Validate extracted text
            stripped = text.strip()
            if not stripped:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text"
                )

            if len(stripped) < FileService.MIN_TEXT_LENGTH:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes"
                )

            logger.info(f"Successfully extracted {len(text)} characters from {file.filename}")

            # Remove PII using regex if requested
            pii_info = {
                "pii_removed": False,
                "pii_count": 0,
                "pii_details": []
            }

            if remove_pii:
                logger.info("Removing PII from extracted text using regex patterns...")
                pii_result = regex_pii_remover.sanitize_provider_notes(text)

                text = pii_result["sanitized_notes"]
                pii_info = {
                    "pii_removed": pii_result["was_pii_found"],
                    "pii_count": pii_result["pii_removed_count"],
                    "pii_details": pii_result["pii_details"]
                }

                if pii_result["was_pii_found"]:
                    logger.info(f"Removed {pii_result['pii_removed_count']} PII entities using regex")
                else:
                    logger.info("No PII detected in text")

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                # Measured after sanitization, so it may differ from the
                # character count logged above.
                "text_length": len(text),
                "pii_info": pii_info
            }

        except HTTPException:
            # Deliberate client-facing errors pass through unchanged.
            raise
        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception(f"Error extracting text from file: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {str(e)}"
            ) from e


# Shared module-level instance, created when this module is imported.
# Nothing here prevents other FileService instances from being created.
file_service = FileService()