File size: 4,631 Bytes
a8f12f6
 
 
 
fd20bd2
a8f12f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd20bd2
a8f12f6
fd20bd2
a8f12f6
 
 
fd20bd2
a8f12f6
 
fd20bd2
a8f12f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd20bd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8f12f6
 
 
 
 
fd20bd2
 
a8f12f6
 
 
 
 
fd20bd2
a8f12f6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import logging
import os
from typing import Any, Dict

from fastapi import UploadFile, HTTPException

from services.pii_detector import pii_detector

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class FileService:
    """Service to handle file uploads and text extraction.

    Uploads are restricted to the extensions in ``ALLOWED_EXTENSIONS`` and to
    at most ``MAX_FILE_SIZE`` bytes. Extracted text must contain at least
    ``MIN_TEXT_LENGTH`` characters after surrounding whitespace is stripped.
    """

    ALLOWED_EXTENSIONS = {'.txt'}
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
    MIN_TEXT_LENGTH = 10  # minimum stripped length of the extracted text

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """
        Validate uploaded file

        Args:
            file: Uploaded file object

        Raises:
            HTTPException: 400 if no file was provided, or if the file's
                extension (case-insensitive) is not in ALLOWED_EXTENSIONS.
                An upload without a filename has no extension and is
                therefore rejected as an invalid file type.
        """
        # Check if file exists
        if not file:
            raise HTTPException(status_code=400, detail="No file provided")

        # The filename may be None; os.path.splitext(None) would raise
        # TypeError, so a missing name is treated as an empty one.
        file_ext = os.path.splitext(file.filename or "")[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # sorted() keeps the message deterministic if more extensions
            # are ever added to the set.
            allowed = ', '.join(sorted(FileService.ALLOWED_EXTENSIONS))
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {allowed} files are allowed"
            )

    @staticmethod
    def _decode_content(content: bytes) -> str:
        """
        Decode raw upload bytes into text

        UTF-8 is tried first; 'utf-8-sig' is used so that a leading byte-order
        mark is dropped instead of ending up in the text. If the bytes are not
        valid UTF-8, latin-1 is used as a fallback. latin-1 maps every byte
        value to a character, so the fallback cannot fail.

        Args:
            content: Raw bytes read from the upload

        Returns:
            The decoded text
        """
        try:
            return content.decode('utf-8-sig')
        except UnicodeDecodeError:
            return content.decode('latin-1')

    @staticmethod
    async def extract_text_from_file(file: UploadFile, remove_pii: bool = True) -> Dict[str, Any]:
        """
        Extract text content from uploaded file and optionally remove PII

        Args:
            file: Uploaded file object
            remove_pii: Whether to remove PII from extracted text (default: True)

        Returns:
            Dictionary with the keys:
                text: the extracted text (sanitized when remove_pii is True)
                filename: the upload's filename
                file_size: number of bytes read from the upload
                text_length: length of the returned text
                pii_info: dict with pii_removed, pii_count and pii_details

        Raises:
            HTTPException: 400 if validation fails, the file exceeds
                MAX_FILE_SIZE, or the text is empty or shorter than
                MIN_TEXT_LENGTH; 500 for any other error during processing.
        """
        try:
            # Validate file
            FileService.validate_file(file)

            # Read at most one byte past the limit: enough to detect an
            # oversized upload without loading all of it into memory.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            # Check file size
            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
                )

            # Decode text
            text = FileService._decode_content(content)

            # Validate extracted text (strip once, reuse for both checks)
            stripped_length = len(text.strip())
            if stripped_length == 0:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text"
                )

            if stripped_length < FileService.MIN_TEXT_LENGTH:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes"
                )

            logger.info(f"✅ Successfully extracted {len(text)} characters from {file.filename}")

            # Default PII report, used when PII removal is not requested
            pii_info = {
                "pii_removed": False,
                "pii_count": 0,
                "pii_details": []
            }

            if remove_pii:
                logger.info("🔒 Removing PII from extracted text...")
                pii_result = pii_detector.remove_pii(text)

                # From here on only the sanitized text is used and returned
                text = pii_result["sanitized_text"]
                pii_info = {
                    "pii_removed": pii_result["was_pii_removed"],
                    "pii_count": pii_result["pii_count"],
                    "pii_details": pii_result["pii_detected"]
                }

                if pii_result["was_pii_removed"]:
                    logger.info(f"✅ Removed {pii_result['pii_count']} PII entities")
                else:
                    logger.info("✅ No PII detected in text")

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                "text_length": len(text),
                "pii_info": pii_info
            }

        except HTTPException:
            # Already a well-formed client/server error; pass it through
            raise
        except Exception as e:
            # logger.exception records the traceback along with the message
            logger.exception(f"❌ Error extracting text from file: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {str(e)}"
            ) from e


# Singleton instance: a module-level FileService object created at import time.
# NOTE(review): the FileService methods are all @staticmethod, so this instance
# holds no state; presumably it exists so callers can import `file_service`
# directly — confirm against callers before removing.
file_service = FileService()