import logging
import os
from typing import Any, Dict

from fastapi import UploadFile, HTTPException
# Module-level logger named after this module; used by FileService below.
logger = logging.getLogger(__name__)
class FileService:
    """Service to handle file uploads and text extraction.

    Every method is static; the class namespaces the upload limits and the
    helpers that enforce them.
    """

    # Lower-case file extensions (including the leading dot) accepted for upload.
    ALLOWED_EXTENSIONS = {'.txt'}
    # Maximum accepted upload size in bytes (10 * 1024 * 1024).
    MAX_FILE_SIZE = 10 * 1024 * 1024

    @staticmethod
    def validate_file(file: UploadFile) -> None:
        """Validate the uploaded file's presence and extension.

        Args:
            file: Uploaded file object.

        Raises:
            HTTPException: 400 if no file was provided, or if the file name
                is missing or does not end in one of ``ALLOWED_EXTENSIONS``.
        """
        if not file:
            raise HTTPException(status_code=400, detail="No file provided")

        # ``filename`` may be None; treat that as "no extension" so the
        # client gets a 400 rather than a TypeError from os.path.splitext.
        file_ext = os.path.splitext(file.filename or "")[1].lower()
        if file_ext not in FileService.ALLOWED_EXTENSIONS:
            # sorted() keeps the message deterministic if more extensions
            # are ever added to the set.
            allowed = ', '.join(sorted(FileService.ALLOWED_EXTENSIONS))
            raise HTTPException(
                status_code=400,
                detail=f"Invalid file type. Only {allowed} files are allowed"
            )

    @staticmethod
    async def extract_text_from_file(file: UploadFile) -> Dict[str, Any]:
        """Extract text content from an uploaded file.

        Args:
            file: Uploaded file object.

        Returns:
            Dictionary with keys ``text`` (decoded content), ``filename``,
            ``file_size`` (bytes read) and ``text_length`` (characters).

        Raises:
            HTTPException: 400 if validation fails, the file exceeds
                ``MAX_FILE_SIZE``, or the text is empty or shorter than 10
                non-whitespace-trimmed characters; 500 on any other error.
        """
        try:
            FileService.validate_file(file)

            # Read at most one byte past the limit: enough to detect an
            # oversized upload without loading all of it into memory.
            content = await file.read(FileService.MAX_FILE_SIZE + 1)

            file_size = len(content)
            if file_size > FileService.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size is {FileService.MAX_FILE_SIZE / (1024*1024)} MB"
                )

            try:
                # utf-8-sig also accepts plain UTF-8 and drops a leading BOM
                # so it does not end up in the extracted text.
                text = content.decode('utf-8-sig')
            except UnicodeDecodeError:
                # latin-1 maps every byte to a code point, so this fallback
                # cannot raise and needs no error handling of its own.
                text = content.decode('latin-1')

            stripped = text.strip()
            if not stripped:
                raise HTTPException(
                    status_code=400,
                    detail="File is empty or contains no readable text"
                )

            if len(stripped) < 10:
                raise HTTPException(
                    status_code=400,
                    detail="Extracted text is too short. Please provide more detailed provider notes"
                )

            logger.info(
                "Successfully extracted %d characters from %s",
                len(text),
                file.filename,
            )

            return {
                "text": text,
                "filename": file.filename,
                "file_size": file_size,
                "text_length": len(text)
            }

        except HTTPException:
            # Deliberate client-facing errors pass through untouched.
            raise
        except Exception as e:
            # Boundary handler: log with traceback, then report a 500.
            logger.exception("Error extracting text from file: %s", e)
            raise HTTPException(
                status_code=500,
                detail=f"Error processing file: {e}"
            ) from e
# Shared module-level FileService instance. All FileService methods are
# static, so this is presumably just a convenient handle for importers.
file_service = FileService()