# rag-visualizer/services/file_validation.py
# Ahmed Sadik
# fix: correct argument order in get_document_text call for chunk context retrieval
# e4662f9
import re
from fastapi import UploadFile
def validate_mime_type(file: UploadFile):
    """Check that the upload declares the PDF media type.

    The declared content type is compared against ``application/pdf``
    after dropping any ``;``-separated parameters and normalising case,
    since media type names are case-insensitive and may carry parameters
    (e.g. ``application/pdf; charset=binary``).

    Args:
        file: The uploaded file; only ``file.content_type`` is read.

    Returns:
        ``True`` if the declared media type is ``application/pdf``;
        otherwise ``False``, after printing a diagnostic marker.
    """
    # A missing content type is treated as "not a PDF" rather than an error.
    declared = file.content_type or ""
    media_type = declared.split(";", 1)[0].strip().lower()
    if media_type != "application/pdf":
        print("[!] validate_mime_type")
        return False
    return True
def validate_extension(file: UploadFile):
    """Check that the uploaded file's name ends with ``.pdf``.

    The comparison ignores case, so ``REPORT.PDF`` is accepted. A missing
    filename is rejected instead of raising ``AttributeError``.

    Args:
        file: The uploaded file; only ``file.filename`` is read.

    Returns:
        ``True`` if the filename ends with ``.pdf`` (any case);
        otherwise ``False``, after printing a diagnostic marker.
    """
    name = file.filename or ""
    if not name.lower().endswith(".pdf"):
        print("[!] validate_extension")
        return False
    return True
async def validate_magic_bytes(file: UploadFile):
    """Check that the file content starts with the PDF signature ``%PDF-``.

    Reads the first five bytes of the upload and then rewinds the file to
    offset 0 so later consumers see the full content.

    Args:
        file: The uploaded file; its async ``read`` and ``seek`` are used.

    Returns:
        ``True`` if the first five bytes are exactly ``b"%PDF-"``;
        otherwise ``False``, after printing a diagnostic marker.
    """
    signature = b"%PDF-"
    header = await file.read(len(signature))
    await file.seek(0)  # Reset file pointer after reading
    # A short read can never equal the 5-byte signature, so the equality
    # test alone also rejects files shorter than five bytes.
    if header != signature:
        print("[!] validate_magic_bytes")
        return False
    return True
async def validate_document(file: UploadFile):
    """Validate the uploaded file.

    Runs the checks in order and stops at the first failure: the
    underlying file object must be present, then the declared MIME type,
    the filename extension and finally the leading magic bytes must all
    pass.

    Args:
        file: The uploaded file to validate.

    Returns:
        ``True`` if every check passes, otherwise ``False``.
    """
    if file.file is None:
        print("[!] file is None")
        return False
    # Cheap metadata checks run first; the content is only read when both pass.
    if not (validate_mime_type(file) and validate_extension(file)):
        return False
    return await validate_magic_bytes(file)
def sanitize_for_display(filename: str) -> str:
    """Return a cleaned, length-limited version of *filename* for display.

    Characters other than word characters, spaces, hyphens, underscores,
    dots and parentheses are removed, and surrounding whitespace is
    stripped. Names longer than 50 characters are shortened to exactly 50
    by keeping the start, inserting ``...`` and keeping the last 10
    characters (so the extension stays visible).

    Args:
        filename: The raw filename supplied by the client.

    Returns:
        The sanitized name, or ``"Untitled Document.pdf"`` when nothing
        displayable remains (including an empty or missing filename).
    """
    max_length = 50
    marker = "..."
    tail_length = 10
    fallback = "Untitled Document.pdf"

    if not filename:
        return fallback

    # Strip before measuring so surrounding whitespace does not count
    # towards the length limit.
    clean_name = re.sub(r'[^\w \-_.\(\)]', '', filename).strip()
    if len(clean_name) > max_length:
        # Size the head so the shortened name is exactly max_length long
        # and truncation can never make a name longer than it was.
        head_length = max_length - len(marker) - tail_length
        clean_name = clean_name[:head_length] + marker + clean_name[-tail_length:]
    return clean_name or fallback