Spaces:

42charlie
/

rag-visualizer

Sleeping

rag-visualizer / services /file_validation.py

Ahmed Sadik

fix: correct argument order in get_document_text call for chunk context retrieval

e4662f9 26 days ago

1.34 kB

	import re
	from fastapi import UploadFile

	def validate_mime_type(file: UploadFile):
	    """Check that the upload declares the PDF media type.

	    Media types are case-insensitive and may be followed by parameters
	    (``application/pdf; charset=binary``), so only the normalized
	    ``type/subtype`` part is compared.

	    Args:
	        file: The uploaded file; only its ``content_type`` is read.

	    Returns:
	        True when the declared media type is ``application/pdf``; False
	        otherwise, including when no content type was declared.
	    """
	    declared = file.content_type or ""
	    # Drop any ";param=value" suffix and normalize case before comparing.
	    media_type = declared.split(";", 1)[0].strip().lower()
	    if media_type != "application/pdf":
	        print("[!] validate_mime_type")
	        return False
	    return True
	def validate_extension(file: UploadFile):
	    """Check that the uploaded file's name ends with ``.pdf``.

	    The comparison is case-insensitive so names such as ``REPORT.PDF`` are
	    accepted, and a missing filename is treated as a failed check instead
	    of raising ``AttributeError``.

	    Args:
	        file: The uploaded file; only its ``filename`` is read.

	    Returns:
	        True when the filename has a ``.pdf`` extension; False otherwise.
	    """
	    filename = file.filename or ""
	    if not filename.lower().endswith(".pdf"):
	        print("[!] validate_extension")
	        return False
	    return True
	async def validate_magic_bytes(file: UploadFile):
	    """Check that the upload's content starts with the PDF signature.

	    The stream is rewound before reading so the check always inspects the
	    first bytes of the file regardless of the current position, and rewound
	    again afterwards so later readers see the whole content.

	    Args:
	        file: The uploaded file; must support awaitable ``read``/``seek``.

	    Returns:
	        True when the first five bytes are ``b"%PDF-"``; False otherwise
	        (a file shorter than five bytes can never match).
	    """
	    await file.seek(0)
	    magic_bytes = await file.read(5)
	    await file.seek(0)  # Reset file pointer after reading
	    if magic_bytes != b"%PDF-":
	        print("[!] validate_magic_bytes")
	        return False
	    return True

	async def validate_document(file: UploadFile):
	    """Validate the uploaded file.

	    Runs the cheap metadata checks first (declared MIME type, then filename
	    extension) and only reads from the stream for the magic-byte check when
	    both pass. Each failing check prints its own diagnostic.

	    Args:
	        file: The uploaded file to validate.

	    Returns:
	        True when every check passes; False when the upload (or its
	        underlying file object) is missing or any check fails.
	    """
	    # Guard the upload object itself so a missing upload is reported
	    # instead of raising AttributeError on ``file.file``.
	    if file is None or file.file is None:
	        print("[!] file is None")
	        return False
	    if not validate_mime_type(file):
	        return False
	    if not validate_extension(file):
	        return False
	    if not await validate_magic_bytes(file):
	        return False
	    return True

	def sanitize_for_display(filename: str) -> str:
	    """Return a display-safe, length-limited version of *filename*.

	    Characters other than word characters, spaces, hyphens, underscores,
	    dots and parentheses are removed and surrounding spaces are trimmed.
	    Names longer than 50 characters are shortened to exactly 50 by keeping
	    the start, an ellipsis, and the last 10 characters (which preserves
	    the extension).

	    Args:
	        filename: The raw, untrusted filename; ``None`` is treated as empty.

	    Returns:
	        The cleaned name, or ``"Untitled Document.pdf"`` when nothing
	        displayable remains.
	    """
	    max_length = 50
	    ellipsis = "..."
	    tail_length = 10
	    # Size the head so head + ellipsis + tail never exceeds max_length.
	    head_length = max_length - len(ellipsis) - tail_length

	    # Remove any characters that are not letters, numbers, spaces, or common punctuation
	    clean_name = re.sub(r'[^\w \-_.\(\)]', '', filename or "").strip()

	    # Trim before measuring so padding does not count toward the limit.
	    if len(clean_name) > max_length:
	        clean_name = clean_name[:head_length] + ellipsis + clean_name[-tail_length:]

	    return clean_name or "Untitled Document.pdf"