Spaces:

sunny333
/

multimodalRAG

Runtime error

App Files Files Community

multimodalRAG / RAG_MLM /utility.py

sunny333

initial commit

568cd7b 12 months ago

raw

history blame contribute delete

2.36 kB

	import re
	import base64
	from langchain_core.documents import Document

	# helps in detecting base64 encoded strings
	def looks_like_base64(sb):
	    """Tell whether ``sb`` has the textual shape of a base64 string.

	    The check is purely lexical: one or more characters from the base64
	    alphabet (``A-Z``, ``a-z``, ``0-9``, ``+``, ``/``) followed by at most
	    two ``=`` padding characters. Nothing is decoded and the length is not
	    validated.
	    """
	    base64_shape = "^[A-Za-z0-9+/]+[=]{0,2}$"
	    # A match object is always truthy, so bool() maps match/None to True/False.
	    return bool(re.match(base64_shape, sb))

	# checks whether base64-encoded data is actually an image
	def is_image_data(b64data):
	    """Report whether base64-encoded data starts with a known image signature.

	    The payload is base64-decoded and its leading bytes are compared with
	    the magic numbers of JPEG, PNG, GIF and WebP.

	    Args:
	        b64data: Base64-encoded payload, as accepted by ``base64.b64decode``.

	    Returns:
	        True if the decoded data begins with one of the recognised image
	        signatures; False otherwise, including when the input cannot be
	        base64-decoded.
	    """
	    # Formats identified by a fixed byte prefix alone.
	    prefix_signatures = (
	        b"\xff\xd8\xff",  # jpg
	        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",  # png
	        b"\x47\x49\x46\x38",  # gif
	    )
	    try:
	        # 12 bytes cover the longest prefix above and the RIFF/WEBP tags.
	        header = base64.b64decode(b64data)[:12]
	    except (ValueError, TypeError):
	        # binascii.Error (bad padding/characters) is a ValueError subclass;
	        # TypeError covers inputs that are neither str nor bytes-like.
	        return False

	    if header.startswith(prefix_signatures):
	        return True

	    # "RIFF" on its own only names the container, which WAV and AVI files
	    # share; a WebP image additionally carries "WEBP" at byte offset 8.
	    return header[:4] == b"\x52\x49\x46\x46" and header[8:12] == b"WEBP"

	# returns a dictionary separating images and text (with table) elements
	def split_image_text_types(docs):
	    """Split retrieved items into base64-encoded images and plain texts.

	    Each item may be a ``Document`` (its ``page_content`` is used) or a raw
	    value. Bytes-like content is decoded as UTF-8; content that is already
	    a string is used unchanged, so ``str`` inputs no longer fail on a
	    missing ``.decode`` method.

	    Args:
	        docs: Iterable of ``Document`` objects, ``bytes`` or ``str`` values.

	    Returns:
	        A dict with two lists: ``"images"`` holds the items that both look
	        like base64 and decode to a known image signature, ``"texts"``
	        holds everything else (in input order).
	    """
	    b64_images = []
	    texts = []
	    for doc in docs:
	        # Unwrap Document objects; other items are treated as raw content.
	        content = doc.page_content if isinstance(doc, Document) else doc
	        # Only bytes-like payloads need decoding; str has no .decode().
	        if isinstance(content, (bytes, bytearray)):
	            content = content.decode('utf-8')
	        if looks_like_base64(content) and is_image_data(content):
	            b64_images.append(content)
	        else:
	            texts.append(content)
	    return {"images": b64_images, "texts": texts}

	def beautify_output(text_list):
	    """Join text fragments and reflow them into one sentence per paragraph.

	    Steps, in order:
	      1. Join the fragments with single spaces.
	      2. Strip extraction artifacts matched by the legacy pattern.
	      3. Drop numbers that sit alone on their own line (for example stray
	         page numbers). This must happen while newlines still exist;
	         previously it ran after all newlines had been collapsed and
	         therefore could never match.
	      4. Collapse every whitespace run (newlines included) to one space.
	      5. Start a new paragraph after sentence-ending punctuation that is
	         followed by whitespace, so decimals such as ``3.14`` and
	         ellipses are no longer broken apart.

	    Args:
	        text_list: Iterable of strings to combine.

	    Returns:
	        The cleaned text with sentences separated by blank lines and no
	        leading or trailing whitespace.
	    """
	    # Combine list into single text
	    raw_text = " ".join(text_list)

	    # Legacy artifact pattern, kept as-is: it removes the literal two
	    # characters "[\" or a "<" followed by one whitespace character, one
	    # digit and any trailing whitespace.
	    # NOTE(review): the original comment hints that a character class such
	    # as [\[\\|<] may have been intended - confirm before changing.
	    cleaned_text = re.sub(r'\[\\|\<\s\d\s*', '', raw_text)

	    # Remove numbers standing alone on a line (bounded by newlines on both
	    # sides). The lookahead leaves the closing newline in place so that
	    # consecutive artifact lines are all removed.
	    cleaned_text = re.sub(r'\n[ \t]*\d+[ \t]*(?=\n)', '', cleaned_text)

	    # Collapse all whitespace, including newlines, to single spaces
	    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

	    # Paragraph break after ., ! or ? only when whitespace follows
	    cleaned_text = re.sub(r'([.!?])\s+', r'\1\n\n', cleaned_text)

	    # Strip leading/trailing spaces
	    return cleaned_text.strip()