multimodalRAG / RAG_MLM /utility.py
sunny333's picture
initial commit
568cd7b
import re
import base64
from langchain_core.documents import Document
# helps in detecting base64 encoded strings
def looks_like_base64(sb):
    """Report whether ``sb`` has the shape of a base64 string.

    Only the character set is inspected: one or more characters from the
    standard base64 alphabet, optionally followed by up to two ``=``
    padding characters. Nothing is decoded here.
    """
    shape_match = re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb)
    if shape_match is None:
        return False
    return True
# helps in checking if the base64 encoded image is actually an image
def is_image_data(b64data):
    """Check whether base64 data decodes to a JPEG, PNG, GIF or WebP image.

    The payload is decoded and its leading bytes are compared against the
    magic numbers of the supported formats.

    Args:
        b64data: Base64-encoded payload (str or bytes-like).

    Returns:
        True if the decoded data starts with a known image signature,
        False otherwise, including when the input cannot be decoded.
    """
    # Signatures that fully identify their format at offset 0.
    image_signatures = (
        b"\xff\xd8\xff",  # jpg
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",  # png
        b"\x47\x49\x46\x38",  # gif
    )
    try:
        # 12 bytes are needed to see the WebP tag that follows the RIFF header.
        header = base64.b64decode(b64data)[:12]
    except (ValueError, TypeError):
        # binascii.Error (bad padding) is a ValueError; non-string,
        # non-bytes input raises TypeError.
        return False
    if header.startswith(image_signatures):
        return True
    # "RIFF" alone also matches WAV and AVI files. A WebP file is
    # "RIFF" + 4 size bytes + "WEBP", so the form type must be checked too.
    return header[:4] == b"\x52\x49\x46\x46" and header[8:12] == b"WEBP"
# returns a dictionary separating images and text (with table) elements
def split_image_text_types(docs):
    """Split retrieved items into base64-encoded images and texts (with tables).

    Each item may be a ``Document`` (its ``page_content`` is used) or a raw
    payload. Bytes-like payloads are decoded as UTF-8; ``str`` payloads are
    used as they are.

    Args:
        docs: Iterable of ``Document`` objects, ``bytes`` or ``str``.

    Returns:
        A dict with two lists: ``"images"`` holds the payloads that look
        like base64 and decode to a known image format, ``"texts"`` holds
        everything else.
    """
    b64_images = []
    texts = []
    for doc in docs:
        # Unwrap a Document to its content; any other item is the payload.
        content = doc.page_content if isinstance(doc, Document) else doc
        # Only bytes need decoding. Calling .decode() on a str (which is
        # what Document.page_content normally holds) raises AttributeError.
        if isinstance(content, (bytes, bytearray)):
            content = content.decode('utf-8')
        if looks_like_base64(content) and is_image_data(content):
            b64_images.append(content)
        else:
            texts.append(content)
    return {"images": b64_images, "texts": texts}
def beautify_output(text_list):
    """Join text fragments and tidy them into one sentence per paragraph.

    Steps, in order:
      1. Join the fragments with single spaces.
      2. Remove ``[|<`` markers together with any number that trails them.
      3. Remove lines that contain nothing but digits (stray numbers
         sitting between two line breaks).
      4. Collapse every run of whitespace, newlines included, to one space.
      5. Start a new paragraph after ``.``, ``!`` or ``?`` when whitespace
         follows it.

    Args:
        text_list: Iterable of strings to combine.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Combine list into single text
    raw_text = " ".join(text_list)

    # Remove "[|<" markers and the optional number right after them
    cleaned_text = re.sub(r'\[\|\<\s*\d*\s*', '', raw_text)

    # Remove digit-only lines. This must happen before whitespace is
    # collapsed: once the newlines are gone, a standalone number cannot be
    # told apart from one that belongs to a sentence. A newline is required
    # on both sides so an input that is just a number is left alone.
    cleaned_text = re.sub(r'\n[^\S\n]*\d+[^\S\n]*(?=\n)', '', cleaned_text)

    # Replace every whitespace run (spaces, tabs, newlines) with one space
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Paragraph break after sentence-ending punctuation. Whitespace is
    # required after the mark so decimals ("3.14") and dotted names
    # ("example.com") are not split in the middle.
    cleaned_text = re.sub(r'([.!?])\s+', r'\1\n\n', cleaned_text)

    # Strip leading/trailing spaces
    return cleaned_text.strip()