Spaces:
Runtime error
Runtime error
| import re | |
| import base64 | |
| from langchain_core.documents import Document | |
| # helps in detecting base64 encoded strings | |
def looks_like_base64(sb):
    """Return True if *sb* is shaped like a base64 string.

    The check is purely lexical: one or more characters from the standard
    base64 alphabet (``A-Z``, ``a-z``, ``0-9``, ``+``, ``/``) followed by at
    most two ``=`` padding characters. It does not verify that the length is
    valid or that the data actually decodes.
    """
    matched = re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb)
    if matched is None:
        return False
    return True
| # helps in checking if the base64 encoded image is actually an image | |
def is_image_data(b64data):
    """Return True if the base64 payload starts with a known image signature.

    The payload is base64-decoded and its leading bytes are compared against
    the magic numbers of JPEG, PNG, GIF and WebP.

    Args:
        b64data: Base64-encoded data (``str`` or bytes-like).

    Returns:
        ``True`` when the decoded data begins with one of the supported image
        signatures; ``False`` otherwise, including when the input cannot be
        base64-decoded.
    """
    # Formats that are fully identified by their leading bytes.
    prefix_signatures = (
        b"\xff\xd8\xff",  # jpg
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",  # png
        b"\x47\x49\x46\x38",  # gif ("GIF8")
    )
    try:
        # 12 bytes are needed: WebP is only recognisable at offsets 8-11.
        header = base64.b64decode(b64data)[:12]
    except (ValueError, TypeError):
        # binascii.Error (bad padding) is a ValueError; unsupported input
        # types raise TypeError. Either way this is not image data.
        return False

    if header.startswith(prefix_signatures):
        return True

    # "RIFF" alone is a generic container header shared with WAV/AVI files;
    # a WebP image additionally carries the "WEBP" FourCC after the 4-byte
    # chunk size.
    return header[:4] == b"\x52\x49\x46\x46" and header[8:12] == b"WEBP"
| # returns a dictionary separating images and text (with table) elements | |
def split_image_text_types(docs):
    """Separate base64-encoded images from text (including table) elements.

    Each item may be a ``Document`` (its ``page_content`` is used) or a raw
    value. Bytes-like content is decoded as UTF-8; string content is used
    as-is, since ``str`` has no ``decode`` method in Python 3.

    Args:
        docs: Iterable of ``Document`` objects, strings, or bytes.

    Returns:
        A dict with two lists: ``"images"`` holds the entries that both look
        like base64 and decode to a recognised image header, ``"texts"``
        holds everything else. Input order is preserved within each list.
    """
    b64_images = []
    texts = []
    for doc in docs:
        # Unwrap Document objects; anything else is treated as the content.
        content = doc.page_content if isinstance(doc, Document) else doc
        # Only bytes need decoding; calling .decode on a str would raise.
        if isinstance(content, (bytes, bytearray)):
            content = content.decode('utf-8')
        if looks_like_base64(content) and is_image_data(content):
            b64_images.append(content)
        else:
            texts.append(content)
    return {"images": b64_images, "texts": texts}
def beautify_output(text_list):
    """Join text fragments and tidy them into sentence-per-paragraph form.

    The fragments are joined with single spaces, then a fixed sequence of
    regex substitutions is applied in order, and finally surrounding
    whitespace is stripped.

    Args:
        text_list: Iterable of strings to combine.

    Returns:
        The cleaned string, with a blank line after every ``.``, ``!`` or
        ``?``.
    """
    # Ordered (pattern, replacement) steps; the order matters.
    substitutions = (
        # Drop "[|<" markers together with an optional number after them.
        (r'\[\|\<\s*\d*\s*', ''),
        # Turn every run of newlines into exactly two newlines.
        (r'\n+', '\n\n'),
        # Collapse every whitespace run to one space. \s also matches
        # newlines, so the blank lines produced by the previous step are
        # flattened here as well.
        (r'\s+', ' '),
        # Start a new paragraph after sentence-ending punctuation.
        (r'([.!?])\s*', r'\1\n\n'),
        # Remove a bare number sitting alone between two blank lines.
        # NOTE(review): every blank line at this point directly follows
        # punctuation, so this pattern appears unable to match - confirm
        # the intended ordering of these steps.
        (r'(\n\n)\d+(\n\n)', r'\1'),
    )

    result = " ".join(text_list)
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()