Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -173,7 +173,7 @@ def chunk_markdown(
|
|
| 173 |
chunk_overlap: int = 200,
|
| 174 |
strip_headers: bool = True
|
| 175 |
) -> List[Document]:
|
| 176 |
-
"""Chunks markdown text, preserving headers in metadata."""
|
| 177 |
if not markdown_text_with_images or not markdown_text_with_images.strip():
|
| 178 |
logger.warning("chunk_markdown received empty input.")
|
| 179 |
return []
|
|
@@ -188,12 +188,15 @@ def chunk_markdown(
|
|
| 188 |
header_chunks = markdown_splitter.split_text(markdown_text_with_images)
|
| 189 |
|
| 190 |
if not header_chunks:
|
|
|
|
| 191 |
return []
|
| 192 |
|
| 193 |
final_chunks = []
|
| 194 |
if chunk_size > 0:
|
| 195 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 196 |
-
chunk_size=chunk_size,
|
|
|
|
|
|
|
| 197 |
separators=["\n\n", "\n", "(?<=\. )", "(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
|
| 198 |
add_start_index=True
|
| 199 |
)
|
|
@@ -206,17 +209,23 @@ def chunk_markdown(
|
|
| 206 |
else:
|
| 207 |
final_chunks = [chunk for chunk in header_chunks if chunk.page_content]
|
| 208 |
|
|
|
|
| 209 |
for chunk in final_chunks:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
images_in_chunk = re.findall(
|
| 211 |
-
r"!\[.*?\]\((data:image/[a-zA-Z
|
| 212 |
chunk.page_content
|
| 213 |
)
|
| 214 |
-
if
|
| 215 |
-
|
| 216 |
-
chunk.metadata["images_base64"] = images_in_chunk
|
| 217 |
|
|
|
|
| 218 |
return final_chunks
|
| 219 |
|
|
|
|
| 220 |
def get_hf_token(explicit_token: str = None) -> str:
|
| 221 |
"""Retrieve Hugging Face token with fallback mechanisms."""
|
| 222 |
global hf_token_global
|
|
|
|
| 173 |
chunk_overlap: int = 200,
|
| 174 |
strip_headers: bool = True
|
| 175 |
) -> List[Document]:
|
| 176 |
+
"""Chunks markdown text, preserving headers in metadata and extracting base64 images."""
|
| 177 |
if not markdown_text_with_images or not markdown_text_with_images.strip():
|
| 178 |
logger.warning("chunk_markdown received empty input.")
|
| 179 |
return []
|
|
|
|
| 188 |
header_chunks = markdown_splitter.split_text(markdown_text_with_images)
|
| 189 |
|
| 190 |
if not header_chunks:
|
| 191 |
+
logger.warning("No chunks created from markdown splitting.")
|
| 192 |
return []
|
| 193 |
|
| 194 |
final_chunks = []
|
| 195 |
if chunk_size > 0:
|
| 196 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 197 |
+
chunk_size=chunk_size,
|
| 198 |
+
chunk_overlap=chunk_overlap,
|
| 199 |
+
length_function=len,
|
| 200 |
separators=["\n\n", "\n", "(?<=\. )", "(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
|
| 201 |
add_start_index=True
|
| 202 |
)
|
|
|
|
| 209 |
else:
|
| 210 |
final_chunks = [chunk for chunk in header_chunks if chunk.page_content]
|
| 211 |
|
| 212 |
+
# Extract base64 images and add to metadata
|
| 213 |
for chunk in final_chunks:
|
| 214 |
+
if not hasattr(chunk, 'metadata'):
|
| 215 |
+
chunk.metadata = {}
|
| 216 |
+
|
| 217 |
+
# Improved regex to capture full base64 data URI
|
| 218 |
images_in_chunk = re.findall(
|
| 219 |
+
r"!\[.*?\]\((data:image/[a-zA-Z]+;base64,[A-Za-z0-9+/]+={0,2})\)",
|
| 220 |
chunk.page_content
|
| 221 |
)
|
| 222 |
+
chunk.metadata["images_base64"] = images_in_chunk if images_in_chunk else []
|
| 223 |
+
logger.debug(f"Chunk metadata updated with {len(images_in_chunk)} base64 images")
|
|
|
|
| 224 |
|
| 225 |
+
logger.info(f"Created {len(final_chunks)} chunks with base64 metadata")
|
| 226 |
return final_chunks
|
| 227 |
|
| 228 |
+
|
| 229 |
def get_hf_token(explicit_token: str = None) -> str:
|
| 230 |
"""Retrieve Hugging Face token with fallback mechanisms."""
|
| 231 |
global hf_token_global
|