Spaces:
Sleeping
Sleeping
| """ | |
| Hugging Face Persistent Storage Helper | |
| Provides file storage operations using Hugging Face persistent storage directory | |
| """ | |
| import os | |
| import base64 | |
| import logging | |
| from typing import Optional, Dict, List | |
| from pathlib import Path | |
logger = logging.getLogger(__name__)

# Root of the storage tree: the HF_HOME environment variable when it is set,
# otherwise "/persistent".
HF_PERSIST_DIR = os.environ.get("HF_HOME", "/persistent")
if not os.path.exists(HF_PERSIST_DIR):
    # The configured root is missing, so switch to a directory relative to
    # the current working directory and create it.
    HF_PERSIST_DIR = "./hf_persistent"
    os.makedirs(HF_PERSIST_DIR, exist_ok=True)

# One subdirectory per content type, all directly under the storage root.
IMAGES_DIR, SUMMARIES_DIR, FIGURE_INDEX_DIR = (
    os.path.join(HF_PERSIST_DIR, leaf)
    for leaf in ("images", "summaries", "figure_index")
)

# Create the content directories at import time; existing ones are left alone.
for _content_dir in (IMAGES_DIR, SUMMARIES_DIR, FIGURE_INDEX_DIR):
    os.makedirs(_content_dir, exist_ok=True)
del _content_dir  # keep the loop variable out of the module namespace
def get_image_path(image_id: str, folder_prefix: str = "pdf_images") -> str:
    """
    Build the ``.jpg`` file path for an image

    Note that this also creates the ``IMAGES_DIR/<folder_prefix>`` folder
    when it is missing, so even a lookup has a filesystem side effect.

    Args:
        image_id: Unique identifier for the image (used as the file stem)
        folder_prefix: Subfolder of the images directory

    Returns:
        str: Path of the form ``IMAGES_DIR/<folder_prefix>/<image_id>.jpg``
    """
    target_dir = os.path.join(IMAGES_DIR, folder_prefix)
    os.makedirs(target_dir, exist_ok=True)
    file_name = f"{image_id}.jpg"
    return os.path.join(target_dir, file_name)
def get_summary_path(image_id: str) -> str:
    """
    Build the ``.txt`` file path for an image summary

    Args:
        image_id: Unique identifier for the image (used as the file stem)

    Returns:
        str: Path of the form ``SUMMARIES_DIR/<image_id>.txt``
    """
    summary_file = f"{image_id}.txt"
    return os.path.join(SUMMARIES_DIR, summary_file)
async def store_image(image_id: str, image_data: bytes, folder_prefix: str = "pdf_images") -> bool:
    """
    Write raw image bytes to Hugging Face persistent storage

    Args:
        image_id: Unique identifier for the image
        image_data: Image data as bytes
        folder_prefix: Subfolder of the images directory
            (e.g., 'pdf_images', 'pptx_images')

    Returns:
        bool: True once the file has been written; False if any exception
        was raised along the way (the error is logged, not re-raised)
    """
    try:
        destination = get_image_path(image_id, folder_prefix)
        with open(destination, mode="wb") as out_file:
            out_file.write(image_data)
        logger.info(f"Stored image in HF persistent storage: {destination}")
    except Exception as exc:
        logger.error(f"Error storing image {image_id}: {exc}")
        return False
    return True
async def retrieve_image(image_id: str, folder_prefix: str = "pdf_images") -> Optional[Dict]:
    """
    Load a stored image and return it base64-encoded with basic metadata

    Args:
        image_id: Unique identifier for the image
        folder_prefix: Subfolder of the images directory to look in
            (e.g., 'pdf_images', 'pptx_images')

    Returns:
        Dict with the keys "image_id", "base64_data", "url", "size" (byte
        count of the raw file) and "folder_prefix"; None when the file does
        not exist or any exception is raised (the failure is logged)
    """
    try:
        source_path = get_image_path(image_id, folder_prefix)
        if os.path.exists(source_path):
            with open(source_path, mode="rb") as in_file:
                raw_bytes = in_file.read()
            encoded = base64.b64encode(raw_bytes).decode('utf-8')
            return {
                "image_id": image_id,
                "base64_data": encoded,
                # Not a fetchable address: just the local path behind an
                # "hf://" prefix.
                "url": f"hf://{source_path}",
                "size": len(raw_bytes),
                "folder_prefix": folder_prefix,
            }
        logger.warning(f"Image not found: {source_path}")
        return None
    except Exception as exc:
        logger.error(f"Error retrieving image {image_id}: {exc}")
        return None
async def store_image_summary(image_id: str, summary: str) -> bool:
    """
    Write an image summary to Hugging Face persistent storage as UTF-8 text

    Args:
        image_id: Unique identifier for the image
        summary: Image summary text

    Returns:
        bool: True once the file has been written; False if any exception
        was raised along the way (the error is logged, not re-raised)
    """
    try:
        destination = get_summary_path(image_id)
        with open(destination, mode="w", encoding="utf-8") as out_file:
            out_file.write(summary)
        logger.info(f"Stored image summary in HF persistent storage: {destination}")
    except Exception as exc:
        logger.error(f"Error storing image summary for {image_id}: {exc}")
        return False
    return True
async def retrieve_image_summary(image_id: str) -> Optional[str]:
    """
    Read an image summary back from Hugging Face persistent storage

    Args:
        image_id: Unique identifier for the image

    Returns:
        str: The full text of the summary file; None when the file does not
        exist or any exception is raised (the failure is logged)
    """
    try:
        source_path = get_summary_path(image_id)
        if os.path.exists(source_path):
            with open(source_path, mode="r", encoding="utf-8") as in_file:
                return in_file.read()
        logger.warning(f"Image summary not found: {source_path}")
        return None
    except Exception as exc:
        logger.error(f"Error retrieving image summary for {image_id}: {exc}")
        return None
async def get_all_image_summaries() -> Dict[str, str]:
    """
    Retrieve all image summaries from Hugging Face persistent storage

    Every ``*.txt`` entry in the summaries directory is treated as one
    summary whose image id is the file name without its trailing ``.txt``.
    Summaries that come back empty or unreadable are left out.

    Returns:
        Dict mapping image_id to summary. On an unexpected error the
        failure is logged and whatever was collected so far is returned.
    """
    suffix = ".txt"
    summaries: Dict[str, str] = {}
    try:
        if os.path.exists(SUMMARIES_DIR):
            for filename in os.listdir(SUMMARIES_DIR):
                if not filename.endswith(suffix):
                    continue
                # Strip only the trailing extension. str.replace would also
                # remove ".txt" occurring inside the name ("a.txt.b.txt" ->
                # "a.b"), producing an id whose file does not exist.
                image_id = filename[: -len(suffix)]
                summary = await retrieve_image_summary(image_id)
                if summary:
                    summaries[image_id] = summary
        logger.info(f"Retrieved {len(summaries)} image summaries from HF persistent storage")
    except Exception as e:
        logger.error(f"Error retrieving all image summaries: {e}")
    return summaries
async def clear_all_images(folder_prefix: Optional[str] = None) -> bool:
    """
    Delete stored ``.jpg`` images from Hugging Face persistent storage

    Args:
        folder_prefix: Subfolder of the images directory to clear
            (e.g., 'pdf_images'). When None (or any other falsy value such
            as an empty string), every subfolder of the images directory
            is cleared instead.

    Returns:
        bool: True when the sweep finished; False if any exception was
        raised (the error is logged, not re-raised)
    """

    def _remove_jpgs(directory: str) -> int:
        """Delete each ``.jpg`` entry directly inside *directory*; return the count."""
        removed = 0
        for entry in os.listdir(directory):
            if entry.endswith('.jpg'):
                os.remove(os.path.join(directory, entry))
                removed += 1
        return removed

    try:
        deleted_count = 0
        if folder_prefix:
            single_folder = os.path.join(IMAGES_DIR, folder_prefix)
            if os.path.exists(single_folder):
                deleted_count += _remove_jpgs(single_folder)
        elif os.path.exists(IMAGES_DIR):
            # No specific folder requested: walk every subdirectory.
            for child in os.listdir(IMAGES_DIR):
                child_path = os.path.join(IMAGES_DIR, child)
                if os.path.isdir(child_path):
                    deleted_count += _remove_jpgs(child_path)
        logger.info(f"Deleted {deleted_count} images from HF persistent storage")
        return True
    except Exception as exc:
        logger.error(f"Error clearing images: {exc}")
        return False
async def clear_all_summaries() -> bool:
    """
    Delete every ``.txt`` summary file from Hugging Face persistent storage

    Returns:
        bool: True when the sweep finished; False if any exception was
        raised (the error is logged, not re-raised)
    """
    try:
        deleted_count = 0
        if os.path.exists(SUMMARIES_DIR):
            summary_files = [
                entry for entry in os.listdir(SUMMARIES_DIR) if entry.endswith('.txt')
            ]
            for entry in summary_files:
                os.remove(os.path.join(SUMMARIES_DIR, entry))
                deleted_count += 1
        logger.info(f"Deleted {deleted_count} image summaries from HF persistent storage")
        return True
    except Exception as exc:
        logger.error(f"Error clearing summaries: {exc}")
        return False
async def store_figure_index_entry(figure_key: str, figure_metadata: Dict) -> bool:
    """
    Store a figure index entry in Hugging Face persistent storage

    The metadata is written as indented, non-ASCII-escaped JSON to
    ``FIGURE_INDEX_DIR/<figure_key>.json``.

    Args:
        figure_key: Figure key (e.g., "figure_5"), used as the file stem
        figure_metadata: Dictionary containing figure metadata; it must be
            JSON-serializable

    Returns:
        bool: True if successful, False otherwise (the error is logged,
        not re-raised)
    """
    try:
        import json
        index_path = os.path.join(FIGURE_INDEX_DIR, f"{figure_key}.json")
        # Serialize before opening the file. json.dump streams into a file
        # that has already been truncated, so a non-serializable value would
        # leave a corrupt partial file and destroy any previous valid entry.
        payload = json.dumps(figure_metadata, ensure_ascii=False, indent=2)
        with open(index_path, "w", encoding="utf-8") as f:
            f.write(payload)
        logger.info(f"Stored figure index entry in HF persistent storage: {index_path}")
        return True
    except Exception as e:
        logger.error(f"Error storing figure index entry {figure_key}: {e}")
        return False
async def get_figure_index_entry(figure_key: str) -> Optional[Dict]:
    """
    Load a figure index entry from Hugging Face persistent storage

    Args:
        figure_key: Figure key (e.g., "figure_5"), used as the file stem

    Returns:
        Dict: The parsed JSON content of ``FIGURE_INDEX_DIR/<figure_key>.json``;
        None when the file does not exist or any exception is raised
        (the failure is logged)
    """
    try:
        import json
        entry_path = os.path.join(FIGURE_INDEX_DIR, f"{figure_key}.json")
        if os.path.exists(entry_path):
            with open(entry_path, mode="r", encoding="utf-8") as in_file:
                return json.load(in_file)
        logger.warning(f"Figure index entry not found: {entry_path}")
        return None
    except Exception as exc:
        logger.error(f"Error retrieving figure index entry {figure_key}: {exc}")
        return None