rag-app / hf_storage.py
bhavinmatariya's picture
Upload 13 files
3506c42 verified
"""
Hugging Face Persistent Storage Helper
Provides file storage operations using Hugging Face persistent storage directory
"""
import os
import base64
import logging
from typing import Optional, Dict, List
from pathlib import Path
logger = logging.getLogger(__name__)

# Root of the persistent storage tree. It is read from the HF_HOME
# environment variable and defaults to "/persistent" when that is unset.
HF_PERSIST_DIR = os.environ.get("HF_HOME", "/persistent")
if not os.path.exists(HF_PERSIST_DIR):
    # The configured root does not exist: fall back to a local directory.
    HF_PERSIST_DIR = "./hf_persistent"
    os.makedirs(HF_PERSIST_DIR, exist_ok=True)

# One sub-directory per content type.
IMAGES_DIR = os.path.join(HF_PERSIST_DIR, "images")
SUMMARIES_DIR = os.path.join(HF_PERSIST_DIR, "summaries")
FIGURE_INDEX_DIR = os.path.join(HF_PERSIST_DIR, "figure_index")

# Create the sub-directories at import time so the helpers below can rely
# on them being present.
for _content_dir in (IMAGES_DIR, SUMMARIES_DIR, FIGURE_INDEX_DIR):
    os.makedirs(_content_dir, exist_ok=True)
del _content_dir  # keep the module namespace free of the loop variable
def get_image_path(image_id: str, folder_prefix: str = "pdf_images") -> str:
    """
    Build the on-disk path of an image file

    The folder ``IMAGES_DIR/<folder_prefix>`` is created if it is missing,
    so this function touches the filesystem even when the caller only
    intends to read.

    Args:
        image_id: Unique identifier for the image (used as the file stem)
        folder_prefix: Sub-folder of the images directory

    Returns:
        str: Path of the form ``IMAGES_DIR/<folder_prefix>/<image_id>.jpg``
    """
    target_dir = os.path.join(IMAGES_DIR, folder_prefix)
    os.makedirs(target_dir, exist_ok=True)
    file_name = f"{image_id}.jpg"
    return os.path.join(target_dir, file_name)
def get_summary_path(image_id: str) -> str:
    """
    Build the on-disk path of an image summary file

    Args:
        image_id: Unique identifier for the image (used as the file stem)

    Returns:
        str: Path of the form ``SUMMARIES_DIR/<image_id>.txt``
    """
    file_name = f"{image_id}.txt"
    return os.path.join(SUMMARIES_DIR, file_name)
async def store_image(image_id: str, image_data: bytes, folder_prefix: str = "pdf_images") -> bool:
    """
    Write raw image bytes to the persistent storage directory

    Args:
        image_id: Unique identifier for the image
        image_data: Image content as bytes, written to disk unchanged
        folder_prefix: Sub-folder of the images directory
            (e.g., 'pdf_images', 'pptx_images')

    Returns:
        bool: True once the file has been written, False if any exception
        was raised (the error is logged, not propagated)
    """
    try:
        image_path = get_image_path(image_id, folder_prefix)
        with open(image_path, "wb") as sink:
            sink.write(image_data)
        logger.info(f"Stored image in HF persistent storage: {image_path}")
        return True
    except Exception as e:
        # Best-effort API: report failure through the return value.
        logger.error(f"Error storing image {image_id}: {e}")
        return False
async def retrieve_image(image_id: str, folder_prefix: str = "pdf_images") -> Optional[Dict]:
    """
    Load an image from the persistent storage directory

    Args:
        image_id: Unique identifier for the image
        folder_prefix: Sub-folder of the images directory
            (e.g., 'pdf_images', 'pptx_images')

    Returns:
        Dict with the keys ``image_id``, ``base64_data`` (file content,
        base64-encoded as text), ``url`` (an ``hf://`` pseudo-URL built from
        the file path), ``size`` (byte count) and ``folder_prefix``;
        None when the file does not exist or an error occurs
    """
    try:
        image_path = get_image_path(image_id, folder_prefix)

        # Guard clause: a missing file is a normal "not found" outcome.
        if not os.path.exists(image_path):
            logger.warning(f"Image not found: {image_path}")
            return None

        with open(image_path, "rb") as source:
            image_data = source.read()

        encoded = base64.b64encode(image_data)
        payload = {
            "image_id": image_id,
            "base64_data": encoded.decode('utf-8'),
            "url": f"hf://{image_path}",  # Pseudo-URL for HF storage
            "size": len(image_data),
            "folder_prefix": folder_prefix,
        }
        return payload
    except Exception as e:
        logger.error(f"Error retrieving image {image_id}: {e}")
        return None
async def store_image_summary(image_id: str, summary: str) -> bool:
    """
    Write an image summary to the persistent storage directory

    Args:
        image_id: Unique identifier for the image
        summary: Summary text, saved as a UTF-8 encoded ``.txt`` file

    Returns:
        bool: True once the file has been written, False if any exception
        was raised (the error is logged, not propagated)
    """
    try:
        summary_path = get_summary_path(image_id)
        with open(summary_path, "w", encoding="utf-8") as sink:
            sink.write(summary)
        logger.info(f"Stored image summary in HF persistent storage: {summary_path}")
        return True
    except Exception as e:
        # Best-effort API: report failure through the return value.
        logger.error(f"Error storing image summary for {image_id}: {e}")
        return False
async def retrieve_image_summary(image_id: str) -> Optional[str]:
    """
    Load an image summary from the persistent storage directory

    Args:
        image_id: Unique identifier for the image

    Returns:
        str: Full content of the summary file, read as UTF-8;
        None when the file does not exist or an error occurs
    """
    try:
        summary_path = get_summary_path(image_id)

        # Guard clause: a missing file is a normal "not found" outcome.
        if not os.path.exists(summary_path):
            logger.warning(f"Image summary not found: {summary_path}")
            return None

        with open(summary_path, "r", encoding="utf-8") as source:
            text = source.read()
        return text
    except Exception as e:
        logger.error(f"Error retrieving image summary for {image_id}: {e}")
        return None
async def get_all_image_summaries() -> Dict[str, str]:
    """
    Retrieve all image summaries from Hugging Face persistent storage

    Every ``*.txt`` file in the summaries directory is read through
    ``retrieve_image_summary``. Files that cannot be read, and files whose
    content is empty, are left out of the result.

    Returns:
        Dict mapping image_id to summary. If an error interrupts the scan,
        the entries collected so far are returned and the error is logged.
    """
    suffix = '.txt'
    summaries: Dict[str, str] = {}
    try:
        if os.path.exists(SUMMARIES_DIR):
            for filename in os.listdir(SUMMARIES_DIR):
                if not filename.endswith(suffix):
                    continue
                # Strip only the trailing extension. str.replace() would
                # remove every ".txt" occurrence, so an id such as
                # "report.txt_v2" would be looked up as "report_v2" and
                # its summary silently lost.
                image_id = filename[:-len(suffix)]
                summary = await retrieve_image_summary(image_id)
                if summary:
                    summaries[image_id] = summary
        logger.info(f"Retrieved {len(summaries)} image summaries from HF persistent storage")
    except Exception as e:
        logger.error(f"Error retrieving all image summaries: {e}")
    return summaries
async def clear_all_images(folder_prefix: Optional[str] = None) -> bool:
    """
    Delete stored ``.jpg`` files from the persistent storage directory

    Args:
        folder_prefix: Sub-folder of the images directory to empty
            (e.g., 'pdf_images'). A falsy value (None or '') selects
            every sub-folder of the images directory instead.

    Returns:
        bool: True when the sweep finished without raising (even if no
        file was deleted), False if any exception occurred
    """
    try:
        if folder_prefix:
            # Single-folder mode: only that folder, and only if it exists.
            single_folder = os.path.join(IMAGES_DIR, folder_prefix)
            folders = [single_folder] if os.path.exists(single_folder) else []
        elif os.path.exists(IMAGES_DIR):
            # All-folders mode: every directory directly under IMAGES_DIR.
            children = [os.path.join(IMAGES_DIR, name) for name in os.listdir(IMAGES_DIR)]
            folders = [child for child in children if os.path.isdir(child)]
        else:
            folders = []

        total_deleted = 0
        for folder in folders:
            for entry in os.listdir(folder):
                # Only .jpg files are removed; anything else is left alone.
                if entry.endswith('.jpg'):
                    os.remove(os.path.join(folder, entry))
                    total_deleted += 1

        logger.info(f"Deleted {total_deleted} images from HF persistent storage")
        return True
    except Exception as e:
        logger.error(f"Error clearing images: {e}")
        return False
async def clear_all_summaries() -> bool:
    """
    Delete every ``.txt`` summary file from the persistent storage directory

    Returns:
        bool: True when the sweep finished without raising (even if no
        file was deleted), False if any exception occurred
    """
    try:
        total_deleted = 0
        if os.path.exists(SUMMARIES_DIR):
            # Collect the matching names first, then delete them one by one.
            doomed = [name for name in os.listdir(SUMMARIES_DIR) if name.endswith('.txt')]
            for name in doomed:
                os.remove(os.path.join(SUMMARIES_DIR, name))
                total_deleted += 1
        logger.info(f"Deleted {total_deleted} image summaries from HF persistent storage")
        return True
    except Exception as e:
        logger.error(f"Error clearing summaries: {e}")
        return False
async def store_figure_index_entry(figure_key: str, figure_metadata: Dict) -> bool:
    """
    Store a figure index entry in Hugging Face persistent storage

    The metadata is serialized to JSON before the target file is opened,
    so metadata that cannot be serialized leaves an existing entry for the
    same key untouched instead of truncating or corrupting it.

    Args:
        figure_key: Figure key (e.g., "figure_5"), used as the file stem
        figure_metadata: Dictionary containing figure metadata; it must be
            JSON-serializable

    Returns:
        bool: True if successful, False otherwise (the error is logged)
    """
    try:
        import json
        index_path = os.path.join(FIGURE_INDEX_DIR, f"{figure_key}.json")
        # Serialize first: opening with "w" truncates the file immediately,
        # so a failure inside json.dump() would destroy the previous entry.
        serialized = json.dumps(figure_metadata, ensure_ascii=False, indent=2)
        with open(index_path, "w", encoding="utf-8") as f:
            f.write(serialized)
        logger.info(f"Stored figure index entry in HF persistent storage: {index_path}")
        return True
    except Exception as e:
        logger.error(f"Error storing figure index entry {figure_key}: {e}")
        return False
async def get_figure_index_entry(figure_key: str) -> Optional[Dict]:
    """
    Load a figure index entry from the persistent storage directory

    Args:
        figure_key: Figure key (e.g., "figure_5"), used as the file stem

    Returns:
        Dict: Parsed JSON content of the entry file;
        None when the file does not exist or an error occurs
    """
    try:
        import json
        index_path = os.path.join(FIGURE_INDEX_DIR, f"{figure_key}.json")

        # Guard clause: a missing file is a normal "not found" outcome.
        if not os.path.exists(index_path):
            logger.warning(f"Figure index entry not found: {index_path}")
            return None

        with open(index_path, "r", encoding="utf-8") as source:
            raw_text = source.read()
        return json.loads(raw_text)
    except Exception as e:
        logger.error(f"Error retrieving figure index entry {figure_key}: {e}")
        return None