Spaces:
Sleeping
Sleeping
File size: 2,183 Bytes
712b309 1f22ed8 712b309 8cb5b3d c3a4b6a 712b309 c3a4b6a 712b309 1f22ed8 8cb5b3d 1f22ed8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
"""Contains helper functions that are used in the RAG pipeline."""
import os
import gc
import json
import torch
import shutil
from typing import List, Dict
import faiss
import numpy as np
def save_cache(data: List[Dict], filepath: str) -> None:
    """Persist chunk/embedding records to *filepath* as pretty-printed JSON.

    Best-effort: a failed write is reported on stdout rather than raised,
    since a missing cache only costs recomputation on the next run.
    """
    try:
        with open(filepath, 'w', encoding='utf-8') as cache_file:
            json.dump(data, cache_file, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"Failed to save cache to {filepath}: {e}")
def load_cache(filepath: str) -> List[Dict]:
    """Read a previously saved JSON cache.

    Returns the parsed list, or [] when the file is absent or unreadable
    (the failure is printed, mirroring save_cache's best-effort contract).
    """
    if not os.path.exists(filepath):
        return []
    try:
        with open(filepath, 'r', encoding='utf-8') as cache_file:
            return json.load(cache_file)
    except Exception as e:
        print(f"Failed to load cache from {filepath}: {e}")
        return []
# Vector Store Helper Functions using IndexFlatIP (for semantic search)
def init_faiss_indexflatip(embedding_dim: int) -> faiss.IndexFlatIP:
    """Create an empty inner-product (IP) FAISS index for *embedding_dim*-sized vectors.

    IndexFlatIP performs exact search; with normalized embeddings the inner
    product is equivalent to cosine similarity.
    """
    return faiss.IndexFlatIP(embedding_dim)
def add_embeddings_to_index(index, embeddings: np.ndarray):
    """Append *embeddings* to the FAISS *index*, cast to float32 as FAISS requires.

    An empty array is a no-op rather than an error.
    """
    if embeddings.size == 0:
        return
    index.add(embeddings.astype(np.float32))
def search_faiss_index(index, query_embedding: np.ndarray, k: int = 5):
    """Return (distances, indices) of the top-*k* matches for the query.

    A 1-D query vector is promoted to shape (1, dim) because FAISS expects
    a 2-D batch of queries; the vector is also cast to float32.
    """
    query = query_embedding
    if query.ndim == 1:
        query = query.reshape(1, -1)
    return index.search(query.astype(np.float32), k)
def save_faiss_index(index, filepath: str):
    """Serialize the FAISS *index* to disk at *filepath*."""
    faiss.write_index(index, filepath)
def load_faiss_index(filepath: str):
    """Deserialize and return a FAISS index previously written with save_faiss_index."""
    return faiss.read_index(filepath)
# Deleting extracted images directory after captioning
def cleanup_images(image_dir: str):
    """Delete the extracted-images directory tree once captioning is done.

    Failures (e.g. directory already gone, permission issues) are reported
    as a warning instead of raised — cleanup is best-effort.
    """
    try:
        shutil.rmtree(image_dir)
    except Exception as e:
        print(f"[WARNING] Failed to delete some images in {image_dir}: {e}")
    else:
        print(f"[INFO] Cleaned up extracted images directory: {image_dir}")
# Just being agnostic because my space may only be using CPU but why not?
def clear_gpu_cache():
    """Release cached CUDA memory (when a GPU is available) and force a GC pass.

    Safe on CPU-only hosts: the CUDA step is skipped and only garbage
    collection runs.
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
    gc.collect()
|