"""Load the caption and IMDB-ideas datasets and build FAISS indexes over them.

Importing this module downloads both datasets via ``datasets.load_dataset``
and exposes, at module level, the DataFrame, the normalised embedding matrix
and the FAISS index for each dataset.
"""

import ast
import logging

import faiss
import numpy as np
import pandas as pd
from datasets import load_dataset

app_logger = logging.getLogger("app_logger")
error_logger = logging.getLogger("error_logger")


class DataLoader:
    """Fetch the two embedding datasets and turn them into FAISS indexes."""

    def __init__(self):
        """Download both datasets at the ``openai-embeddings`` revision."""
        self.caption_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-dataset-v1",
            revision="openai-embeddings",
        )
        self.ideas_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-imdb-ideas-v1",
            revision='openai-embeddings',
        )

    def _load_vector_index(self, dataset):
        """Build an inner-product FAISS index from a dataset's ``train`` split.

        Args:
            dataset: Object whose ``"train"`` entry provides ``to_pandas()``
                and yields a frame with an ``embeddings`` column.

        Returns:
            A ``(df, embeddings, index)`` tuple: the DataFrame (with the
            ``embeddings`` column parsed), the L2-normalised float32 matrix,
            and a ``faiss.IndexFlatIP`` containing those rows.
        """
        df = dataset["train"].to_pandas()

        # Embeddings may be stored as their string repr; parse those with
        # literal_eval (safe: literals only) and keep other values as they are.
        df['embeddings'] = df['embeddings'].apply(
            lambda x: ast.literal_eval(x) if isinstance(x, str) else x
        )

        embeddings = np.vstack(df['embeddings'].values).astype('float32')

        # normalize_L2 works in place; with unit-length rows the inner
        # product computed by IndexFlatIP equals cosine similarity.
        faiss.normalize_L2(embeddings)
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        return df, embeddings, index

    def load_caption(self):
        """Return ``(df, embeddings, index)`` for the caption dataset."""
        return self._load_vector_index(self.caption_dataset)

    def load_imdb_ideas(self):
        """Return ``(df, embeddings, index)`` for the IMDB-ideas dataset."""
        return self._load_vector_index(self.ideas_dataset)


try:
    data_loader = DataLoader()
    app_logger.info('Dataset loaded from Hugging Face.')
except Exception as e:
    # The message needs a %s placeholder: passing `e` as a bare extra argument
    # makes logging fail to format the record and drop the message.
    error_logger.error('Unable to load dataset: %s', e)
    raise

try:
    caption_df, caption_embeddings, caption_index = data_loader.load_caption()
    ideas_df, ideas_embeddings, ideas_index = data_loader.load_imdb_ideas()
    app_logger.info('Loaded the embeddings.')
except Exception as e:
    # This handler does not re-raise, so record the traceback here; otherwise
    # the cause of the failure is lost.
    # NOTE(review): on failure the names above stay unbound, so later use of
    # them fails with NameError/ImportError. Consider re-raising as the
    # dataset-loading block does.
    error_logger.exception('Unable to load the embeddings: %s', e)