Spaces:
Sleeping
Sleeping
File size: 1,892 Bytes
5c271a3 a0929ab b4fb6ac 5c271a3 a6a0614 5c271a3 a6a0614 5c271a3 a6a0614 5c271a3 6874dac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import ast
import faiss
import logging
import numpy as np
import pandas as pd
from datasets import load_dataset
# Named loggers used by this module: app_logger records progress messages,
# error_logger records failures raised while loading data.
app_logger = logging.getLogger("app_logger")
error_logger = logging.getLogger("error_logger")
class DataLoader:
    """Download the embedding datasets and build a FAISS index for each.

    The three datasets are fetched from the Hugging Face Hub when the object
    is constructed; the ``load_*`` methods turn one of them into a
    ``(dataframe, embeddings, index)`` triple on demand.
    """

    def __init__(self):
        """Fetch the caption, ideas and API-knowledge datasets from the Hub."""
        # All three repositories keep their vectors on the same revision.
        embeddings_revision = "openai-embeddings"
        self.caption_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-dataset-v1",
            revision=embeddings_revision,
        )
        self.ideas_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-imdb-ideas-v1",
            revision=embeddings_revision,
        )
        self.api_knowledge_dataset = load_dataset(
            "subashdvorak/api-kb-analytics",
            revision=embeddings_revision,
        )

    def _load_vector_index(self, dataset):
        """Convert the ``train`` split of *dataset* into a searchable index.

        Args:
            dataset: Object whose ``"train"`` entry exposes ``to_pandas()``
                and yields a frame with an ``embeddings`` column.

        Returns:
            A tuple ``(frame, matrix, flat_index)``: the pandas DataFrame
            (with its ``embeddings`` column parsed), the float32 embedding
            matrix after normalisation, and the ``faiss.IndexFlatIP``
            populated with that matrix.
        """
        frame = dataset["train"].to_pandas()

        def _parse_cell(cell):
            # Cells stored as text are parsed back into Python literals;
            # anything else is passed through untouched.
            if isinstance(cell, str):
                return ast.literal_eval(cell)
            return cell

        frame['embeddings'] = frame['embeddings'].apply(_parse_cell)

        stacked = np.vstack(frame['embeddings'].values)
        matrix = stacked.astype('float32')

        # The return value is ignored: the call normalises ``matrix`` in
        # place, so the inner-product index below ranks by cosine similarity.
        faiss.normalize_L2(matrix)

        dimension = matrix.shape[1]
        flat_index = faiss.IndexFlatIP(dimension)
        flat_index.add(matrix)
        return frame, matrix, flat_index

    def load_caption(self):
        """Return ``(dataframe, embeddings, index)`` for the caption dataset."""
        return self._load_vector_index(self.caption_dataset)

    def load_imdb_ideas(self):
        """Return ``(dataframe, embeddings, index)`` for the IMDB ideas dataset."""
        return self._load_vector_index(self.ideas_dataset)

    def load_api_knowledge(self):
        """Return ``(dataframe, embeddings, index)`` for the API knowledge dataset."""
        return self._load_vector_index(self.api_knowledge_dataset)
# Build the shared loader at import time. A failure here is logged and then
# re-raised, so importing this module fails when the datasets cannot be loaded.
try:
    data_loader = DataLoader()
    app_logger.info('Dataset loaded from Hugging Face.')
except Exception as e:
    # Extra positional arguments to a logging call are %-format arguments;
    # without a placeholder the record fails to format and the message is
    # never emitted, so the exception is interpolated with %s.
    error_logger.error('Unable to load dataset: %s', e)
    raise
# Build the dataframe / embedding matrix / FAISS index triple for each dataset.
try:
    caption_df, caption_embeddings, caption_index = data_loader.load_caption()
    ideas_df, ideas_embeddings, ideas_index = data_loader.load_imdb_ideas()
    api_knowledge_df, api_embeddings, api_index = data_loader.load_api_knowledge()
    app_logger.info('Loaded the embeddings.')
except Exception as e:
    # The message needs a %s placeholder: logging treats extra positional
    # arguments as %-format arguments and fails to emit the record otherwise.
    # Logger.exception also records the traceback, which would otherwise be
    # lost because the exception is not re-raised here.
    # NOTE(review): on failure, any of the names above that were not yet
    # assigned stay undefined; confirm that continuing without them is intended.
    error_logger.exception('Unable to load the embeddings: %s', e)
|