File size: 1,892 Bytes
5c271a3
 
 
 
 
a0929ab
b4fb6ac
5c271a3
 
 
 
 
 
 
a6a0614
5c271a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6a0614
 
 
5c271a3
 
 
 
 
 
 
 
 
 
 
 
a6a0614
5c271a3
 
 
 
6874dac
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import ast
import faiss
import logging
import numpy as np
import pandas as pd
from datasets import load_dataset

# Two named loggers used by this module: ``app_logger`` receives progress
# messages, ``error_logger`` receives failure reports.
# NOTE(review): handlers/levels are presumably configured elsewhere — confirm.
app_logger = logging.getLogger("app_logger")
error_logger = logging.getLogger("error_logger")

class DataLoader:
    """Load the embedding datasets and turn each one into a FAISS index.

    Construction calls ``load_dataset`` for three Hugging Face repositories,
    all at the ``openai-embeddings`` revision. Each public ``load_*`` method
    converts one of those datasets into a ``(DataFrame, matrix, index)``
    triple via :meth:`_load_vector_index`.
    """

    def __init__(self):
        # Every dataset is pinned to the same revision.
        embeddings_revision = "openai-embeddings"
        self.caption_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-dataset-v1",
            revision=embeddings_revision,
        )
        self.ideas_dataset = load_dataset(
            "DvorakInnovationAI/rt-genai-imdb-ideas-v1",
            revision=embeddings_revision,
        )
        self.api_knowledge_dataset = load_dataset(
            "subashdvorak/api-kb-analytics",
            revision=embeddings_revision,
        )

    def _load_vector_index(self, dataset):
        """Build an inner-product FAISS index from ``dataset["train"]``.

        Args:
            dataset: Mapping-like object whose ``"train"`` entry provides
                ``to_pandas()`` and contains an ``embeddings`` column. Cells
                that are strings are parsed as Python literals; any other
                cell is used unchanged.

        Returns:
            A tuple ``(frame, matrix, index)``: the split as a DataFrame with
            the parsed ``embeddings`` column, the float32 matrix obtained by
            stacking that column (L2-normalized in place), and a
            ``faiss.IndexFlatIP`` containing the matrix rows.
        """

        def _parse_cell(cell):
            # Embeddings serialized as text are parsed back into Python
            # literals; already-materialized values pass straight through.
            if isinstance(cell, str):
                return ast.literal_eval(cell)
            return cell

        frame = dataset["train"].to_pandas()
        frame["embeddings"] = frame["embeddings"].apply(_parse_cell)

        matrix = np.vstack(frame["embeddings"].values).astype("float32")
        # Normalization happens in place; with unit-length rows the
        # inner-product index scores unit-length queries by cosine similarity.
        faiss.normalize_L2(matrix)

        vector_index = faiss.IndexFlatIP(matrix.shape[1])
        vector_index.add(matrix)
        return frame, matrix, vector_index

    def load_caption(self):
        """Return the ``(frame, matrix, index)`` triple for the caption dataset."""
        return self._load_vector_index(self.caption_dataset)

    def load_imdb_ideas(self):
        """Return the ``(frame, matrix, index)`` triple for the ideas dataset."""
        return self._load_vector_index(self.ideas_dataset)

    def load_api_knowledge(self):
        """Return the ``(frame, matrix, index)`` triple for the API knowledge dataset."""
        return self._load_vector_index(self.api_knowledge_dataset)

# Create the shared loader at import time. A failure here is logged and then
# re-raised, so importing this module fails when the datasets cannot be loaded.
try:
    data_loader = DataLoader()
except Exception as e:
    # The logging API formats lazily with %-style arguments, so the exception
    # needs a ``%s`` placeholder; passing it as a bare extra argument makes
    # the logging module fail to format the record. ``exception`` logs at
    # ERROR level and appends the traceback.
    error_logger.exception('Unable to load dataset: %s', e)
    raise
else:
    app_logger.info('Dataset loaded from Hugging Face.')

# Build the DataFrame / embedding matrix / FAISS index triple for each dataset
# and expose them as module-level names.
try:
    caption_df, caption_embeddings, caption_index = data_loader.load_caption()
    ideas_df, ideas_embeddings, ideas_index = data_loader.load_imdb_ideas()
    api_knowledge_df, api_embeddings, api_index = data_loader.load_api_knowledge()
    app_logger.info('Loaded the embeddings.')
except Exception as e:
    # The exception must go through a ``%s`` placeholder; as a bare extra
    # argument it breaks the logging module's lazy %-formatting. ``exception``
    # logs at ERROR level and appends the traceback.
    # NOTE(review): the failure is logged but not re-raised, so some or all of
    # the names assigned in the try body may be undefined afterwards — confirm
    # that importers are expected to tolerate this.
    error_logger.exception('Unable to load the embeddings: %s', e)