File size: 2,388 Bytes
e6e69dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ef3ac7
e6e69dc
 
 
 
 
 
1ef3ac7
e6e69dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
from typing import Sequence

import numpy as np
import openai
from docarray.documents import TextDoc
from docarray.index import InMemoryExactNNIndex
from docarray.typing import NdArray


class OpenaiEmbeddingDoc(TextDoc):
    embedding: NdArray[1536] | None

    @staticmethod
    def create_embeddings(docs: Sequence['OpenaiEmbeddingDoc'], **kwargs):
        if len(docs) > 16:  # max allowed by azure
            for i in range(0, len(docs), 16):
                print(f"Processing 16 starting from index {i}")
                OpenaiEmbeddingDoc.create_embeddings(docs[i:i+16], **kwargs)
        else:
            texts = [d.text for d in docs]
            kwargs.setdefault('api_')
            response = openai.Embedding.create(
                input=texts,
                api_key=os.environ.get('OPENAI_API_KEY', kwargs.get('api_key')),
                **kwargs  # API key, model/engine, api_type, api_date, api_
            )
            embeddings = response['data']
            assert(len(embeddings) == len(docs))
            for obj in embeddings:
                doc = docs[obj['index']]
                doc.embedding = np.array(obj['embedding'])


def embed(text: str, **kwargs) -> np.ndarray[1536]:
    response = openai.Embedding.create(
        input=text,
        api_key=os.environ.get('OPENAI_API_KEY', kwargs.get('api_key')),
        **kwargs
    )
    return np.array(response['data'][0]['embedding'])


class RestaurantDescription(OpenaiEmbeddingDoc):
    id: str = ''  # a number string
    name: str
    name_alt: str | None
    intro: str
    categories: list[str]
    dishes: list[str]
    rating: float  # 0-1
    price: int  # HKD
    info_url: str
    image_url: str
    location: list[str]


class Category(OpenaiEmbeddingDoc):
    id: str = ''  # same as text
    restaurants: list[str]  # list of ids? or we could just search the restaurants?


class Dish(OpenaiEmbeddingDoc):
    """
    Note: Not all dish names are meaningful, e.g., 'Trip to Bali', 'Oakland Breeze'
    May include duplicates?
    """
    id: str = ''  # same as text
    restaurants: list[str]  # list of ids


restaurant_index = InMemoryExactNNIndex[RestaurantDescription](index_file_path='data/restaurants.bin')
category_index = InMemoryExactNNIndex[Category](index_file_path='data/categories.bin')
dish_index = InMemoryExactNNIndex[Dish](index_file_path='data/dishes.bin')