briankchan commited on
Commit
e6e69dc
·
1 Parent(s): 5a768b8
Files changed (9) hide show
  1. .gitignore +2 -0
  2. app.py +144 -21
  3. azure_openai.py +98 -0
  4. chain.py +18 -4
  5. data.py +72 -0
  6. launch.py +6 -0
  7. requirements.txt +1 -0
  8. scripts/__init__.py +0 -0
  9. scripts/create_embeddings.py +101 -0
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  .chainlit/chat.db
 
2
 
3
  # Created by https://www.toptal.com/developers/gitignore/api/python,intellij+all,visualstudiocode
4
  # Edit at https://www.toptal.com/developers/gitignore?templates=python,intellij+all,visualstudiocode
@@ -283,3 +284,4 @@ pyrightconfig.json
283
  .ionide
284
 
285
  # End of https://www.toptal.com/developers/gitignore/api/python,intellij+all,visualstudiocode
 
 
1
  .chainlit/chat.db
2
+ .chainlit/chat_files
3
 
4
  # Created by https://www.toptal.com/developers/gitignore/api/python,intellij+all,visualstudiocode
5
  # Edit at https://www.toptal.com/developers/gitignore?templates=python,intellij+all,visualstudiocode
 
284
  .ionide
285
 
286
  # End of https://www.toptal.com/developers/gitignore/api/python,intellij+all,visualstudiocode
287
+ /.chainlit/chat_files/3105f452-a667-4cd0-b40c-7502c8bc62d2/1bf41236-505a-44b3-9ac4-55efa37a393d.txt
app.py CHANGED
@@ -1,33 +1,156 @@
1
- import asyncio
 
2
 
3
  import chainlit as cl
 
4
 
 
5
  from chain import Chain
 
 
 
 
 
 
 
 
 
 
 
6
 
7
 
8
  @cl.on_chat_start
9
  async def start_chat():
10
- chain = Chain(None)
11
- await chain.text("I will count to 5. How many concurrent times should I count?")
12
 
13
 
14
  @cl.on_message
15
  async def on_message(message: str, message_id: str):
16
- chain = Chain(message_id)
17
-
18
- try:
19
- num = int(message)
20
- except ValueError:
21
- await chain.text_stream("Sorry, that doesn't look like an integer to me.", final=True)
22
- return
23
-
24
- if num > 10:
25
- await chain.text_stream("Whoa, let's try a smaller number. (Max 10.)", final=True)
26
- return
27
-
28
- await chain.text("Alright, here we go:")
29
- coroutines = []
30
- for i in range(num):
31
- coroutines.append(chain.text_stream("1 2 3 4 5", delay=1, name=f"Counter {i + 1}"))
32
- await asyncio.gather(*coroutines)
33
- await chain.text_stream("Okay, I'm done counting now.", final=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import tomllib
3
 
4
  import chainlit as cl
5
+ from docarray.index.abstract import BaseDocIndex
6
 
7
+ from azure_openai import AzureOpenaiSettings, AzureOpenaiEmbeddings, patch_chainlit
8
  from chain import Chain
9
+ from data import embed, restaurant_index, RestaurantDescription
10
+
11
# Embedding-API settings (endpoint, deployment, version) shared by every search in this module.
embedding_settings = AzureOpenaiEmbeddings.load_from_env().to_settings_dict()

# Swap Chainlit's built-in /completion playground endpoint for the Azure-aware one.
patch_chainlit()
14
+
15
+
16
def search_embeddings(query: str, doc_index: BaseDocIndex):
    """Embed *query* and return the top-5 nearest documents from *doc_index*."""
    query_vector = embed(query, **embedding_settings)
    matches, _scores = doc_index.find(query_vector, 'embedding', 5)
    return matches
20
 
21
 
22
@cl.on_chat_start
async def start_chat():
    """Initialize a fresh chat session with an empty message history."""
    cl.user_session.set("history", [])
 
25
 
26
 
27
@cl.on_message
async def on_message(message: str, message_id: str):
    """Answer a user message with up to three restaurant recommendations.

    Pipeline: append the message to the session history -> summarize the
    conversation into a search query -> vector-search restaurants -> ask the
    LLM to mark matches as a TOML blob -> render the top three matches with
    image, description and a 'Book' action.
    """
    history = cl.user_session.get("history")

    # update history
    history.append({"role": "user", "content": message})

    # build AI response
    chain = Chain(message_id, llm_settings=AzureOpenaiSettings.load_from_env())

    query_msg = await chain.llm(
        """
        You are a conversation summarizer that condenses a conversation between
        a human and AI into a search query that can be used to find relevant
        restaurants.

        Conversation history

        ============

        {history}

        ============

        From this conversation, create a search query that would fit the human's needs.
        Do not say anything else; just the query.
        """,
        history=format_history(history),
    )

    results = search_embeddings(query_msg.content, restaurant_index)
    await chain.text(str(list(results)))  # TODO maybe json format would be better?

    restaurants = "\n".join(f"- ID: {r.id} | Description: {r.text}" for r in results)

    final_choices_msg = await chain.llm(
        """
        You are a search engine for restaurants.
        Output the restaurant IDs for the best matches to the following query:

        ----

        {query}

        ============


        List of restaurants

        ----

        {restaurants}

        ============


        Output your final answer as a TOML blob.
        Each restaurant should have a key for its ID, with a
        boolean value, where true means the restaurant is a good fit.

        For example:

        ---

        [answer]

        101 = false

        1350 = true

        02458 = false

        9315 = true

        128974 = true

        ============


        Make sure to include the IDs of ALL restaurants, but only mark true for ones that fit the query.

        """,
        query=query_msg.content,
        restaurants=restaurants
    )

    # Models often wrap the TOML in a ```toml fence; unwrap it when present so
    # tomllib doesn't choke on the backticks.
    match = re.match(r'```\s*toml\s*(.*)\s*```', final_choices_msg.content, re.DOTALL)
    toml_string = match.group(1) if match else final_choices_msg.content

    # don't output just the good values, since GPT doesn't think about each option
    # don't use json because curly braces brakes the template code...
    # TOML is easy to write and parse for both machines and humans :)
    obj = tomllib.loads(toml_string)
    final_ids = [rid for rid, val in obj['answer'].items() if val]

    # 1-based labels: users expect "Option 1", not "Option 0".
    for i, rid in enumerate(final_ids[:3], start=1):
        rid = str(rid)
        restaurant: RestaurantDescription = restaurant_index[rid]  # why no automatic typing?
        msg = await chain.text(f"Option {i}", final=True)
        msg.elements = [
            # note: image always displays above text
            cl.Image(name=restaurant.name, url=restaurant.image_url, display='inline', size='small'),
            cl.Text(name=restaurant.name, content=restaurant.text, display='inline'),
            # TODO text could also include categories/dishes/rating/price
        ]
        msg.actions = [
            cl.Action(name='book', value=rid, label='Book', description='Click to book this restaurant'),
        ]
        await msg.update()

    # TODO what should the history include? ids only? or also descriptions?
    # history.append({"role": "assistant", "content": response.content})
+
145
+
146
+ NAMES = {
147
+ # 'system': '',
148
+ 'user': 'Human',
149
+ 'assistant': 'AI',
150
+ }
151
+
152
+
153
+ def format_history(history: list[dict]) -> str:
154
+ """Formats list of messages into a single string."""
155
+ strings = [f'{NAMES[m["role"]]}: {m["content"]}' for m in history]
156
+ return "\n".join(strings)
azure_openai.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Self
3
+
4
+ from chainlit import LLMSettings
5
+ from chainlit.telemetry import trace_event
6
+ from chainlit.types import CompletionRequest
7
+ from pydantic.dataclasses import dataclass
8
+ from starlette.responses import PlainTextResponse
9
+
10
+
11
@dataclass
class AzureOpenaiSettings(LLMSettings):
    """Chainlit LLMSettings extended with Azure OpenAI connection fields."""
    api_type: str = 'azure'
    api_base: str = ''  # Azure resource endpoint URL
    engine: str = ''  # Azure deployment name
    api_version: str = '2023-05-15'

    def to_settings_dict(self):
        # Base LLM settings plus the Azure fields, shaped as kwargs for openai.* calls.
        return {
            **super().to_settings_dict(),
            "api_type": self.api_type,
            "api_base": self.api_base,
            "api_version": self.api_version,
            "engine": self.engine,
        }

    @classmethod
    def load_from_env(cls: type[Self], *args, **kwargs) -> Self:
        """Build settings from AZURE_OPENAI_* environment variables.

        NOTE(review): unset env vars yield None despite the `str` annotations —
        confirm that is intended.
        """
        return cls(
            *args,
            api_type='azure',
            api_base=os.environ.get('AZURE_OPENAI_ENDPOINT'),
            engine=os.environ.get('AZURE_OPENAI_DEPLOYMENT'),
            api_version=os.environ.get('AZURE_OPENAI_VERSION', '2023-05-15'),
            **kwargs,
        )
37
+
38
+
39
@dataclass
class AzureOpenaiEmbeddings:
    """Connection settings for an Azure OpenAI embeddings deployment."""
    api_type: str = 'azure'
    api_base: str = ''
    engine: str = ''
    api_version: str = '2023-05-15'

    def to_settings_dict(self):
        """Return the fields as keyword arguments for openai.Embedding.create."""
        return {
            "api_type": self.api_type,
            "api_base": self.api_base,
            "api_version": self.api_version,
            "engine": self.engine,
        }

    @classmethod
    def load_from_env(cls: type[Self], *args, **kwargs) -> Self:
        """Build settings from the AZURE_OPENAI_* environment variables."""
        env = os.environ
        return cls(
            *args,
            api_type='azure',
            api_base=env.get('AZURE_OPENAI_ENDPOINT'),
            engine=env.get('AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT'),
            api_version=env.get('AZURE_OPENAI_VERSION', '2023-05-15'),
            **kwargs,
        )
64
+
65
+
66
def patch_chainlit():
    """Replace Chainlit's prompt-playground /completion endpoint with an Azure-aware one.

    Removes the stock route from the FastAPI router, then registers a new
    handler that forwards the hard-coded Azure settings to the OpenAI client.
    """
    from chainlit.server import app

    # replace playground's completion endpoint with one that uses custom openai settings
    app.router.routes = list(filter(lambda route: route.path != '/completion', app.router.routes))

    @app.post("/completion")
    async def completion(request: CompletionRequest):
        """Handle a completion request from the prompt playground."""

        import openai

        trace_event("completion")

        # Prefer a per-user key supplied with the request; fall back to the process env.
        api_key = request.userEnv.get("OPENAI_API_KEY", os.environ.get("OPENAI_API_KEY"))

        stop = request.settings.stop
        # OpenAI doesn't support an empty stop array, clear it
        if isinstance(stop, list) and len(stop) == 0:
            stop = None

        response = await openai.ChatCompletion.acreate(
            api_key=api_key,
            messages=[{"role": "user", "content": request.prompt}],
            stop=stop,
            # **completion.settings.to_settings_dict(),
            # HACK: hard-code llm settings
            **dict(api_type='azure', api_base=os.environ.get('AZURE_OPENAI_ENDPOINT'),
                   engine=os.environ.get('AZURE_OPENAI_DEPLOYMENT'),
                   api_version=os.environ.get('AZURE_OPENAI_VERSION', '2023-05-15')
                   ),
        )
        return PlainTextResponse(content=response["choices"][0]["message"]["content"])
chain.py CHANGED
@@ -1,6 +1,7 @@
1
  import asyncio
2
  import os
3
  import re
 
4
 
5
  import chainlit as cl
6
  import openai
@@ -8,6 +9,14 @@ from chainlit import LLMSettings
8
  from chainlit.config import config
9
 
10
 
 
 
 
 
 
 
 
 
11
  # TODO each chain should be able to make a child chain?
12
  # root = Chain()
13
  # first = root.child("something")
@@ -26,11 +35,12 @@ class Chain:
26
  **kwargs,
27
  )
28
 
29
- async def text(self, text, final=False, name=None):
30
  message = self.make_message(content=text, final=final, name=name)
31
  await message.send()
 
32
 
33
- async def text_stream(self, text: str, delay=.1, name=None, final=False):
34
  message = self.make_message(content='', final=final, name=name)
35
  tokens = text.split(" ")
36
  first = True
@@ -41,8 +51,12 @@ class Chain:
41
  await asyncio.sleep(delay)
42
  first = False
43
  await message.send()
 
 
 
 
 
44
 
45
- async def llm(self, template, *args, name=None, final=False, **kwargs) -> str:
46
  variables = re.findall(r'\{(.*?)}', template)
47
  if len(args) > 1:
48
  raise RuntimeError("If there is more than one argument, use kwargs")
@@ -66,4 +80,4 @@ class Chain:
66
  await message.stream_token(token)
67
 
68
  await message.send()
69
- return message.content
 
1
  import asyncio
2
  import os
3
  import re
4
+ from inspect import cleandoc
5
 
6
  import chainlit as cl
7
  import openai
 
9
  from chainlit.config import config
10
 
11
 
12
def replace_newlines(match: re.Match) -> str:
    """re.sub callback for runs of newlines: a single '\\n' becomes a space,
    and a run of n newlines is shortened by one (paragraph break preserved)."""
    run = match.group(0)
    return " " if len(run) <= 1 else run[1:]
18
+
19
+
20
  # TODO each chain should be able to make a child chain?
21
  # root = Chain()
22
  # first = root.child("something")
 
35
  **kwargs,
36
  )
37
 
38
    async def text(self, text, final=False, name=None) -> cl.Message:
        """Send *text* as a single chat message and return the sent cl.Message."""
        message = self.make_message(content=text, final=final, name=name)
        await message.send()
        return message
42
 
43
+ async def text_stream(self, text: str, delay=.1, name=None, final=False) -> cl.Message:
44
  message = self.make_message(content='', final=final, name=name)
45
  tokens = text.split(" ")
46
  first = True
 
51
  await asyncio.sleep(delay)
52
  first = False
53
  await message.send()
54
+ return message
55
+
56
+ async def llm(self, template, *args, name=None, final=False, **kwargs) -> cl.Message:
57
+ template = cleandoc(template)
58
+ template = re.sub('\n+', replace_newlines, template) # remove a newline
59
 
 
60
  variables = re.findall(r'\{(.*?)}', template)
61
  if len(args) > 1:
62
  raise RuntimeError("If there is more than one argument, use kwargs")
 
80
  await message.stream_token(token)
81
 
82
  await message.send()
83
+ return message
data.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Sequence
3
+
4
+ import numpy as np
5
+ import openai
6
+ from docarray.documents import TextDoc
7
+ from docarray.index import InMemoryExactNNIndex
8
+ from docarray.typing import NdArray
9
+
10
+
11
class OpenaiEmbeddingDoc(TextDoc):
    """TextDoc carrying an OpenAI embedding of its `text` field."""
    # None until create_embeddings() has been run over the doc.
    embedding: NdArray[1536] | None

    @staticmethod
    def create_embeddings(docs: Sequence['OpenaiEmbeddingDoc'], **kwargs):
        """Fill `doc.embedding` in place for every doc, batching API requests.

        kwargs are forwarded to openai.Embedding.create (engine, api_type,
        api_base, api_version, ...).  An `api_key` kwarg is used only when the
        OPENAI_API_KEY env var is not set; it is popped so it is never passed
        twice to the API call.
        """
        batch_size = 16  # max inputs per request allowed by azure
        if len(docs) > batch_size:
            for i in range(0, len(docs), batch_size):
                print(f"Processing 16 starting from index {i}")
                OpenaiEmbeddingDoc.create_embeddings(docs[i:i + batch_size], **kwargs)
            return

        texts = [d.text for d in docs]
        # Pop api_key so **kwargs can't collide with the explicit argument.
        fallback_key = kwargs.pop('api_key', None)
        response = openai.Embedding.create(
            input=texts,
            api_key=os.environ.get('OPENAI_API_KEY', fallback_key),
            **kwargs,
        )
        embeddings = response['data']
        assert len(embeddings) == len(docs)
        # Responses carry an 'index' so results can be matched back to inputs.
        for obj in embeddings:
            docs[obj['index']].embedding = np.array(obj['embedding'])
33
+
34
+
35
def embed(text: str, **kwargs) -> np.ndarray:
    """Return the embedding vector for *text* via openai.Embedding.create.

    kwargs are forwarded to the API call (engine, api_type, api_base, ...);
    an `api_key` kwarg is used only when OPENAI_API_KEY is not set, and is
    popped so it cannot be passed to the API twice.
    """
    fallback_key = kwargs.pop('api_key', None)
    response = openai.Embedding.create(
        input=text,
        api_key=os.environ.get('OPENAI_API_KEY', fallback_key),
        **kwargs,
    )
    return np.array(response['data'][0]['embedding'])
42
+
43
+
44
class RestaurantDescription(OpenaiEmbeddingDoc):
    """A restaurant record; the inherited `text` field holds its intro blurb."""
    id: str = ''  # a number string
    name: str
    name_alt: str | None  # alternate-language name, if any
    categories: list[str]
    dishes: list[str]
    rating: float  # 0-1
    price: int  # HKD
    info_url: str
    image_url: str
54
+
55
+
56
class Category(OpenaiEmbeddingDoc):
    """A cuisine/category keyword; the inherited `text` field holds its name."""
    id: str = ''  # same as text
    restaurants: list[str]  # list of ids? or we could just search the restaurants?
59
+
60
+
61
class Dish(OpenaiEmbeddingDoc):
    """
    A dish name; the inherited `text` field holds its name.

    Note: Not all dish names are meaningful, e.g., 'Trip to Bali', 'Oakland Breeze'
    May include duplicates?
    """
    id: str = ''  # same as text
    restaurants: list[str]  # list of ids
68
+
69
+
70
# In-memory exact nearest-neighbour indexes, loaded from pre-built files under data/.
restaurant_index = InMemoryExactNNIndex[RestaurantDescription](index_file_path='data/restaurants.bin')
category_index = InMemoryExactNNIndex[Category](index_file_path='data/categories.bin')
dish_index = InMemoryExactNNIndex[Dish](index_file_path='data/dishes.bin')
launch.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from chainlit.cli import run_chainlit
from chainlit.config import config

if __name__ == '__main__':
    # Enable file watching (auto-reload on change) and launch the Chainlit app.
    config.run.watch = True
    run_chainlit('app.py')
requirements.txt CHANGED
@@ -1 +1,2 @@
1
  chainlit==0.6.2
 
 
1
  chainlit==0.6.2
2
+ docarray>=0.37.0
scripts/__init__.py ADDED
File without changes
scripts/create_embeddings.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ from ast import literal_eval
3
+ from pathlib import Path
4
+ from typing import TypeVar
5
+
6
+ from docarray import DocList
7
+ from dotenv import load_dotenv
8
+
9
+ from azure_openai import AzureOpenaiEmbeddings
10
+ from data import RestaurantDescription, restaurant_index, Dish, Category, dish_index, category_index
11
+
12
+
13
def calculate_rating(low: str, medium: str, high: str) -> float:
    """Collapse three review-count buckets into a single 0-1 score.

    *low*/*medium*/*high* are counts (as CSV strings) of negative, neutral
    and positive reviews; neutral reviews count at weight 0.7.
    Returns 0.0 when there are no reviews at all (avoids ZeroDivisionError).
    """
    low_n, medium_n, high_n = int(low), int(medium), int(high)
    total = low_n + medium_n + high_n
    if total == 0:
        return 0.0
    return (medium_n * 0.7 + high_n) / total
19
+
20
+
21
def normalize_dish(dish_name: str) -> str:
    """Drop non-breaking spaces and title-case the dish name."""
    return dish_name.replace('\xa0', '').title()
24
+
25
+
26
T = TypeVar('T')


def add_to_all(restaurant: 'RestaurantDescription', keys: list[str], mapping: dict[str, T], cls: type[T]):
    """Register *restaurant* under every key in *keys*, creating missing entries.

    Each new entry is built as cls(id=key, text=key, restaurants=[]); every
    entry collects the ids of all restaurants associated with that key.
    (Annotation fix: `dict[T]` is not a valid generic — a dict takes a key and
    a value type; the restaurant annotation is quoted to stay lazy.)
    """
    for key in set(keys):  # de-duplicate so a restaurant is added once per key
        entry = mapping.get(key)
        if entry is None:
            entry = mapping[key] = cls(id=key, text=key, restaurants=[])
        entry.restaurants.append(restaurant.id)
36
+
37
+
38
# Kept at module level so the results can be inspected interactively after a run.
restaurants, dish_list, category_list = None, None, None


def main():
    """Load restaurants.csv, build Dish/Category docs, embed everything and persist the indexes."""
    global restaurants, dish_list, category_list
    load_dotenv()  # pull AZURE_OPENAI_* / OPENAI_API_KEY from .env

    csv_file = Path('restaurants.csv')

    restaurants = DocList[RestaurantDescription]()
    dishes = {}      # dish name -> Dish doc
    categories = {}  # category name -> Category doc

    with csv_file.open(encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Prefer the second-language name when present; keep the other as alternate.
            if row['name_lang2']:
                name = row['name_lang2']
                name_alt = row['name_lang1']
            else:
                name = row['name_lang1']
                name_alt = None

            # The 'dishes'/'categories' columns hold Python-literal lists.
            ds = literal_eval(row['dishes'])
            ds = [normalize_dish(d) for d in ds]
            cs = literal_eval(row['categories'])

            r = RestaurantDescription(
                embedding=None,  # batch create all embeddings later
                id=row['id'],
                name=name,
                name_alt=name_alt,
                text=row['intro'],
                price=int(row['price']),
                rating=calculate_rating(row['score_cry'], row['score_o_k'], row['score_smile']),
                categories=cs,
                dishes=ds,
                info_url=row['poi_url'],
                image_url=row['door_photos'],
            )

            restaurants.append(r)
            add_to_all(r, ds, dishes, Dish)
            add_to_all(r, cs, categories, Category)
    # NOTE(review): indentation reconstructed — these two lines are assumed to run
    # once after the CSV loop, not per row; confirm against the original file.
    dish_list = DocList[Dish](dishes.values())
    category_list = DocList[Category](categories.values())

    embedding_settings = AzureOpenaiEmbeddings.load_from_env()

    # Batch-compute embeddings for every doc collection.
    RestaurantDescription.create_embeddings(restaurants, **embedding_settings.to_settings_dict())
    Dish.create_embeddings(dish_list, **embedding_settings.to_settings_dict())
    Category.create_embeddings(category_list, **embedding_settings.to_settings_dict())

    restaurant_index.index(restaurants)
    dish_index.index(dish_list)
    category_index.index(category_list)

    # Write the indexes to their configured data/*.bin files.
    restaurant_index.persist()
    dish_index.persist()
    category_index.persist()


if __name__ == '__main__':
    main()