File size: 16,604 Bytes
421ff50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b71265c
 
 
 
 
669b468
 
 
 
 
b71265c
 
 
669b468
deffc44
 
 
 
9e89d52
669b468
deffc44
669b468
14c5535
 
68faea2
 
 
669b468
b71265c
 
e433e22
b71265c
669b468
8179b31
669b468
 
b71265c
 
669b468
 
14c5535
 
b71265c
 
669b468
 
 
 
 
 
b71265c
 
 
 
669b468
b71265c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43ee39e
b71265c
 
 
 
 
 
 
669b468
b71265c
 
669b468
b71265c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669b468
 
b71265c
669b468
 
 
b71265c
669b468
b71265c
 
669b468
 
b71265c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669b468
b71265c
 
 
 
 
 
 
 
 
 
 
 
 
 
669b468
 
b71265c
669b468
b71265c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669b468
 
 
b71265c
 
 
 
669b468
b71265c
c4ca511
b71265c
 
 
 
 
 
 
 
 
 
 
 
 
669b468
b71265c
 
 
 
 
 
 
 
669b468
 
b71265c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef2acaf
b71265c
 
 
ef2acaf
b71265c
 
ef2acaf
b71265c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
# prioritized todos
# supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
# - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
# - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
# TODO add discord/github/google auth...via custom js? see supabase docs
# - make users maintainers of their own curations, restrict add perms, introduce edit/delete/clone perms
# - add stars to curations+users profile -> display starred curations first, then sort by most popular
# - securely store user's openai key in supabase for convenience
# TODO better ai arch
# - eg let user customize instr (via langchain's jinja support?)
# - better: make easy to experiment with langchain's chains/agents
# - maybe something like model_laboratory with gradio's Parallel block?
# - account for mpnet's limit of 384 word pieces per chunk (is it done already?)
# - - more deliberate chunking strat in general
# TODO summarize a vid (and optionally add to curation)
# TODO support yt playlists and yt channels in addition to just one-off videos
# - can i make this really easy to add via a well designed api?

# unprioritized todos
# TODO some inline todos below that should reduce need to reset/rollback DBs
# - how to easily rollback bad data?
# TODO harrison thinks editing vectorDB abstraction to consume Embedding class vs func is a good approach -> need to PR this
# TODO can i generalize the query filter approach (add to langchain?) to remove coupling to pinecone?
# - i believe elastic8.5 supports rdb and vdb, but need nontrivial specs to run it i think
# TODO user prefs data model (their curations)
# TODO finalize deployment strategy
# - supabase free tier for db + blob storage of transcripts
# - hf space to host model computations (langchain bits need to run here)
# - replit or supabase to host edge functions to call hf space
# TODO gradio global state to track recently asked questions from everyone
# TODO create pinecone index without indexing text metadata field for performance: https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
# TODO could use pinecone namespace per embedding model

# TODO deploy txtai to fly.io free tier? not sure compute reqs
# - or haystack?
# - both these come with many features out of the box

import os
import json
import uuid

import openai

import spacy
import en_core_web_sm

import gradio as gr
from gradio import blocks

import pinecone
from supabase import create_client, Client

from langchain.vectorstores import Pinecone
from langchain.text_splitter import SpacyTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi

from typing import Any, Callable, Dict, Iterable, List, Optional

# Load the HuggingFace embedding model once and share it: the original code
# instantiated HuggingFaceEmbeddings() twice, loading the model twice.
_hf_embeddings = HuggingFaceEmbeddings()
Embedder = _hf_embeddings.embed_query  # Callable[[str], List[float]]
Model_name = _hf_embeddings.model_name

# Credentials come from the environment (hf space secrets); None if unset.
PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
pinecone.init(
    api_key=PINECONE_APIKEY,
    environment='us-west1-gcp'  # only option for free tier
)

class MyPinecone(Pinecone):
  """Pinecone vectorstore variant whose add_texts uses the plain embedding
  callable stored on the instance (see module TODO about PRing langchain to
  consume an Embedding class vs a func)."""

  def add_texts(
      self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
  ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.
        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        # Embed and create the documents
        vectors = []
        ids = []
        for i, text in enumerate(texts):
            doc_id = str(uuid.uuid4())  # renamed from `id` (shadowed builtin)
            embedding = self._embedding_function(text)  # HF embeddings already return a list, no .tolist() needed
            # copy so we never mutate the caller's metadata dicts
            metadata = dict(metadatas[i]) if metadatas else {}
            metadata[self._text_key] = text
            vectors.append((doc_id, embedding, metadata))
            ids.append(doc_id)
        # upsert to Pinecone
        self._index.upsert(vectors=vectors)
        return ids

# The index must already exist in the 'us-west1-gcp' environment initialized above.
Pinecone_index = pinecone.Index('semantic-curations')
# "text" is the metadata key under which each chunk's raw text is stored.
Vdb = MyPinecone(Pinecone_index, Embedder, "text")

def supa_all(supa_data) -> List[dict]:
  """Unwrap a supabase response into its list of row dicts."""
  payload = json.loads(supa_data.json())
  return payload['data']

def transcript2chunks(transcript):
  """Split a transcript string into chunks via spaCy sentence boundaries."""
  print("starting transcript2chunks")
  # TODO what's a good chunk_size?
  # TODO should store as metadata in dbs
  chunks = SpacyTextSplitter(chunk_size=2000).split_text(transcript)
  print("finished chunking")
  return chunks

def video_id_to_media_id(video_id: str) -> Optional[str]:
  """Look up the media_id for an already-ingested YouTube video, or None."""
  rows = supa_all(supabase.table('ingested_youtube_videos').select('media_id').eq('video_id', video_id).execute())
  print(rows)
  # anything other than exactly one row means the video isn't ingested (yet)
  if len(rows) != 1:
    return None
  return rows[0]['media_id']
    
# returns curation_ids that already have the video_id
def check_curations_with_video(video_id: str) -> List[str]:
  """Return the ids of curations that already contain this video ([] if none)."""
  media_id = video_id_to_media_id(video_id)
  print(f"media_id {media_id}")
  if media_id is None:
    return []
  rows = supa_all(supabase.table("junction_curations").select("curation_id").eq('media_id', media_id).execute())
  return [row['curation_id'] for row in rows]

def yt2transcript(video_id):
  """Fetch the YouTube transcript for video_id and join it into one string."""
  print(f"\n\nstarting yt2transcript on id: {video_id}")
  # fragments look like [{'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}, ...]
  fragments = YouTubeTranscriptApi.get_transcript(video_id)
  transcript = ' '.join(fragment['text'] for fragment in fragments)
  print("got transcript")
  # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
  # TODO ought to store timestamp of chunks in metadata for better Sources.
  # - instead of splitting transcript into chunks, can i merge these fragments into approp size? langchain has merge func
  return transcript

def yt_id2name(video_id: str) -> str:
  """Resolve a YouTube video id to its title via pytube."""
  return YouTube(f"https://www.youtube.com/watch?v={video_id}").title

# db guarantees name is unique across rows
def curation_name2id() -> dict:
  """Map curation name -> curation_id for all curations in the db."""
  rows = supa_all(supabase.table("curations_metadata").select("curation_id, name").execute())
  return {row['name']: row['curation_id'] for row in rows}

def get_curation_names():
  """Return every curation name known to the db."""
  return list(curation_name2id())

def get_curations_and_videos():
  """Map curation name -> list of video names it contains (via the join query)."""
  rows = supa_all(supabase.table("curations_metadata").select("curation_id, name, media_id:ingested_youtube_videos ( video_name )").execute())
  grouped = {}
  for row in rows:
    # 'media_id' here is the joined list of video rows for this curation
    for medium in row['media_id']:
      grouped.setdefault(row['name'], []).append(medium['video_name'])
  return grouped

def gen_curation_md():
  """Render every curation and its videos as a markdown outline."""
  sections = []
  for curation_name, video_names in get_curations_and_videos().items():
    # "1. " prefix on every item lets markdown auto-number the list
    sections.append(f"\n## {curation_name}\n" + "1. " + "\n1. ".join(video_names))
  return "".join(sections)

def ingest_video(video_id: str, selected_curation_names: List[str], new_curation: str = ""):
  """Ingest a YouTube video (chunk + embed + store) and attach it to curations.

  Returns a 4-tuple consumed by gradio: (status markdown, update for the
  question-tab curation checkboxes, update for the add-tab curation
  checkboxes, update for the curation-browser markdown).
  """
  video_id = video_id.strip()
  if new_curation:
    curcur = curation_name2id()
    if new_curation in curcur.keys():
      return "dupe curation name", gr.update(), gr.update(), gr.update()
    # add to db here, which will autogen the id
    supabase.table("curations_metadata").insert({"name": new_curation}).execute()
    selected_curation_names.append(new_curation)
  if not selected_curation_names: # contains new_curation at this point
    return "need >=1 curations", gr.update(), gr.update(), gr.update()

  cur_dict = curation_name2id()
  selected_curation_ids = [cur_dict[n] for n in selected_curation_names]
  existing_curations_with_video = check_curations_with_video(video_id)
  # only attach the video to curations that don't already have it
  curations_to_add_video_to = list(set(selected_curation_ids).difference(set(existing_curations_with_video)))
  goal_curations_with_video = existing_curations_with_video + curations_to_add_video_to
  if not curations_to_add_video_to: # video already in all selected curations
    return "dupe video", gr.update(), gr.update(), gr.update()

  if len(existing_curations_with_video) == 0: # no curations have the video, we need to add it to vector db
    assert(goal_curations_with_video == curations_to_add_video_to) # this should be true in this case
    print("new video, processing\n")

    try:
      video_name = yt_id2name(video_id)
    except Exception as e:
      # TODO undo new_curation create supabase.table("curations_metadata").insert({"name": new_curation}).execute()
      #      - in all try/catches. maybe have upper try/catch to do this in one place. extract
      return f"Error loading video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()

    try:
      transcript = yt2transcript(video_id)
    except Exception as e:
      return f"Error fetching transcripts for video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()

    chunks = transcript2chunks(transcript)
    # one metadata dict per chunk; every chunk records all curations holding the video
    metadatas = [{'video_id': video_id, 'video_name': video_name, 'curation_ids': goal_curations_with_video} for c in chunks] # *len() was buggy?

    #import pprint
    #for i, c in enumerate(chunks):
    # print(f"{i}: {c}")
    #print(metadata)
    print("embedding & uploading to vector db TODO how to get progress from langchain?\n")

    # TODO consider storing chunk text in supabase - maybe get more storage out of pinecone's s1 if supabase's free tier is sufficient
    chunk_ids = Vdb.add_texts(chunks, metadatas)
    print("bookkeeping supabase with new video\n")

    # register the video, then record its transcript, curation and vector rows
    inserted_row = supa_all(supabase.table("ingested_youtube_videos").insert({"video_id": video_id,
                                                                              "video_name": video_name}).execute())[0]
    data = supabase.table("ingested_transcripts").insert({'source_id': inserted_row['media_id'],
                                                          'num_chunks': len(chunks),
                                                          'embedding_model': str(Model_name),
                                                          'transcribed_by': 'youtube_transcript_api'}).execute()
    print("\t- transcripts\n")
    data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': inserted_row['media_id']} for c in goal_curations_with_video]).execute()
    print("\t- curations\n")
    data = supabase.table('junction_vectors').insert(  [{'chunk_id':    c, 'media_id': inserted_row['media_id']} for c in chunk_ids                ]).execute()
    print("\t- vectors\n")
  else: # some curations already have video, so no need to chunk+embed+insert into vector db. just adjust bookkeeping in vector db + supa
    print("video already in vector db, updating metadata to include selected curations\n")
    # get media_id of given video
    media_id = video_id_to_media_id(video_id)

    # get chunk_ids for the video
    chunk_rows = supa_all(supabase.table("junction_vectors").select("chunk_id").eq('media_id', media_id).execute())

    # then update metadata of both supabase and vectorDB to include new curations
    for r in chunk_rows:
      update_response = Pinecone_index.update(
        id=r['chunk_id'],
        set_metadata={'curation_ids': goal_curations_with_video}
      )
      # TODO error check update_response
    data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': media_id} for c in curations_to_add_video_to]).execute()

  #curation_ids = [cur_dict[name] for name in curations_to_add_video_to]

  status = "Status: Done! Video added, thanks for contributing :D"
  return status, gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())

def query_llm(prompt):
  """Send prompt to OpenAI text-davinci-003 and return the stripped completion text."""
  completion = openai.Completion.create(
      model='text-davinci-003',
      prompt=prompt,
      temperature=0,
      max_tokens=400,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0,
      #stop=stop_sequence,
  )
  #print(completion)
  return completion["choices"][0]["text"].strip()


# this needn't be in hf space, as it will just call out to openai and the db
# but why not host it here since it's free vs replits 2 cents/day
def ask_question(question: str, openai_apikey: str, curation_names: List[str]):
  """Answer a question from chunks retrieved out of the selected curations.

  Returns (answer markdown, sources markdown).
  """
  if not (question and openai_apikey and curation_names):
    return "error: need all inputs", ""
  openai.api_key = openai_apikey
  # query vector db for topk chunks
  # can't use langchain bc we are using pinecone metadata filtering
  q_embedding = Embedder(question)#.tolist()
  name2id = curation_name2id()
  curation_ids = [name2id[name] for name in curation_names]
  results = Pinecone_index.query(vector=q_embedding, filter={'curation_ids': {"$in": curation_ids}}, top_k=5, include_metadata=True)
  #pprint.pprint(results)
  # TODO add filters to langchain's pinecone impl?
  context_chunks = []
  chunks_by_video = {}
  for match in results['matches']:
    text = match['metadata']['text']
    context_chunks.append(text)
    chunks_by_video.setdefault(match['metadata']['video_name'], []).append(text)
  source_sections = []
  for video_name, texts in chunks_by_video.items():
    source_sections.append(f"### {video_name}\n" + "\n\n---\n\n".join(texts))
  sources_md = "## Sources\n" + "\n\n".join(source_sections)
  # format prompt (textwrap to guarantee length?)
  instr = "Answer the question based on the context below, and if the question can't be answered based on the context, say 'I don't know'.\n\nContext:\n- "
  prompt = instr + "\n- ".join(context_chunks) + f"\n\nQuestion: {question}\n\nAnswer:"
  #pprint.pprint(prompt)

  try:
    answer = "## Answer\n" + query_llm(prompt)
  except Exception as e:
    answer = f"Error: {e}"

  # query llm and return output and topk
  return answer, sources_md

# Gradio UI: three tabs (ask / browse / add) sharing the curation checkbox
# components so a successful ingest or manual refresh updates all of them.
with gr.Blocks() as demo:
  curations_from_db = get_curation_names()
  refresh_button = gr.Button("Synchronize data (with other user's changes)")
  with gr.Tab("Ask a question"):
    q = gr.Textbox(label="Your question")
    openai_apikey = gr.Textbox(label="OpenAI API Key", type="password")
    curation_names_1 = gr.CheckboxGroup(choices=curations_from_db, label="Curations to query")
    button = gr.Button("Submit")
    answer = gr.Markdown(value="")
    sources = gr.Markdown(value="")
    button.click(ask_question, inputs=[q, openai_apikey, curation_names_1], outputs=[answer, sources])
  with gr.Tab("Browse & Organize Curations"):
    # currently unused; kept for the commented-out per-curation accordion UI below
    def refresh_curation_accordion():
      output = gen_curation_md()
      return gr.update(value=output)
      #md.change(fn=refresh_curation_accordion, inputs=[curation_names_1], outputs=[md])
#    for name,id in curation_name2id().items():
#      print(id,name,rows)
#      accordions_state[name] = {'gr_obj': gr.Accordion(name), 'rows': []}
#      with accordions_state[name]['gr_obj']:
#        for i,medium in enumerate(row_d[id]):
#          accordions_state[name]['rows'].append(gr.Row(variant='compact'))
#          with accordions_state[name]['rows'][i]:
#            gr.Markdown(medium['video_name'])
          #delete_button = gr.Button("Delete from Curation")
          #delete_button.click(...)
    #refresh_button = gr.Button("Refresh curations")
    md = gr.Markdown(gen_curation_md())
    #refresh_button.click(fn=refresh_curation_accordion, inputs=[], outputs=[md])
  with gr.Tab("Add data to Curations"):
    gr.Markdown("An hour's worth of video seems to take about a minute to upload (ymmv).")
    video_id = gr.Textbox(label="Youtube video id (NOT full url)", placeholder="lvh3g7eszVQ")
    curation_names_2 = gr.CheckboxGroup(choices=curations_from_db, label="Add to existing Curations")
    new_curation = gr.Textbox(label="and/or add to new Curation")
    button = gr.Button("Submit")
    status_field = gr.Markdown()
    submit_click = button.click(ingest_video, inputs=[video_id, curation_names_2, new_curation], outputs=[status_field, curation_names_1, curation_names_2, md])
    # TODO need to undo rdb and vdb state if cancel clicked
    #cancel_button = gr.Button("Cancel", cancels=[submit_click])

  # re-pull curation lists so one user's additions show up for everyone else
  def refresh_all_curation_lists():
    return gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())
  refresh_button.click(fn=refresh_all_curation_lists, inputs=[], outputs=[curation_names_1, curation_names_2, md])
demo.launch()