Spaces:

marcgreen
/

semantic_curations

Runtime error

App Files Files Community

marcgreen commited on Dec 24, 2022

Commit

b71265c

1 Parent(s): bba49a3

mvp

Browse files

Files changed (1) hide show

app.py +284 -149

app.py CHANGED Viewed

@@ -1,199 +1,334 @@
-"""Interface for Pinecone vector stores."""
-import uuid
-import pinecone
-from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Iterable, List, Optional
-from langchain.docstore.document import Document
-from langchain.embeddings.base import Embeddings
-from langchain.vectorstores.base import VectorStore
-class Pinecone(VectorStore):
-    """Interface for vector stores."""
-    def _query():
-      pass
-    def __init__(
-        self, api_key: str, index_name: str, embedding_function: Callable
-    ):
-        """Initialize with necessary components."""
-        try:
-            import pinecone
-        except ImportError:
-            raise ValueError(
-                "Could not import pinecone python package. "
-                "Please install it with `pip install pinecone-client`."
-            )
-        self.embedding_function = embedding_function
-        self.index_name = index_name
-        #try:
-        pinecone.init(
-           api_key=api_key,
-           environment='us-west1-gcp' # only option for for free tier
-        )
-        #except ValueError as e:
-        #    raise ValueError(
-        #        f"Your elasticsearch client string is misformatted. Got error: {e} "
-         #   )
-        self.client = pinecone
-    def add_texts(
-        self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
-    ) -> None:
-        """Run more texts through the embeddings and add to the vectorstore."""
-        index = self.client.Index(self.index_name)
-        batch_size = 16 # recommended limit is 100 vectors
-        for i in range(0, len(texts), batch_size):
-          i_end = min(i+batch_size, len(texts))
-          text_batch = texts[i:i_end]
-          metadata_batch = metadatas[i:i_end] if metadatas else [{}] * (i_end-i)
-          embedding_batch = self.embedding_function(text_batch) # [[0] * 768] * (i_end - i) #
-          to_upsert = [
-              (
-                  str(uuid.uuid4()), # id that we currently don't care about
-                  embedding.tolist(),
-                  dict(
-                      {"text": text},
-                      **metadata # if 'text' in here too, it takes precendence
-                  )
-              ) for text, embedding, metadata in zip(text_batch, embedding_batch, metadata_batch)
-          ]
-          index.upsert(vectors=to_upsert)
-    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
-        """Return docs most similar to query."""
-        index = self.client.Index(self.index_name)
-        matches = index.query(
-            #namespace="example-namespace",
-            top_k=k,
-            include_values=True,
-            include_metadata=True,
-            vector=query,
-            #filter={
-            #    "genre": {"$in": ["comedy", "documentary", "drama"]}
-            #}
-        )
-        documents = [
-            Document(page_content=match["metadata"]["text"], metadata=match["metadata"]) for match in matches
-        ]
-        return documents
-    @classmethod
-    def from_texts(
-        cls,
-        texts: List[str],
-        embedding: Embeddings,
-        metadatas: Optional[List[dict]] = None,
-        **kwargs: Any
-    ) -> "VectorStore":
-        """Return VectorStore initialized from texts and embeddings."""
-# TODO fill out other 2 methods for Pinecone Vectore Store and ask if harrison would be open to a PR
 # TODO account for mpnet's limit of 384 word pieces per chunk (is it done already?)
-# DONE need to check if embeddings exist for given video id before generating embeddings
 # supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
 # - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
 # - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
 # TODO user prefs data model (their curations)
 # - meh not needed at first
-# DONE design main workflows
-# DONE curation data model
-# TODO frontend (discord bot or gradio or)
-# - i also want to be able to give it a yt vid and have it summarize it for me
-# TODO workflow for curating videos into sets (aka Curations)
-# TODO workflow to ask Curations a question
-# - LEFT OFF here
 # TODO support yt playlists in addition to just one-off videos
 # - can i make this really easy to add via a well designed api?
 # TODO finalize deployment strategy
 # - supabase free tier for db + blob storage of transcripts
 # - hf space to host model computations (langchain bits need to run here)
 # - replit or supabase to host edge functiosn to call hf space
-# TODO gradio session state to track recently asked questions
 import os
 import json
 import gradio as gr
 from langchain.text_splitter import SpacyTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from youtube_transcript_api import YouTubeTranscriptApi
-embedder = HuggingFaceEmbeddings().embed_documents
-model_name = HuggingFaceEmbeddings().model_name
-from supabase import create_client, Client
 PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
 SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
 SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")
 supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
-pinecone = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
-def transcript2chunks(transcript):
-  return SpacyTextSplitter().split_text(transcript)
-def ingest_transcript(transcript):
-  p = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
-  chunks = transcript2chunks(transcript)
-  p.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))
-def already_ingested(yt_video_id: str):
-  data = supabase.table("ingested_youtube_videos").select("*", count="estimated").eq('video_id', yt_video_id).execute()
-  return data.count > 0
 def yt2transcript(video_id):
-  print(f"\n\nid: {video_id}")
   # data looks like [{'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}, ...]
   data = YouTubeTranscriptApi.get_transcript(video_id)
   transcript = ' '.join([x['text'] for x in data])
   # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
   return transcript
-def ingest_video(video_id):
-  if already_ingested(video_id):
-    return "dupe"
-  else:
-    transcript = yt2transcript(video_id)
     chunks = transcript2chunks(transcript)
-    pinecone.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))
-    inserted_row_data = supabase.table("ingested_youtube_videos").insert({"video_id": video_id}).execute()
-    inserted_row_json = json.loads(inserted_row_data.json())
-    print(inserted_row_json)
-    inserted_row = inserted_row_json['data'][0]
     data = supabase.table("ingested_transcripts").insert({'source_id': inserted_row['media_id'],
                                                           'num_chunks': len(chunks),
-                                                          'embedding_model': str(embedder),
                                                           'transcribed_by': 'youtube_transcript_api'}).execute()
-    return f"ingested {len(chunks)} chunks"
 # this needn't be in hf space, as it will just call out to openai and the db
 # but why not host it here since it's free vs replits 2 cents/day
-def ask_question(question: str, instruction: str, curation_ids: List[str]):
   # query vector db for topk chunks
-  topk = pinecone.similarity_search(question, k = 10)
   # format prompt (textwrap to guarantee length?)
   # query llm and return output and topk
-  pass
-# this needn't be in hf space, as it will just interact with db's community and user Curations
-# but why not host it here
-# will be updating the pinecone metadata/namespace with teh curation ids it's a part of
-# - but that would tie the app to pinecone....
-# - so maybe I do want to use elastic 8,5 since it can do both? need nontrivial specs
-# - using metadata would let us assign multiple curation ids to each video, whereas namespace could only be 1
-def organize_curations():
-  pass
-demo = gr.Interface(fn=ingest_video, inputs=["text"], outputs=["text"])
-                    #article="\n".join([f"- {k}: " + v.replace("\n"," ") for k,v in instructions.items()]))
-demo.launch()
-#ingest_video(video_id)

+# TODO some inline todos below that should reduce need to reset/rollback DBs
+# - how to easily rollback bad data?
+# TODO harrison thinks editing vectorDB abstraction to consume Embedding class vs func is a good approach -> need to PR this
+# TODO can i generalize the query filter approach (add to langchain?) to remove coupling to pinecone?
+# - i believe elastic8.5 supports rdb and vdb, but need nontrivial specs to run it i think
 # TODO account for mpnet's limit of 384 word pieces per chunk (is it done already?)
 # supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
 # - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
 # - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
 # TODO user prefs data model (their curations)
 # - meh not needed at first
+# TODO summarize a vid (and optionally add to curation)
 # TODO support yt playlists in addition to just one-off videos
 # - can i make this really easy to add via a well designed api?
 # TODO finalize deployment strategy
 # - supabase free tier for db + blob storage of transcripts
 # - hf space to host model computations (langchain bits need to run here)
 # - replit or supabase to host edge functiosn to call hf space
+# TODO gradio global state to track recently asked questions from everyone
+# TODO add discord/github/google auth...via custom js? see supabase docs
+# - make users maintainers of their own curations, restrict add perms, introduce edit/delete/clone perms
+# - add stars to curations+users profile -> display starred curations first, then sort by most popular
+# - securely store user's openai key in supabase for convenience
+# TODO create pinecone index without indexing text metadata field for performance: https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
+# TODO could use pinecone namespace per embedding model
+# TODO let user customize instr (via langchain's jinja support?)
+# - better: make easy to experiment with langchain's chains/agents
+# - maybe something like model_laboratory with gradio's Parallel block?
 import os
 import json
+import uuid
+from typing import Iterable, List, Optional
 import gradio as gr
+import openai
+import pinecone
+from gradio import blocks
+from supabase import create_client, Client
 from langchain.text_splitter import SpacyTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Pinecone
+from pytube import YouTube
 from youtube_transcript_api import YouTubeTranscriptApi
+# Construct the embeddings object once and reuse it for both the query
+# embedding function and the model name recorded with ingested transcripts,
+# instead of instantiating HuggingFaceEmbeddings twice.
+_Embeddings = HuggingFaceEmbeddings()
+Embedder = _Embeddings.embed_query
+Model_name = _Embeddings.model_name
+# Credentials come from the environment; os.environ.get yields None when a variable is unset.
 PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
 SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
 SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")
 supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+pinecone.init(
+    api_key=PINECONE_APIKEY,
+    environment='us-west1-gcp',  # only option for the free tier
+)
+class MyPinecone(Pinecone):
+  """Pinecone subclass that supplies its own add_texts.
+  Each text is embedded individually and upserted together with a per-text
+  metadata dict that also carries the text itself under self._text_key.
+  """
+  def add_texts(
+      self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
+  ) -> List[str]:
+    """Run more texts through the embeddings and add to the vectorstore.
+    Args:
+        texts: Iterable of strings to add to the vectorstore.
+        metadatas: Optional list of metadatas associated with the texts.
+            The dicts are copied, never modified, so a caller may reuse them
+            or even share one dict between entries (e.g. `[meta] * n`).
+    Returns:
+        List of ids from adding the texts into the vectorstore.
+    """
+    # Embed and create the documents
+    docs = []
+    ids = []
+    for i, text in enumerate(texts):
+      doc_id = str(uuid.uuid4())
+      # NOTE(review): .tolist() assumes the embedding function returns an
+      # array-like object rather than a plain list - confirm against Embedder.
+      embedding = self._embedding_function(text).tolist()
+      # Copy before writing the text key: mutating the caller's dict would
+      # leave every entry of a shared-dict list holding the last text.
+      metadata = dict(metadatas[i]) if metadatas else {}
+      metadata[self._text_key] = text
+      docs.append((doc_id, embedding, metadata))
+      ids.append(doc_id)
+    # upsert to Pinecone
+    self._index.upsert(vectors=docs)
+    return ids
+# Handle to the 'semantic-curations' Pinecone index; besides backing Vdb it is
+# used directly below for metadata updates and filtered queries.
+Pinecone_index = pinecone.Index('semantic-curations')
+# Vector store over that index using Embedder. The third argument is presumably
+# the metadata key under which chunk text is stored - confirm against Pinecone.__init__.
+Vdb = MyPinecone(Pinecone_index, Embedder, "text")
+def supa_all(supa_data) -> List[dict]:
+  """Return the value stored under 'data' in a Supabase response.
+  The response is serialized with its own .json() method and parsed back,
+  so the result consists of plain Python objects.
+  """
+  return json.loads(supa_data.json())['data']
+def transcript2chunks(transcript):
+  """Split a transcript string into chunks using SpacyTextSplitter(chunk_size=2000)."""
+  print("starting transcript2chunks")
+  # TODO what's a good chunk_size?
+  # TODO should store as metadata in dbs
+  splitter = SpacyTextSplitter(chunk_size = 2000)
+  chunks = splitter.split_text(transcript)
+  print("finished chunking")
+  return chunks
+def video_id_to_media_id(video_id: str) -> Optional[str]:
+  """Look up the media_id recorded for a YouTube video_id.
+  Returns the media_id when exactly one row of ingested_youtube_videos
+  matches, and None for any other number of matches.
+  """
+  lookup = supabase.table('ingested_youtube_videos').select('media_id').eq('video_id', video_id)
+  rows = supa_all(lookup.execute())
+  print(rows)
+  if len(rows) != 1:
+    return None
+  return rows[0]['media_id']
+# returns curation_ids that already have the video_id
+def check_curations_with_video(video_id: str) -> List[str]:
+  """List the curation_ids whose junction_curations rows reference this video.
+  An unknown video (no media_id found) yields an empty list.
+  """
+  media_id = video_id_to_media_id(video_id)
+  print(f"media_id {media_id}")
+  if media_id is None:
+    return []
+  response = supabase.table("junction_curations").select("curation_id").eq('media_id', media_id).execute()
+  return [row['curation_id'] for row in supa_all(response)]
 def yt2transcript(video_id):
+  """Fetch the transcript of a YouTube video and return it as one space-joined string."""
+  print(f"\n\nstarting yt2transcript on id: {video_id}")
   # data looks like [{'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}, ...]
+  fragments = YouTubeTranscriptApi.get_transcript(video_id)
+  texts = [fragment['text'] for fragment in fragments]
+  transcript = ' '.join(texts)
+  print("got transcript")
   # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
+  # TODO ought to store timestamp of chunks in metadata for better Sources.
+  # - instead of splitting transcript into chunks, can i merge these fragments into approp size? langchain has merge func
   return transcript
+def yt_id2name(video_id: str) -> str:
+  """Return the title pytube reports for the YouTube video with this id."""
+  watch_url = f"https://www.youtube.com/watch?v={video_id}"
+  return YouTube(watch_url).title
+# db guarantees name is unique across rows
+def curation_name2id() -> dict:
+  """Map each curation name in curations_metadata to its curation_id."""
+  response = supabase.table("curations_metadata").select("curation_id, name").execute()
+  return {row['name']: row['curation_id'] for row in supa_all(response)}
+def get_curation_names():
+  """Return the names of all curations as a list."""
+  return list(curation_name2id())
+def get_curations_and_videos():
+  """Return a dict mapping curation name to the list of its video names.
+  Built from a nested select on curations_metadata; a curation whose nested
+  video list is empty gets no entry in the result.
+  """
+  response = supabase.table("curations_metadata").select("curation_id, name, media_id:ingested_youtube_videos ( video_name )").execute()
+  videos_by_curation = {}
+  for row in supa_all(response):
+    video_names = [medium['video_name'] for medium in row['media_id']]
+    if video_names:
+      videos_by_curation.setdefault(row['name'], []).extend(video_names)
+  return videos_by_curation
+def gen_curation_md():
+  """Render all curations as markdown: a '## name' heading followed by a numbered list of video names."""
+  sections = [
+    f"\n## {curation_name}\n" + "1. " + "\n1. ".join(video_names)
+    for curation_name, video_names in get_curations_and_videos().items()
+  ]
+  return "".join(sections)
+def ingest_video(video_id: str, selected_curation_names: List[str], new_curation: str = ""):
+  """Add a YouTube video to the selected (and/or a newly created) curations.
+  Args:
+      video_id: YouTube video id; surrounding whitespace is stripped.
+      selected_curation_names: names of existing curations to add the video to.
+      new_curation: optional name of a curation to create and add the video to.
+  Returns:
+      A 4-tuple (status message, update, update, update). On every early
+      exit the three updates are no-op gr.update() values; on success they
+      carry refreshed curation choices (twice) and the regenerated curation
+      markdown.
+  A video no curation holds yet is fetched, chunked, embedded into the vector
+  db and recorded in supabase. A video some curation already holds only gets
+  its chunk metadata and junction rows extended.
+  """
+  video_id = video_id.strip()
+  if new_curation:
+    curcur = curation_name2id()
+    if new_curation in curcur.keys():
+      return "dupe curation name", gr.update(), gr.update(), gr.update()
+    # add to db here, which will autogen the id
+    supabase.table("curations_metadata").insert({"name": new_curation}).execute()
+    # NOTE(review): this appends to the caller's list in place.
+    selected_curation_names.append(new_curation)
+  if not selected_curation_names: # contains new_curation at this point
+    return "need >=1 curations", gr.update(), gr.update(), gr.update()
+  # Re-read the name->id map so a curation created just above is included.
+  cur_dict = curation_name2id()
+  selected_curation_ids = [cur_dict[n] for n in selected_curation_names]
+  existing_curations_with_video = check_curations_with_video(video_id)
+  # Only curations that do not hold the video yet need new junction rows.
+  curations_to_add_video_to = list(set(selected_curation_ids).difference(set(existing_curations_with_video)))
+  # Full set of curations the video belongs to once this call succeeds.
+  goal_curations_with_video = existing_curations_with_video + curations_to_add_video_to
+  if not curations_to_add_video_to: # video already in all selected curations
+    return "dupe video", gr.update(), gr.update(), gr.update()
+  if len(existing_curations_with_video) == 0: # no curations have the video, we need to add it to vector db
+    assert(goal_curations_with_video == curations_to_add_video_to) # this should be true in this case
+    print("new video, processing\n")
+    try:
+      video_name = yt_id2name(video_id)
+    except Exception as e:
+      # TODO undo new_curation create supabase.table("curations_metadata").insert({"name": new_curation}).execute()
+      #      - in all try/catches. maybe have upper try/catch to do this in one place. extract
+      return f"Error loading video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()
+    try:
+      transcript = yt2transcript(video_id)
+    except Exception as e:
+      return f"Error fetching transcripts for video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()
     chunks = transcript2chunks(transcript)
+    # One separate dict per chunk; a list built with `* len()` would share a
+    # single dict object between all chunks.
+    metadatas = [{'video_id': video_id, 'video_name': video_name, 'curation_ids': goal_curations_with_video} for c in chunks] # *len() was buggy?
+    #import pprint
+    #for i, c in enumerate(chunks):
+    # print(f"{i}: {c}")
+    #print(metadata)
+    print("embedding & uploading to vector db TODO how to get progress from langchain?\n")
+    # TODO consider storing chunk text in supabase - maybe get more storage out of pinecone's s1 if supabase's free tier is sufficient
+    chunk_ids = Vdb.add_texts(chunks, metadatas)
+    print("bookkeeping supabase with new video\n")
+    # The inserted row is read back to obtain its media_id for the rows below.
+    inserted_row = supa_all(supabase.table("ingested_youtube_videos").insert({"video_id": video_id,
+                                                                              "video_name": video_name}).execute())[0]
     data = supabase.table("ingested_transcripts").insert({'source_id': inserted_row['media_id'],
                                                           'num_chunks': len(chunks),
+                                                          'embedding_model': str(Model_name),
                                                           'transcribed_by': 'youtube_transcript_api'}).execute()
+    print("\t- transcripts\n")
+    data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': inserted_row['media_id']} for c in goal_curations_with_video]).execute()
+    print("\t- curations\n")
+    # Record which vector ids belong to this video so their metadata can be updated later.
+    data = supabase.table('junction_vectors').insert(  [{'chunk_id':    c, 'media_id': inserted_row['media_id']} for c in chunk_ids                ]).execute()
+    print("\t- vectors\n")
+  else: # some curations already have video, so no need to chunk+embed+insert into vector db. just adjust bookkeeping in vector db + supa
+    print("video already in vector db, updating metadata to include selected curations\n")
+    # get media_id of given video
+    media_id = video_id_to_media_id(video_id)
+    # get chunk_ids for the video
+    chunk_rows = supa_all(supabase.table("junction_vectors").select("chunk_id").eq('media_id', media_id).execute())
+    # then update metadata of both supabase and vectorDB to include new curations
+    for r in chunk_rows:
+      update_response = Pinecone_index.update(
+        id=r['chunk_id'],
+        set_metadata={'curation_ids': goal_curations_with_video}
+      )
+      # TODO error check update_response
+    data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': media_id} for c in curations_to_add_video_to]).execute()
+  #curation_ids = [cur_dict[name] for name in curations_to_add_video_to]
+  status = "Status: Done! Video added, thanks for contributing :D"
+  return status, gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())
+def query_llm(prompt):
+  """Send the prompt to OpenAI's text-davinci-003 completion model and return the first choice's text, stripped."""
+  completion = openai.Completion.create(
+      model='text-davinci-003',
+      prompt=prompt,
+      max_tokens=400,
+      temperature=0,
+      top_p=1,
+      frequency_penalty=0,
+      presence_penalty=0,
+      #stop=stop_sequence,
+  )
+  #print(completion)
+  first_choice = completion["choices"][0]
+  return first_choice["text"].strip()
 # this needn't be in hf space, as it will just call out to openai and the db
 # but why not host it here since it's free vs replits 2 cents/day
+def ask_question(question: str, openai_apikey: str, curation_names: List[str]):
+  """Answer a question from the chunks stored for the chosen curations.
+  Returns a pair of markdown strings (answer, sources). When any input is
+  empty the pair is ("error: need all inputs", ""). A failure while querying
+  the LLM is reported in the answer string instead of being raised.
+  """
+  if not (question and openai_apikey and curation_names):
+    return "error: need all inputs", ""
+  openai.api_key = openai_apikey
   # query vector db for topk chunks
+  # can't use langchain bc we are using pinecone metadata filtering
+  q_embedding = Embedder(question).tolist()
+  name_to_id = curation_name2id()
+  curation_ids = [name_to_id[name] for name in curation_names]
+  results = Pinecone_index.query(vector=q_embedding, filter={'curation_ids': {"$in": curation_ids}}, top_k=5, include_metadata=True)
+  #pprint.pprint(results)
+  # TODO add filters to langchain's pinecone impl?
+  chunks = []
+  sources = {}
+  for match in results['matches']:
+    chunk = match['metadata']['text']
+    chunks.append(chunk)
+    video_name = match['metadata']['video_name']
+    sources.setdefault(video_name, []).append(chunk)
+  # One "### video name" section per source video, its chunks separated by rules.
+  source_sections = []
+  for name, video_chunks in sources.items():
+    source_sections.append(f"### {name}\n" + "\n\n---\n\n".join([f'{c}' for c in video_chunks]))
+  sources_md = "## Sources\n" + "\n\n".join(source_sections)
   # format prompt (textwrap to guarantee length?)
+  instr = "Answer the question based on the context below, and if the question can't be answered based on the context, say 'I don't know'.\n\nContext:\n- "
+  prompt = instr + "\n- ".join(chunks) + f"\n\nQuestion: {question}\n\nAnswer:"
+  #pprint.pprint(prompt)
+  try:
+    answer = "## Answer\n" + query_llm(prompt)
+  except Exception as e:
+    answer = f"Error: {e}"
   # query llm and return output and topk
+  return answer, sources_md
+# Gradio UI: a refresh button and three tabs (ask, browse, add), then launch the app.
+with gr.Blocks() as demo:
+  # Curation names are read once while the UI is built; the refresh button
+  # below re-reads them on demand.
+  curations_from_db = get_curation_names()
+  refresh_button = gr.Button("Synchronize data (with other user's changes)")
+  with gr.Tab("Ask a question"):
+    q = gr.Textbox(label="Your question")
+    openai_apikey = gr.Textbox(label="OpenAI API Key", type="password")
+    curation_names_1 = gr.CheckboxGroup(choices=curations_from_db, label="Curations to query")
+    button = gr.Button("Submit")
+    answer = gr.Markdown(value="")
+    sources = gr.Markdown(value="")
+    button.click(ask_question, inputs=[q, openai_apikey, curation_names_1], outputs=[answer, sources])
+  with gr.Tab("Browse & Organize Curations"):
+    # NOTE(review): no active line in this block references this helper; its
+    # only call sites are commented out.
+    def refresh_curation_accordion():
+      output = gen_curation_md()
+      return gr.update(value=output)
+      #md.change(fn=refresh_curation_accordion, inputs=[curation_names_1], outputs=[md])
+#    for name,id in curation_name2id().items():
+#      print(id,name,rows)
+#      accordions_state[name] = {'gr_obj': gr.Accordion(name), 'rows': []}
+#      with accordions_state[name]['gr_obj']:
+#        for i,medium in enumerate(row_d[id]):
+#          accordions_state[name]['rows'].append(gr.Row(variant='compact'))
+#          with accordions_state[name]['rows'][i]:
+#            gr.Markdown(medium['video_name'])
+          #delete_button = gr.Button("Delete from Curation")
+          #delete_button.click(...)
+    #refresh_button = gr.Button("Refresh curations")
+    md = gr.Markdown(gen_curation_md())
+    #refresh_button.click(fn=refresh_curation_accordion, inputs=[], outputs=[md])
+  with gr.Tab("Add data to Curations"):
+    gr.Markdown("An hour's worth of video seems to take about a minute to upload (ymmv).")
+    video_id = gr.Textbox(label="Youtube video id (NOT full url)", placeholder="lvh3g7eszVQ")
+    curation_names_2 = gr.CheckboxGroup(choices=curations_from_db,
+                                      #isible=len(Cur_keys) > 0,
+                                      label="Add to existing Curations")
+    new_curation = gr.Textbox(label="and/or add to new Curation")
+    # This rebinds `button`; the earlier Submit button already has its click handler attached.
+    button = gr.Button("Submit")
+    status_field = gr.Markdown()
+    # NOTE(review): the click wiring below is commented out, so this tab's
+    # Submit button has no handler and ingest_video is not reachable from the
+    # UI - confirm this is intentional.
+    # TODO need to undo rdb and vdb state if cancel clicked
+    #submit_click = button.click(ingest_video, inputs=[video_id, curation_names_2, new_curation], outputs=[status_field, curation_names_1, curation_names_2, md])
+    #cancel_button = gr.Button("Cancel", cancels=[submit_click])
+  # Re-read curations from the db and push them into both checkbox groups and the browse markdown.
+  def refresh_all_curation_lists():
+    return gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())
+  refresh_button.click(fn=refresh_all_curation_lists, inputs=[], outputs=[curation_names_1, curation_names_2, md])
+demo.launch()