marcgreen commited on
Commit
669b468
·
1 Parent(s): 26ca549

copy insertion logic (+ WIP langchain pinecone impl) from colab

Browse files
Files changed (1) hide show
  1. app.py +198 -0
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Interface for Pinecone vector stores."""
2
+ import uuid
3
+ import pinecone
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any, Callable, Dict, Iterable, List, Optional
6
+
7
+ from langchain.docstore.document import Document
8
+ from langchain.embeddings.base import Embeddings
9
+ from langchain.vectorstores.base import VectorStore
10
+
11
+
12
class Pinecone(VectorStore):
    """Pinecone-backed vector store.

    `add_texts` embeds text chunks and upserts them into a Pinecone index;
    `similarity_search` embeds a query and returns the closest chunks as
    langchain Documents. `from_texts` is still a WIP stub.
    """

    def __init__(
        self, api_key: str, index_name: str, embedding_function: Callable
    ):
        """Initialize with necessary components.

        Args:
            api_key: Pinecone API key.
            index_name: name of an existing Pinecone index.
            embedding_function: maps a list of texts to a list of embedding
                vectors (e.g. ``HuggingFaceEmbeddings().embed_documents``).

        Raises:
            ValueError: if the pinecone client package is not installed.
        """
        try:
            import pinecone
        except ImportError:
            raise ValueError(
                "Could not import pinecone python package. "
                "Please install it with `pip install pinecone-client`."
            )
        self.embedding_function = embedding_function
        self.index_name = index_name
        pinecone.init(
            api_key=api_key,
            environment='us-west1-gcp'  # only option for free tier
        )
        # Keep a handle on the configured module; Index objects are created
        # per call so they always reflect the init() above.
        self.client = pinecone

    def _query(self):
        # WIP stub copied from colab; not used anywhere yet.
        pass

    def add_texts(
        self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
    ) -> None:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: text chunks to embed and upsert.
            metadatas: optional per-chunk metadata dicts, parallel to texts.
        """
        # BUG FIX: the code below needs len() and slicing, which a bare
        # Iterable does not guarantee — materialize once up front.
        texts = list(texts)
        index = self.client.Index(self.index_name)
        batch_size = 16  # Pinecone's recommended upsert limit is 100 vectors
        for i in range(0, len(texts), batch_size):
            i_end = min(i + batch_size, len(texts))
            text_batch = texts[i:i_end]
            metadata_batch = metadatas[i:i_end] if metadatas else [{}] * (i_end - i)
            embedding_batch = self.embedding_function(text_batch)
            to_upsert = [
                (
                    str(uuid.uuid4()),  # ids are currently throwaway
                    # BUG FIX: embedding functions may return numpy arrays
                    # (with .tolist()) or plain python lists — handle both.
                    embedding.tolist() if hasattr(embedding, "tolist") else list(embedding),
                    # if 'text' is also in metadata, the metadata value wins
                    dict({"text": text}, **metadata),
                )
                for text, embedding, metadata in zip(text_batch, embedding_batch, metadata_batch)
            ]
            index.upsert(vectors=to_upsert)

    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: natural-language query text.
            k: number of nearest chunks to return.
        """
        index = self.client.Index(self.index_name)
        # BUG FIX: Pinecone expects an embedding vector, not raw text —
        # embed the query with the same function used at insert time.
        query_embedding = self.embedding_function([query])[0]
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()
        response = index.query(
            #namespace="example-namespace",
            top_k=k,
            include_values=True,
            include_metadata=True,
            vector=query_embedding,
            #filter={
            #    "genre": {"$in": ["comedy", "documentary", "drama"]}
            #}
        )
        # BUG FIX: the hits live under "matches" in the query response;
        # iterating the response object directly does not yield match dicts.
        return [
            Document(page_content=match["metadata"]["text"], metadata=match["metadata"])
            for match in response["matches"]
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any
    ) -> "VectorStore":
        """Return VectorStore initialized from texts and embeddings."""
        # WIP: not implemented yet — currently returns None.
92
+
93
+ # TODO fill out other 2 methods for Pinecone Vector Store and ask if Harrison would be open to a PR
94
+ # TODO account for mpnet's limit of 384 word pieces per chunk (is it done already?)
95
+ # DONE need to check if embeddings exist for given video id before generating embeddings
96
+ # supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
97
+ # - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
98
+ # - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
99
+ # TODO user prefs data model (their curations)
100
+ # - meh not needed at first
101
+ # DONE design main workflows
102
+ # DONE curation data model
103
+ # TODO frontend (discord bot or gradio or)
104
+ # - i also want to be able to give it a yt vid and have it summarize it for me
105
+ # TODO workflow for curating videos into sets (aka Curations)
106
+ # TODO workflow to ask Curations a question
107
+ # - LEFT OFF here
108
+ # TODO support yt playlists in addition to just one-off videos
109
+ # - can i make this really easy to add via a well designed api?
110
+ # TODO finalize deployment strategy
111
+ # - supabase free tier for db + blob storage of transcripts
112
+ # - hf space to host model computations (langchain bits need to run here)
113
+ # - replit or supabase to host edge functions to call hf space
114
+ # TODO gradio session state to track recently asked questions
115
+
116
import json
import os  # BUG FIX: os.environ is read below but `os` was never imported

import gradio as gr

from langchain.text_splitter import SpacyTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from youtube_transcript_api import YouTubeTranscriptApi
from supabase import create_client, Client

# Instantiate the embedding model once (it was previously constructed twice,
# loading the HF model into memory two times for no benefit).
_hf_embeddings = HuggingFaceEmbeddings()
embedder = _hf_embeddings.embed_documents
model_name = _hf_embeddings.model_name

PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
# NOTE(review): this rebinding shadows the `pinecone` module imported at the
# top of the file with our Pinecone wrapper instance. The class keeps its own
# module handle internally so this works, but consider renaming the variable.
pinecone = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
135
+
136
def transcript2chunks(transcript):
    """Split a raw transcript string into sentence-based text chunks."""
    splitter = SpacyTextSplitter()
    chunks = splitter.split_text(transcript)
    return chunks
138
+
139
def ingest_transcript(transcript, video_id: str = ""):
    """Chunk a transcript and upsert the chunks into the Pinecone index.

    Args:
        transcript: full transcript text of one video.
        video_id: YouTube video id stored as per-chunk metadata.
            BUG FIX: `video_id` was previously an undefined free variable
            (NameError at call time); it is now a defaulted parameter, so
            existing single-argument callers keep working.
    """
    p = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
    chunks = transcript2chunks(transcript)
    p.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))
143
+
144
def already_ingested(yt_video_id: str):
    """Return True if this video id is already recorded in supabase."""
    result = (
        supabase.table("ingested_youtube_videos")
        .select("*", count="estimated")
        .eq('video_id', yt_video_id)
        .execute()
    )
    return result.count > 0
147
+
148
def yt2transcript(video_id):
    """Fetch a YouTube video's transcript and join it into a single string."""
    print(f"\n\nid: {video_id}")
    # Each entry looks like:
    # {'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}
    entries = YouTubeTranscriptApi.get_transcript(video_id)
    # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
    return ' '.join(entry['text'] for entry in entries)
155
+
156
def ingest_video(video_id):
    """Ingest one YouTube video: transcript -> chunks -> pinecone + supabase.

    Returns "dupe" when the video was already ingested, otherwise a short
    status string with the number of chunks upserted.
    """
    # Guard clause: skip videos we have already processed.
    if already_ingested(video_id):
        return "dupe"

    transcript = yt2transcript(video_id)
    chunks = transcript2chunks(transcript)
    pinecone.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))

    # Record the video in supabase and pull back the generated row.
    inserted_row_data = supabase.table("ingested_youtube_videos").insert({"video_id": video_id}).execute()
    inserted_row_json = json.loads(inserted_row_data.json())
    print(inserted_row_json)
    inserted_row = inserted_row_json['data'][0]

    # Link the transcript metadata to the inserted video row.
    data = supabase.table("ingested_transcripts").insert(
        {
            'source_id': inserted_row['media_id'],
            'num_chunks': len(chunks),
            'embedding_model': str(embedder),
            'transcribed_by': 'youtube_transcript_api',
        }
    ).execute()
    return f"ingested {len(chunks)} chunks"
173
+
174
+ # this needn't be in hf space, as it will just call out to openai and the db
175
+ # but why not host it here since it's free vs replits 2 cents/day
176
def ask_question(question: str, instruction: str, curation_ids: List[str]):
    """WIP: answer a question against ingested content.

    Currently only fetches the top-k similar chunks and returns None;
    prompt formatting and the LLM call are still to be written.
    """
    # query vector db for topk chunks
    topk = pinecone.similarity_search(question, k=10)

    # format prompt (textwrap to guarantee length?)

    # query llm and return output and topk
184
+
185
+ # this needn't be in hf space, as it will just interact with db's community and user Curations
186
+ # but why not host it here
187
+ # will be updating the pinecone metadata/namespace with the curation ids it's a part of
188
+ # - but that would tie the app to pinecone....
189
+ # - so maybe I do want to use elastic 8,5 since it can do both? need nontrivial specs
190
+ # - using metadata would let us assign multiple curation ids to each video, whereas namespace could only be 1
191
def organize_curations():
    """WIP placeholder: manage community/user Curations (no-op for now)."""
    return None
193
+
194
# Minimal gradio UI: a single text input (a YouTube video id) is passed to
# ingest_video, whose status string is shown as the text output.
demo = gr.Interface(fn=ingest_video, inputs=["text"], outputs=["text"])
#article="\n".join([f"- {k}: " + v.replace("\n"," ") for k,v in instructions.items()]))
# launch() blocks and serves the app (HF Spaces runs this module directly).
demo.launch()
197
+
198
+ #ingest_video(video_id)