Spaces:
Runtime error
Runtime error
copy insertion logic (+ WIP langchain pinecone impl) from colab
Browse files
app.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Interface for Pinecone vector stores."""
|
| 2 |
+
import uuid
|
| 3 |
+
import pinecone
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
from typing import Any, Callable, Dict, Iterable, List, Optional
|
| 6 |
+
|
| 7 |
+
from langchain.docstore.document import Document
|
| 8 |
+
from langchain.embeddings.base import Embeddings
|
| 9 |
+
from langchain.vectorstores.base import VectorStore
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Pinecone(VectorStore):
    """Vector store backed by a Pinecone index."""

    def _query():
        # WIP placeholder; never called anywhere in this file.
        pass

    def __init__(
        self, api_key: str, index_name: str, embedding_function: Callable
    ):
        """Initialize with necessary components.

        Args:
            api_key: Pinecone API key.
            index_name: Name of an existing Pinecone index.
            embedding_function: Callable mapping a list of texts to a list
                of embedding vectors (lists of floats or numpy arrays).
        """
        try:
            import pinecone
        except ImportError:
            raise ValueError(
                "Could not import pinecone python package. "
                "Please install it with `pip install pinecone-client`."
            )
        self.embedding_function = embedding_function
        self.index_name = index_name
        pinecone.init(
            api_key=api_key,
            environment='us-west1-gcp'  # only option for the free tier
        )
        self.client = pinecone

    def add_texts(
        self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
    ) -> None:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Texts to embed and upsert.
            metadatas: Optional per-text metadata dicts, parallel to `texts`.
        """
        index = self.client.Index(self.index_name)
        texts = list(texts)  # accept arbitrary iterables; we slice below
        batch_size = 16  # Pinecone's recommended upsert limit is 100 vectors
        for i in range(0, len(texts), batch_size):
            i_end = min(i + batch_size, len(texts))
            text_batch = texts[i:i_end]
            metadata_batch = metadatas[i:i_end] if metadatas else [{}] * (i_end - i)
            embedding_batch = self.embedding_function(text_batch)
            to_upsert = [
                (
                    str(uuid.uuid4()),  # ids are not currently meaningful
                    # Support both numpy arrays and plain list embeddings;
                    # the original unconditionally called .tolist(), which
                    # raises AttributeError for list-of-float embeddings.
                    embedding.tolist() if hasattr(embedding, "tolist") else list(embedding),
                    dict(
                        {"text": text},
                        **metadata  # a 'text' key in metadata takes precedence
                    ),
                )
                for text, embedding, metadata in zip(text_batch, embedding_batch, metadata_batch)
            ]
            index.upsert(vectors=to_upsert)

    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Raw query text; embedded here before querying Pinecone.
            k: Number of matches to return.
        """
        index = self.client.Index(self.index_name)
        # Pinecone expects an embedding vector, not the raw query string
        # (the original passed `vector=query`, which the API rejects).
        query_embedding = self.embedding_function([query])[0]
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()
        response = index.query(
            #namespace="example-namespace",
            top_k=k,
            include_values=True,
            include_metadata=True,
            vector=query_embedding,
            #filter={
            #    "genre": {"$in": ["comedy", "documentary", "drama"]}
            #}
        )
        # Hits live under the response's "matches" key; iterating the
        # response object directly (as the original did) does not yield them.
        return [
            Document(page_content=match["metadata"]["text"], metadata=match["metadata"])
            for match in response["matches"]
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any
    ) -> "VectorStore":
        """Return VectorStore initialized from texts and embeddings."""
|
| 92 |
+
|
| 93 |
+
# TODO fill out other 2 methods for Pinecone Vector Store and ask if harrison would be open to a PR
|
| 94 |
+
# TODO account for mpnet's limit of 384 word pieces per chunk (is it done already?)
|
| 95 |
+
# DONE need to check if embeddings exist for given video id before generating embeddings
|
| 96 |
+
# supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
|
| 97 |
+
# - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
|
| 98 |
+
# - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
|
| 99 |
+
# TODO user prefs data model (their curations)
|
| 100 |
+
# - meh not needed at first
|
| 101 |
+
# DONE design main workflows
|
| 102 |
+
# DONE curation data model
|
| 103 |
+
# TODO frontend (discord bot or gradio or)
|
| 104 |
+
# - i also want to be able to give it a yt vid and have it summarize it for me
|
| 105 |
+
# TODO workflow for curating videos into sets (aka Curations)
|
| 106 |
+
# TODO workflow to ask Curations a question
|
| 107 |
+
# - LEFT OFF here
|
| 108 |
+
# TODO support yt playlists in addition to just one-off videos
|
| 109 |
+
# - can i make this really easy to add via a well designed api?
|
| 110 |
+
# TODO finalize deployment strategy
|
| 111 |
+
# - supabase free tier for db + blob storage of transcripts
|
| 112 |
+
# - hf space to host model computations (langchain bits need to run here)
|
| 113 |
+
# - replit or supabase to host edge functions to call hf space
|
| 114 |
+
# TODO gradio session state to track recently asked questions
|
| 115 |
+
|
| 116 |
+
import json
|
| 117 |
+
|
| 118 |
+
import gradio as gr
|
| 119 |
+
|
| 120 |
+
from langchain.text_splitter import SpacyTextSplitter
|
| 121 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
| 122 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 123 |
+
|
| 124 |
+
# Instantiate the embedding wrapper once: each HuggingFaceEmbeddings()
# constructor call loads the underlying sentence-transformers model, so the
# original's two separate constructions loaded it twice.
_hf_embeddings = HuggingFaceEmbeddings()
embedder = _hf_embeddings.embed_documents
model_name = _hf_embeddings.model_name
|
| 126 |
+
|
| 127 |
+
from supabase import create_client, Client
|
| 128 |
+
|
| 129 |
+
import os  # was missing at file level: os.environ is used right below

# Configuration comes from environment variables (e.g. HF Space secrets).
# Each value is None when the corresponding variable is unset, so the
# annotations are Optional[str] rather than str.
PINECONE_APIKEY: Optional[str] = os.environ.get("PINECONE_APIKEY")
SUPABASE_URL: Optional[str] = os.environ.get("SUPABASE_URL")
SUPABASE_KEY: Optional[str] = os.environ.get("SUPABASE_KEY")

# Module-level clients shared by the handlers below.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
# NOTE(review): this rebinds the name `pinecone` from the imported module to
# our Pinecone instance; the class imports the module locally so it still
# works, but consider renaming to avoid the shadowing.
pinecone = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
|
| 135 |
+
|
| 136 |
+
def transcript2chunks(transcript):
    """Split a transcript string into chunks via spaCy sentence splitting."""
    splitter = SpacyTextSplitter()
    return splitter.split_text(transcript)
|
| 138 |
+
|
| 139 |
+
def ingest_transcript(transcript, video_id=None):
    """Chunk a transcript and upsert the chunks into Pinecone.

    Args:
        transcript: Full transcript text.
        video_id: YouTube video id attached to each chunk's metadata.
            Added as an optional parameter: the original body referenced a
            global `video_id` that is defined nowhere in this file, so every
            call raised NameError.
    """
    p = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
    chunks = transcript2chunks(transcript)
    p.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))
|
| 143 |
+
|
| 144 |
+
def already_ingested(yt_video_id: str):
    """Return True when this video id already has a row in Supabase."""
    # count="estimated" asks Supabase for an approximate row count, which is
    # enough for a boolean existence check.
    result = (
        supabase.table("ingested_youtube_videos")
        .select("*", count="estimated")
        .eq('video_id', yt_video_id)
        .execute()
    )
    return result.count > 0
|
| 147 |
+
|
| 148 |
+
def yt2transcript(video_id):
    """Fetch a YouTube video's transcript and join it into one string."""
    print(f"\n\nid: {video_id}")
    # Each entry looks like
    # {'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}
    entries = YouTubeTranscriptApi.get_transcript(video_id)
    # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
    return ' '.join(entry['text'] for entry in entries)
|
| 155 |
+
|
| 156 |
+
def ingest_video(video_id):
    """Ingest one YouTube video: embed its transcript chunks and record it.

    Returns "dupe" when the video was ingested before, otherwise a short
    status string with the number of chunks upserted.
    """
    if already_ingested(video_id):
        return "dupe"

    transcript = yt2transcript(video_id)
    chunks = transcript2chunks(transcript)
    pinecone.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))

    # Record the video so later calls can detect duplicates.
    insert_response = supabase.table("ingested_youtube_videos").insert({"video_id": video_id}).execute()
    response_payload = json.loads(insert_response.json())
    print(response_payload)
    new_row = response_payload['data'][0]

    # Transcript-level bookkeeping for the freshly ingested video.
    transcript_record = {
        'source_id': new_row['media_id'],
        'num_chunks': len(chunks),
        'embedding_model': str(embedder),
        'transcribed_by': 'youtube_transcript_api',
    }
    supabase.table("ingested_transcripts").insert(transcript_record).execute()
    return f"ingested {len(chunks)} chunks"
|
| 173 |
+
|
| 174 |
+
# this needn't be in hf space, as it will just call out to openai and the db
|
| 175 |
+
# but why not host it here since it's free vs replits 2 cents/day
|
| 176 |
+
def ask_question(question: str, instruction: str, curation_ids: List[str]):
|
| 177 |
+
# query vector db for topk chunks
|
| 178 |
+
topk = pinecone.similarity_search(question, k = 10)
|
| 179 |
+
|
| 180 |
+
# format prompt (textwrap to guarantee length?)
|
| 181 |
+
|
| 182 |
+
# query llm and return output and topk
|
| 183 |
+
pass
|
| 184 |
+
|
| 185 |
+
# this needn't be in hf space, as it will just interact with db's community and user Curations
|
| 186 |
+
# but why not host it here
|
| 187 |
+
# will be updating the pinecone metadata/namespace with the curation ids it's a part of
|
| 188 |
+
# - but that would tie the app to pinecone....
|
| 189 |
+
# - so maybe I do want to use elastic 8,5 since it can do both? need nontrivial specs
|
| 190 |
+
# - using metadata would let us assign multiple curation ids to each video, whereas namespace could only be 1
|
| 191 |
+
def organize_curations():
    """Group ingested videos into Curations (WIP; not implemented).

    NOTE(review): per the surrounding comments, this will likely tag vectors
    with curation ids via Pinecone metadata -- design still open.
    """
    pass
|
| 193 |
+
|
| 194 |
+
# Gradio UI: paste a YouTube video id, get an ingestion status string back.
demo = gr.Interface(fn=ingest_video, inputs=["text"], outputs=["text"])
#article="\n".join([f"- {k}: " + v.replace("\n"," ") for k,v in instructions.items()]))

# Guard the launch so the module can be imported (e.g. by tests or the
# Spaces runtime) without immediately starting a server; `python app.py`
# behaves exactly as before.
if __name__ == "__main__":
    demo.launch()

#ingest_video(video_id)
|