Spaces:
Runtime error
Runtime error
File size: 16,604 Bytes
421ff50 b71265c 669b468 b71265c 669b468 deffc44 9e89d52 669b468 deffc44 669b468 14c5535 68faea2 669b468 b71265c e433e22 b71265c 669b468 8179b31 669b468 b71265c 669b468 14c5535 b71265c 669b468 b71265c 669b468 b71265c 43ee39e b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c 669b468 b71265c c4ca511 b71265c 669b468 b71265c 669b468 b71265c ef2acaf b71265c ef2acaf b71265c ef2acaf b71265c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 |
# prioritized todos
# supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
# - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
# - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
# TODO add discord/github/google auth...via custom js? see supabase docs
# - make users maintainers of their own curations, restrict add perms, introduce edit/delete/clone perms
# - add stars to curations+users profile -> display starred curations first, then sort by most popular
# - securely store user's openai key in supabase for convenience
# TODO better ai arch
# - eg let user customize instr (via langchain's jinja support?)
# - better: make easy to experiment with langchain's chains/agents
# - maybe something like model_laboratory with gradio's Parallel block?
# - account for mpnet's limit of 384 word pieces per chunk (is it done already?)
# - - more deliberate chunking strat in general
# TODO summarize a vid (and optionally add to curation)
# TODO support yt playlists and yt channels in addition to just one-off videos
# - can i make this really easy to add via a well designed api?
# unprioritized todos
# TODO some inline todos below that should reduce need to reset/rollback DBs
# - how to easily rollback bad data?
# TODO harrison thinks editing vectorDB abstraction to consume Embedding class vs func is a good approach -> need to PR this
# TODO can i generalize the query filter approach (add to langchain?) to remove coupling to pinecone?
# - i believe elastic8.5 supports rdb and vdb, but need nontrivial specs to run it i think
# TODO user prefs data model (their curations)
# TODO finalize deployment strategy
# - supabase free tier for db + blob storage of transcripts
# - hf space to host model computations (langchain bits need to run here)
# - replit or supabase to host edge functions to call hf space
# TODO gradio global state to track recently asked questions from everyone
# TODO create pinecone index without indexing text metadata field for performance: https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
# TODO could use pinecone namespace per embedding model
# TODO deploy txtai to fly.io free tier? not sure compute reqs
# - or haystack?
# - both these come with many features out of the box
import os
import json
import uuid
import openai
import spacy
import en_core_web_sm
import gradio as gr
from gradio import blocks
import pinecone
from supabase import create_client, Client
from langchain.vectorstores import Pinecone
from langchain.text_splitter import SpacyTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from typing import Any, Callable, Dict, Iterable, List, Optional
# Shared embedding model: instantiate ONCE (the original built two
# HuggingFaceEmbeddings instances, loading the sentence-transformer twice),
# then expose the query-embedding callable and the model name separately.
_hf_embeddings = HuggingFaceEmbeddings()
Embedder = _hf_embeddings.embed_query
Model_name = _hf_embeddings.model_name

# External service credentials come from the environment (HF Space secrets).
PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
pinecone.init(
    api_key=PINECONE_APIKEY,
    environment='us-west1-gcp'  # only environment available on the free tier
)
class MyPinecone(Pinecone):
    """Pinecone vectorstore whose ``add_texts`` embeds via a plain callable.

    Overrides langchain's ``add_texts`` so the stored ``self._embedding_function``
    (a bare ``Callable[[str], list]``) is invoked per text (see the TODO at the
    top of the file about upstreaming an Embedding-class-vs-func change).
    """

    def add_texts(
        self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        docs = []
        ids = []
        for i, text in enumerate(texts):
            # renamed from `id`, which shadowed the builtin
            vector_id = str(uuid.uuid4())
            embedding = self._embedding_function(text)
            # copy so we don't mutate the caller's metadata dicts in place
            metadata = dict(metadatas[i]) if metadatas else {}
            metadata[self._text_key] = text
            docs.append((vector_id, embedding, metadata))
            ids.append(vector_id)
        # single batched upsert of all (id, vector, metadata) tuples
        self._index.upsert(vectors=docs)
        return ids
# Handle to the pre-existing Pinecone index and our vectorstore wrapper over it.
# NOTE(review): assumes the 'semantic-curations' index was created out-of-band — confirm.
Pinecone_index = pinecone.Index('semantic-curations')
# "text" is the metadata key under which each chunk's raw text is stored.
Vdb = MyPinecone(Pinecone_index, Embedder, "text")
def supa_all(supa_data) -> List[dict]:
    """Return the list of row dicts from a supabase query response."""
    return json.loads(supa_data.json())['data']
def transcript2chunks(transcript):
    """Split a full transcript string into sentence-aware chunks via spaCy."""
    print("starting transcript2chunks")
    # TODO what's a good chunk_size?
    # TODO should store as metadata in dbs
    splitter = SpacyTextSplitter(chunk_size=2000)
    chunks = splitter.split_text(transcript)
    print("finished chunking")
    return chunks
def video_id_to_media_id(video_id: str) -> Optional[str]:
    """Look up the media_id recorded for an ingested youtube video, or None."""
    response = supabase.table('ingested_youtube_videos').select('media_id').eq('video_id', video_id).execute()
    rows = supa_all(response)
    print(rows)
    # exactly one row means the video was ingested; anything else -> unknown
    return rows[0]['media_id'] if len(rows) == 1 else None
# returns curation_ids that already have the video_id
def check_curations_with_video(video_id: str) -> List[str]:
    """List the curation_ids that already contain this youtube video."""
    media_id = video_id_to_media_id(video_id)
    print(f"media_id {media_id}")
    if media_id is None:
        # video never ingested, so no curation can contain it
        return []
    rows = supa_all(supabase.table("junction_curations").select("curation_id").eq('media_id', media_id).execute())
    return [row['curation_id'] for row in rows]
def yt2transcript(video_id):
    """Fetch the youtube transcript for *video_id* and flatten it to one string."""
    print(f"\n\nstarting yt2transcript on id: {video_id}")
    # fragments look like [{'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}, ...]
    fragments = YouTubeTranscriptApi.get_transcript(video_id)
    print("got transcript")
    # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
    # TODO ought to store timestamp of chunks in metadata for better Sources.
    # - instead of splitting transcript into chunks, can i merge these fragments into approp size? langchain has merge func
    return ' '.join(fragment['text'] for fragment in fragments)
def yt_id2name(video_id: str) -> str:
    """Resolve a youtube video id to its title via pytube."""
    url = f"https://www.youtube.com/watch?v={video_id}"
    return YouTube(url).title
# db guarantees name is unique across rows
def curation_name2id() -> dict:
    """Map each curation name to its curation_id (names are unique in the db)."""
    rows = supa_all(supabase.table("curations_metadata").select("curation_id, name").execute())
    return {row['name']: row['curation_id'] for row in rows}
def get_curation_names():
    """List all curation names currently in the database."""
    # iterating a dict yields its keys
    return list(curation_name2id())
def get_curations_and_videos():
    """Map curation name -> list of the video names it contains."""
    rows = supa_all(supabase.table("curations_metadata").select("curation_id, name, media_id:ingested_youtube_videos ( video_name )").execute())
    by_curation = {}
    for row in rows:
        # 'media_id' is the joined list of ingested videos for this curation
        for medium in row['media_id']:
            by_curation.setdefault(row['name'], []).append(medium['video_name'])
    return by_curation
def gen_curation_md():
    """Render every curation and its videos as a markdown listing."""
    sections = []
    for name, video_names in get_curations_and_videos().items():
        # markdown auto-numbers repeated "1." list items
        sections.append(f"\n## {name}\n" + "1. " + "\n1. ".join(video_names))
    return "".join(sections)
def ingest_video(video_id: str, selected_curation_names: List[str], new_curation: str = ""):
    """Ingest a youtube video into one or more curations (gradio click handler).

    Args:
        video_id: youtube video id (not the full url); whitespace is stripped.
        selected_curation_names: names of existing curations to add the video to.
        new_curation: optional name of a curation to create and also add to.

    Returns:
        A 4-tuple matching the gradio outputs: (status markdown, update for the
        "ask" checkbox group, update for the "add" checkbox group, update for
        the curations markdown). Error paths return no-op gr.update()s.
    """
    video_id = video_id.strip()
    # create the requested new curation first so it participates below
    if new_curation:
        curcur = curation_name2id()
        if new_curation in curcur.keys():
            return "dupe curation name", gr.update(), gr.update(), gr.update()
        # add to db here, which will autogen the id
        supabase.table("curations_metadata").insert({"name": new_curation}).execute()
        selected_curation_names.append(new_curation)
    if not selected_curation_names: # contains new_curation at this point
        return "need >=1 curations", gr.update(), gr.update(), gr.update()
    # resolve selected names to ids, then work out which curations still lack the video
    cur_dict = curation_name2id()
    selected_curation_ids = [cur_dict[n] for n in selected_curation_names]
    existing_curations_with_video = check_curations_with_video(video_id)
    curations_to_add_video_to = list(set(selected_curation_ids).difference(set(existing_curations_with_video)))
    # the full membership set the video should end up with after this call
    goal_curations_with_video = existing_curations_with_video + curations_to_add_video_to
    if not curations_to_add_video_to: # video already in all selected curations
        return "dupe video", gr.update(), gr.update(), gr.update()
    if len(existing_curations_with_video) == 0: # no curations have the video, we need to add it to vector db
        assert(goal_curations_with_video == curations_to_add_video_to) # this should be true in this case
        print("new video, processing\n")
        try:
            video_name = yt_id2name(video_id)
        except Exception as e:
            # TODO undo new_curation create supabase.table("curations_metadata").insert({"name": new_curation}).execute()
            # - in all try/catches. maybe have upper try/catch to do this in one place. extract
            return f"Error loading video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()
        try:
            transcript = yt2transcript(video_id)
        except Exception as e:
            return f"Error fetching transcripts for video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()
        chunks = transcript2chunks(transcript)
        # one metadata dict per chunk; 'curation_ids' drives the query-time pinecone filter
        metadatas = [{'video_id': video_id, 'video_name': video_name, 'curation_ids': goal_curations_with_video} for c in chunks] # *len() was buggy?
        #import pprint
        #for i, c in enumerate(chunks):
        #    print(f"{i}: {c}")
        #print(metadata)
        print("embedding & uploading to vector db TODO how to get progress from langchain?\n")
        # TODO consider storing chunk text in supabase - maybe get more storage out of pinecone's s1 if supabase's free tier is sufficient
        chunk_ids = Vdb.add_texts(chunks, metadatas)
        print("bookkeeping supabase with new video\n")
        # record the video itself (db autogenerates media_id)...
        inserted_row = supa_all(supabase.table("ingested_youtube_videos").insert({"video_id": video_id,
                                                                                  "video_name": video_name}).execute())[0]
        # ...its transcript provenance...
        data = supabase.table("ingested_transcripts").insert({'source_id': inserted_row['media_id'],
                                                              'num_chunks': len(chunks),
                                                              'embedding_model': str(Model_name),
                                                              'transcribed_by': 'youtube_transcript_api'}).execute()
        print("\t- transcripts\n")
        # ...its curation memberships...
        data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': inserted_row['media_id']} for c in goal_curations_with_video]).execute()
        print("\t- curations\n")
        # ...and the pinecone vector ids that belong to it
        data = supabase.table('junction_vectors').insert( [{'chunk_id': c, 'media_id': inserted_row['media_id']} for c in chunk_ids ]).execute()
        print("\t- vectors\n")
    else: # some curations already have video, so no need to chunk+embed+insert into vector db. just adjust bookkeeping in vector db + supa
        print("video already in vector db, updating metadata to include selected curations\n")
        # get media_id of given video
        media_id = video_id_to_media_id(video_id)
        # get chunk_ids for the video
        chunk_rows = supa_all(supabase.table("junction_vectors").select("chunk_id").eq('media_id', media_id).execute())
        # then update metadata of both supabase and vectorDB to include new curations
        for r in chunk_rows:
            update_response = Pinecone_index.update(
                id=r['chunk_id'],
                set_metadata={'curation_ids': goal_curations_with_video}
            )
            # TODO error check update_response
        data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': media_id} for c in curations_to_add_video_to]).execute()
        #curation_ids = [cur_dict[name] for name in curations_to_add_video_to]
    status = "Status: Done! Video added, thanks for contributing :D"
    return status, gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())
def query_llm(prompt):
    """Send a completion prompt to OpenAI and return the stripped answer text.

    Deterministic settings (temperature=0) so repeated questions get stable
    answers. Relies on ``openai.api_key`` having been set by the caller.
    """
    response = openai.Completion.create(
        prompt=prompt,
        temperature=0,        # deterministic output
        max_tokens=400,       # cap answer length
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        #stop=stop_sequence,
        model='text-davinci-003',  # plain string; old f-string had no placeholders
    )
    return response["choices"][0]["text"].strip()
# this needn't be in hf space, as it will just call out to openai and the db
# but why not host it here since it's free vs replits 2 cents/day
def ask_question(question: str, openai_apikey: str, curation_names: List[str]):
    """Answer a question from the top-k transcript chunks of the chosen curations.

    Args:
        question: the user's natural-language question.
        openai_apikey: per-user OpenAI key (set on the openai module per call).
        curation_names: curations whose chunks may be retrieved.

    Returns:
        (answer_markdown, sources_markdown); on missing inputs returns an
        error string and "".
    """
    if not question or not openai_apikey or not curation_names:
        return "error: need all inputs", ""
    openai.api_key = openai_apikey
    # query vector db for topk chunks
    # can't use langchain bc we are using pinecone metadata filtering
    q_embedding = Embedder(question)
    curations_dict = curation_name2id()
    curation_ids = [curations_dict[name] for name in curation_names]
    results = Pinecone_index.query(vector=q_embedding, filter={'curation_ids': {"$in": curation_ids}}, top_k=5, include_metadata=True)
    # TODO add filters to langchain's pinecone impl?
    sources = {}   # video_name -> chunks from that video
    chunks = []    # all matched chunks in relevance order, for the prompt
    for match in results['matches']:
        chunk = match['metadata']['text']
        chunks.append(chunk)
        sources.setdefault(match['metadata']['video_name'], []).append(chunk)
    # fixed: the per-source loop variable no longer shadows the outer `chunks`
    sources_md = "## Sources\n" + "\n\n".join(
        f"### {name}\n" + "\n\n---\n\n".join(source_chunks)
        for name, source_chunks in sources.items()
    )
    # format prompt (textwrap to guarantee length?)
    instr = "Answer the question based on the context below, and if the question can't be answered based on the context, say 'I don't know'.\n\nContext:\n- "
    prompt = instr + "\n- ".join(chunks) + f"\n\nQuestion: {question}\n\nAnswer:"
    try:
        answer = "## Answer\n" + query_llm(prompt)
    except Exception as e:
        answer = f"Error: {e}"
    # query llm and return output and topk
    return answer, sources_md
# Gradio UI: three tabs sharing one curation list, plus a manual sync button
# so users can pick up each other's changes without reloading the app.
with gr.Blocks() as demo:
    # snapshot of curation names at app start; refreshed via the buttons below
    curations_from_db = get_curation_names()
    refresh_button = gr.Button("Synchronize data (with other user's changes)")
    with gr.Tab("Ask a question"):
        q = gr.Textbox(label="Your question")
        openai_apikey = gr.Textbox(label="OpenAI API Key", type="password")
        curation_names_1 = gr.CheckboxGroup(choices=curations_from_db, label="Curations to query")
        button = gr.Button("Submit")
        answer = gr.Markdown(value="")
        sources = gr.Markdown(value="")
        button.click(ask_question, inputs=[q, openai_apikey, curation_names_1], outputs=[answer, sources])
    with gr.Tab("Browse & Organize Curations"):
        def refresh_curation_accordion():
            # rebuild the curations markdown from the db
            output = gen_curation_md()
            return gr.update(value=output)
        #md.change(fn=refresh_curation_accordion, inputs=[curation_names_1], outputs=[md])
        # for name,id in curation_name2id().items():
        #     print(id,name,rows)
        #     accordions_state[name] = {'gr_obj': gr.Accordion(name), 'rows': []}
        #     with accordions_state[name]['gr_obj']:
        #         for i,medium in enumerate(row_d[id]):
        #             accordions_state[name]['rows'].append(gr.Row(variant='compact'))
        #             with accordions_state[name]['rows'][i]:
        #                 gr.Markdown(medium['video_name'])
        #delete_button = gr.Button("Delete from Curation")
        #delete_button.click(...)
        #refresh_button = gr.Button("Refresh curations")
        md = gr.Markdown(gen_curation_md())
        #refresh_button.click(fn=refresh_curation_accordion, inputs=[], outputs=[md])
    with gr.Tab("Add data to Curations"):
        gr.Markdown("An hour's worth of video seems to take about a minute to upload (ymmv).")
        video_id = gr.Textbox(label="Youtube video id (NOT full url)", placeholder="lvh3g7eszVQ")
        curation_names_2 = gr.CheckboxGroup(choices=curations_from_db, label="Add to existing Curations")
        new_curation = gr.Textbox(label="and/or add to new Curation")
        button = gr.Button("Submit")
        status_field = gr.Markdown()
        # ingest_video also refreshes both checkbox groups and the browse markdown
        submit_click = button.click(ingest_video, inputs=[video_id, curation_names_2, new_curation], outputs=[status_field, curation_names_1, curation_names_2, md])
        # TODO need to undo rdb and vdb state if cancel clicked
        #cancel_button = gr.Button("Cancel", cancels=[submit_click])
    def refresh_all_curation_lists():
        # push fresh choices into both checkbox groups and rebuild the markdown
        return gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())
    refresh_button.click(fn=refresh_all_curation_lists, inputs=[], outputs=[curation_names_1, curation_names_2, md])
demo.launch()