marcgreen committed on
Commit
b71265c
·
1 Parent(s): bba49a3
Files changed (1)
  1. app.py +284 -149
app.py CHANGED
@@ -1,199 +1,334 @@
1
- """Interface for Pinecone vector stores."""
2
- import uuid
3
- import pinecone
4
- from abc import ABC, abstractmethod
5
- from typing import Any, Callable, Dict, Iterable, List, Optional
6
-
7
- from langchain.docstore.document import Document
8
- from langchain.embeddings.base import Embeddings
9
- from langchain.vectorstores.base import VectorStore
10
-
11
-
12
- class Pinecone(VectorStore):
13
- """Interface for vector stores."""
14
-
15
- def _query():
16
- pass
17
-
18
- def __init__(
19
- self, api_key: str, index_name: str, embedding_function: Callable
20
- ):
21
- """Initialize with necessary components."""
22
- try:
23
- import pinecone
24
- except ImportError:
25
- raise ValueError(
26
- "Could not import pinecone python package. "
27
- "Please install it with `pip install pinecone-client`."
28
- )
29
- self.embedding_function = embedding_function
30
- self.index_name = index_name
31
- #try:
32
- pinecone.init(
33
- api_key=api_key,
34
- environment='us-west1-gcp' # only option for for free tier
35
- )
36
- #except ValueError as e:
37
- # raise ValueError(
38
- # f"Your elasticsearch client string is misformatted. Got error: {e} "
39
- # )
40
- self.client = pinecone
41
-
42
- def add_texts(
43
- self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
44
- ) -> None:
45
- """Run more texts through the embeddings and add to the vectorstore."""
46
- index = self.client.Index(self.index_name)
47
- batch_size = 16 # recommended limit is 100 vectors
48
- for i in range(0, len(texts), batch_size):
49
- i_end = min(i+batch_size, len(texts))
50
- text_batch = texts[i:i_end]
51
- metadata_batch = metadatas[i:i_end] if metadatas else [{}] * (i_end-i)
52
- embedding_batch = self.embedding_function(text_batch) # [[0] * 768] * (i_end - i) #
53
- to_upsert = [
54
- (
55
- str(uuid.uuid4()), # id that we currently don't care about
56
- embedding.tolist(),
57
- dict(
58
- {"text": text},
59
- **metadata # if 'text' in here too, it takes precendence
60
- )
61
- ) for text, embedding, metadata in zip(text_batch, embedding_batch, metadata_batch)
62
- ]
63
- index.upsert(vectors=to_upsert)
64
-
65
- def similarity_search(self, query: str, k: int = 5) -> List[Document]:
66
- """Return docs most similar to query."""
67
- index = self.client.Index(self.index_name)
68
- matches = index.query(
69
- #namespace="example-namespace",
70
- top_k=k,
71
- include_values=True,
72
- include_metadata=True,
73
- vector=query,
74
- #filter={
75
- # "genre": {"$in": ["comedy", "documentary", "drama"]}
76
- #}
77
- )
78
- documents = [
79
- Document(page_content=match["metadata"]["text"], metadata=match["metadata"]) for match in matches
80
- ]
81
- return documents
82
-
83
- @classmethod
84
- def from_texts(
85
- cls,
86
- texts: List[str],
87
- embedding: Embeddings,
88
- metadatas: Optional[List[dict]] = None,
89
- **kwargs: Any
90
- ) -> "VectorStore":
91
- """Return VectorStore initialized from texts and embeddings."""
92
-
93
- # TODO fill out other 2 methods for Pinecone Vectore Store and ask if harrison would be open to a PR
94
  # TODO account for mpnet's limit of 384 word pieces per chunk (is it done already?)
95
- # DONE need to check if embeddings exist for given video id before generating embeddings
96
  # supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
97
  # - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
98
  # - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
99
  # TODO user prefs data model (their curations)
100
  # - meh not needed at first
101
- # DONE design main workflows
102
- # DONE curation data model
103
- # TODO frontend (discord bot or gradio or)
104
- # - i also want to be able to give it a yt vid and have it summarize it for me
105
- # TODO workflow for curating videos into sets (aka Curations)
106
- # TODO workflow to ask Curations a question
107
- # - LEFT OFF here
108
  # TODO support yt playlists in addition to just one-off videos
109
  # - can i make this really easy to add via a well designed api?
110
  # TODO finalize deployment strategy
111
  # - supabase free tier for db + blob storage of transcripts
112
  # - hf space to host model computations (langchain bits need to run here)
113
  # - replit or supabase to host edge functions to call hf space
114
- # TODO gradio session state to track recently asked questions
115
 
116
  import os
117
  import json
118
 
119
  import gradio as gr
120
 
121
  from langchain.text_splitter import SpacyTextSplitter
122
  from langchain.embeddings import HuggingFaceEmbeddings
123
  from youtube_transcript_api import YouTubeTranscriptApi
124
 
125
- embedder = HuggingFaceEmbeddings().embed_documents
126
- model_name = HuggingFaceEmbeddings().model_name
127
-
128
- from supabase import create_client, Client
129
 
130
  PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
131
  SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
132
  SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")
133
 
134
  supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
135
- pinecone = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
136
 
137
- def transcript2chunks(transcript):
138
- return SpacyTextSplitter().split_text(transcript)
139
 
140
- def ingest_transcript(transcript):
141
- p = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
142
- chunks = transcript2chunks(transcript)
143
- p.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))
144
 
145
- def already_ingested(yt_video_id: str):
146
- data = supabase.table("ingested_youtube_videos").select("*", count="estimated").eq('video_id', yt_video_id).execute()
147
- return data.count > 0
148
 
149
  def yt2transcript(video_id):
150
- print(f"\n\nid: {video_id}")
151
  # data looks like [{'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}, ...]
152
  data = YouTubeTranscriptApi.get_transcript(video_id)
153
  transcript = ' '.join([x['text'] for x in data])
154
  # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
155
  return transcript
156
 
157
- def ingest_video(video_id):
158
- if already_ingested(video_id):
159
- return "dupe"
160
- else:
161
- transcript = yt2transcript(video_id)
162
  chunks = transcript2chunks(transcript)
163
- pinecone.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))
164
- inserted_row_data = supabase.table("ingested_youtube_videos").insert({"video_id": video_id}).execute()
165
- inserted_row_json = json.loads(inserted_row_data.json())
166
- print(inserted_row_json)
167
- inserted_row = inserted_row_json['data'][0]
168
-
169
  data = supabase.table("ingested_transcripts").insert({'source_id': inserted_row['media_id'],
170
  'num_chunks': len(chunks),
171
- 'embedding_model': str(embedder),
172
  'transcribed_by': 'youtube_transcript_api'}).execute()
173
- return f"ingested {len(chunks)} chunks"
174
 
175
  # this needn't be in hf space, as it will just call out to openai and the db
176
  # but why not host it here since it's free vs replits 2 cents/day
177
- def ask_question(question: str, instruction: str, curation_ids: List[str]):
178
  # query vector db for topk chunks
179
- topk = pinecone.similarity_search(question, k = 10)
180
-
181
  # format prompt (textwrap to guarantee length?)
182
 
183
  # query llm and return output and topk
184
- pass
185
-
186
- # this needn't be in hf space, as it will just interact with db's community and user Curations
187
- # but why not host it here
188
- # will be updating the pinecone metadata/namespace with teh curation ids it's a part of
189
- # - but that would tie the app to pinecone....
190
- # - so maybe I do want to use elastic 8,5 since it can do both? need nontrivial specs
191
- # - using metadata would let us assign multiple curation ids to each video, whereas namespace could only be 1
192
- def organize_curations():
193
- pass
194
-
195
- demo = gr.Interface(fn=ingest_video, inputs=["text"], outputs=["text"])
196
- #article="\n".join([f"- {k}: " + v.replace("\n"," ") for k,v in instructions.items()]))
197
- demo.launch()
198
-
199
- #ingest_video(video_id)
1
+ # TODO some inline todos below that should reduce need to reset/rollback DBs
2
+ # - how to easily rollback bad data?
3
+ # TODO harrison thinks editing vectorDB abstraction to consume Embedding class vs func is a good approach -> need to PR this
4
+ # TODO can i generalize the query filter approach (add to langchain?) to remove coupling to pinecone?
5
+ # - i believe elastic8.5 supports rdb and vdb, but need nontrivial specs to run it i think
6
  # TODO account for mpnet's limit of 384 word pieces per chunk (is it done already?)
7
  # supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
8
  # - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
9
  # - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
10
  # TODO user prefs data model (their curations)
11
  # - meh not needed at first
12
+ # TODO summarize a vid (and optionally add to curation)
13
  # TODO support yt playlists in addition to just one-off videos
14
  # - can i make this really easy to add via a well designed api?
15
  # TODO finalize deployment strategy
16
  # - supabase free tier for db + blob storage of transcripts
17
  # - hf space to host model computations (langchain bits need to run here)
18
  # - replit or supabase to host edge functions to call hf space
19
+ # TODO gradio global state to track recently asked questions from everyone
20
+ # TODO add discord/github/google auth...via custom js? see supabase docs
21
+ # - make users maintainers of their own curations, restrict add perms, introduce edit/delete/clone perms
22
+ # - add stars to curations+users profile -> display starred curations first, then sort by most popular
23
+ # - securely store user's openai key in supabase for convenience
24
+ # TODO create pinecone index without indexing text metadata field for performance: https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
25
+ # TODO could use pinecone namespace per embedding model
26
+ # TODO let user customize instr (via langchain's jinja support?)
27
+ # - better: make easy to experiment with langchain's chains/agents
28
+ # - maybe something like model_laboratory with gradio's Parallel block?
29
 
30
  import os
31
  import json
32
 
33
  import gradio as gr
34
+ from gradio import blocks
35
+
36
+ from supabase import create_client, Client
37
 
38
  from langchain.text_splitter import SpacyTextSplitter
39
  from langchain.embeddings import HuggingFaceEmbeddings
40
+
41
+ from pytube import YouTube
42
  from youtube_transcript_api import YouTubeTranscriptApi
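+
+ # additional imports assumed by the code below (pinecone.init/Index, uuid.uuid4,
+ # openai.Completion, the typing hints, and the langchain Pinecone vectorstore that
+ # MyPinecone subclasses -- module path assumed):
+ import uuid
+ import openai
+ import pinecone
+ from typing import Iterable, List, Optional
+ from langchain.vectorstores import Pinecone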
43
 
44
+ Embedder = HuggingFaceEmbeddings().embed_query
45
+ Model_name = HuggingFaceEmbeddings().model_name
46
 
47
  PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
48
  SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
49
  SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")
50
 
51
  supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
52
+ pinecone.init(
53
+ api_key=PINECONE_APIKEY,
54
+ environment='us-west1-gcp' # only option for free tier
55
+ )
56
 
57
+ class MyPinecone(Pinecone):
58
+ def add_texts(
59
+ self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
60
+ ) -> List[str]:
61
+ """Run more texts through the embeddings and add to the vectorstore.
62
+ Args:
63
+ texts: Iterable of strings to add to the vectorstore.
64
+ metadatas: Optional list of metadatas associated with the texts.
65
+ Returns:
66
+ List of ids from adding the texts into the vectorstore.
67
+ """
68
+ # Embed and create the documents
69
+ docs = []
70
+ ids = []
71
+ for i, text in enumerate(texts):
72
+ id = str(uuid.uuid4())
73
+ embedding = self._embedding_function(text).tolist()
74
+ metadata = metadatas[i] if metadatas else {}
75
+ metadata[self._text_key] = text
76
+ docs.append((id, embedding, metadata))
77
+ ids.append(id)
78
+ # upsert to Pinecone
79
+ self._index.upsert(vectors=docs)
80
+ return ids
81
 
82
+ Pinecone_index = pinecone.Index('semantic-curations')
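+ # MyPinecone wraps this index with the query embedder; each chunk's raw text is stored under the "text" metadata key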
83
+ Vdb = MyPinecone(Pinecone_index, Embedder, "text")
84
 
85
+ def supa_all(supa_data) -> List[dict]:
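+ # unwrap a supabase-py response into its plain list of row dicts (via the .json() payload)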
86
+ datajson = json.loads(supa_data.json())
87
+ return datajson['data']
88
+
89
+ def transcript2chunks(transcript):
90
+ print("starting transcript2chunks")
91
+ # TODO what's a good chunk_size?
92
+ # TODO should store as metadata in dbs
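+ # note: chunk_size here counts characters (langchain's default length_function is len), not tokens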
93
+ r = SpacyTextSplitter(chunk_size = 2000).split_text(transcript)
94
+ print("finished chunking")
95
+ return r
96
+
97
+ def video_id_to_media_id(video_id: str) -> Optional[str]:
98
+ rows = supa_all(supabase.table('ingested_youtube_videos').select('media_id').eq('video_id', video_id).execute())
99
+ print(rows)
100
+ if len(rows) == 1:
101
+ return rows[0]['media_id']
102
+ else:
103
+ return None
104
+
105
+ # returns curation_ids that already have the video_id
106
+ def check_curations_with_video(video_id: str) -> List[str]:
107
+ media_id = video_id_to_media_id(video_id)
108
+ print(f"media_id {media_id}")
109
+ if media_id is None:
110
+ return []
111
+ data = supa_all(supabase.table("junction_curations").select("curation_id").eq('media_id', media_id).execute())
112
+ in_curations = [r['curation_id'] for r in data]
113
+ return in_curations
114
 
115
  def yt2transcript(video_id):
116
+ print(f"\n\nstarting yt2transcript on id: {video_id}")
117
  # data looks like [{'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}, ...]
118
  data = YouTubeTranscriptApi.get_transcript(video_id)
119
  transcript = ' '.join([x['text'] for x in data])
120
+ print("got transcript")
121
  # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
122
+ # TODO ought to store timestamp of chunks in metadata for better Sources.
123
+ # - instead of splitting transcript into chunks, can i merge these fragments into approp size? langchain has merge func
124
  return transcript
125
 
126
+ def yt_id2name(video_id: str) -> str:
127
+ video = YouTube(f"https://www.youtube.com/watch?v={video_id}")
128
+ return video.title
129
+
130
+ # db guarantees name is unique across rows
131
+ def curation_name2id() -> dict:
132
+ rows = supa_all(supabase.table("curations_metadata").select("curation_id, name").execute())
133
+ c = {}
134
+ for r in rows:
135
+ c[r['name']] = r['curation_id']
136
+ return c
137
+
138
+ def get_curation_names():
139
+ d = curation_name2id()
140
+ return list(d.keys())
141
+
142
+ def get_curations_and_videos():
143
+ rows = supa_all(supabase.table("curations_metadata").select("curation_id, name, media_id:ingested_youtube_videos ( video_name )").execute())
144
+ row_d = {}
145
+ for r in rows:
146
+ for m in r['media_id']:
147
+ row_d.setdefault(r['name'], []).append(m['video_name'])
148
+ return row_d
149
+
150
+ def gen_curation_md():
151
+ output = ""
152
+ for curation_name,video_names in get_curations_and_videos().items():
153
+ output += f"\n## {curation_name}\n"
154
+ output += "1. " + "\n1. ".join(video_names)
155
+ return output
156
+
157
+ def ingest_video(video_id: str, selected_curation_names: List[str], new_curation: str = ""):
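+ # returns a status message plus gr.update()s for the two curation CheckboxGroups and the curation markdown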
158
+ video_id = video_id.strip()
159
+ if new_curation:
160
+ curcur = curation_name2id()
161
+ if new_curation in curcur.keys():
162
+ return "dupe curation name", gr.update(), gr.update(), gr.update()
163
+ # add to db here, which will autogen the id
164
+ supabase.table("curations_metadata").insert({"name": new_curation}).execute()
165
+ selected_curation_names.append(new_curation)
166
+ if not selected_curation_names: # contains new_curation at this point
167
+ return "need >=1 curations", gr.update(), gr.update(), gr.update()
168
+
169
+ cur_dict = curation_name2id()
170
+ selected_curation_ids = [cur_dict[n] for n in selected_curation_names]
171
+ existing_curations_with_video = check_curations_with_video(video_id)
172
+ curations_to_add_video_to = list(set(selected_curation_ids).difference(set(existing_curations_with_video)))
173
+ goal_curations_with_video = existing_curations_with_video + curations_to_add_video_to
174
+ if not curations_to_add_video_to: # video already in all selected curations
175
+ return "dupe video", gr.update(), gr.update(), gr.update()
176
+
177
+ if len(existing_curations_with_video) == 0: # no curations have the video, we need to add it to vector db
178
+ assert(goal_curations_with_video == curations_to_add_video_to) # this should be true in this case
179
+ print("new video, processing\n")
180
+
181
+ try:
182
+ video_name = yt_id2name(video_id)
183
+ except Exception as e:
184
+ # TODO undo new_curation create supabase.table("curations_metadata").insert({"name": new_curation}).execute()
185
+ # - in all try/catches. maybe have upper try/catch to do this in one place. extract
186
+ return f"Error loading video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()
187
+
188
+ try:
189
+ transcript = yt2transcript(video_id)
190
+ except Exception as e:
191
+ return f"Error fetching transcripts for video with id '{video_id}'. Exception: {e}", gr.update(), gr.update(), gr.update()
192
+
193
  chunks = transcript2chunks(transcript)
194
+ metadatas = [{'video_id': video_id, 'video_name': video_name, 'curation_ids': goal_curations_with_video} for c in chunks] # *len() was buggy?
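+ # a fresh dict per chunk matters here: [d] * len(chunks) repeats one shared dict object, which
+ # add_texts then mutates when it sets metadata['text'], so every chunk would end up with the
+ # same text field (likely the bug noted above)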
195
+
196
+ #import pprint
197
+ #for i, c in enumerate(chunks):
198
+ # print(f"{i}: {c}")
199
+ #print(metadata)
200
+ print("embedding & uploading to vector db TODO how to get progress from langchain?\n")
201
+
202
+ # TODO consider storing chunk text in supabase - maybe get more storage out of pinecone's s1 if supabase's free tier is sufficient
203
+ chunk_ids = Vdb.add_texts(chunks, metadatas)
204
+ print("bookkeeping supabase with new video\n")
205
+
206
+ inserted_row = supa_all(supabase.table("ingested_youtube_videos").insert({"video_id": video_id,
207
+ "video_name": video_name}).execute())[0]
208
  data = supabase.table("ingested_transcripts").insert({'source_id': inserted_row['media_id'],
209
  'num_chunks': len(chunks),
210
+ 'embedding_model': str(Model_name),
211
  'transcribed_by': 'youtube_transcript_api'}).execute()
212
+ print("\t- transcripts\n")
213
+ data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': inserted_row['media_id']} for c in goal_curations_with_video]).execute()
214
+ print("\t- curations\n")
215
+ data = supabase.table('junction_vectors').insert( [{'chunk_id': c, 'media_id': inserted_row['media_id']} for c in chunk_ids ]).execute()
216
+ print("\t- vectors\n")
217
+ else: # some curations already have video, so no need to chunk+embed+insert into vector db. just adjust bookkeeping in vector db + supa
218
+ print("video already in vector db, updating metadata to include selected curations\n")
219
+ # get media_id of given video
220
+ media_id = video_id_to_media_id(video_id)
221
+
222
+ # get chunk_ids for the video
223
+ chunk_rows = supa_all(supabase.table("junction_vectors").select("chunk_id").eq('media_id', media_id).execute())
224
+
225
+ # then update metadata of both supabase and vectorDB to include new curations
226
+ for r in chunk_rows:
227
+ update_response = Pinecone_index.update(
228
+ id=r['chunk_id'],
229
+ set_metadata={'curation_ids': goal_curations_with_video}
230
+ )
231
+ # TODO error check update_response
232
+ data = supabase.table('junction_curations').insert([{'curation_id': c, 'media_id': media_id} for c in curations_to_add_video_to]).execute()
233
+
234
+ #curation_ids = [cur_dict[name] for name in curations_to_add_video_to]
235
+
236
+ status = "Status: Done! Video added, thanks for contributing :D"
237
+ return status, gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())
238
+
239
+ def query_llm(prompt):
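+ # assumes openai.api_key has already been set (ask_question sets it from the user-supplied key)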
240
+ response = openai.Completion.create(
241
+ prompt=prompt,
242
+ temperature=0,
243
+ max_tokens=400,
244
+ top_p=1,
245
+ frequency_penalty=0,
246
+ presence_penalty=0,
247
+ #stop=stop_sequence,
248
+ model='text-davinci-003'
249
+ )
250
+ #print(response)
251
+ return response["choices"][0]["text"].strip()
252
+
253
 
254
  # this needn't be in hf space, as it will just call out to openai and the db
255
  # but why not host it here since it's free vs replits 2 cents/day
256
+ def ask_question(question: str, openai_apikey: str, curation_names: List[str]):
257
+ if not question or not openai_apikey or not curation_names:
258
+ return "error: need all inputs", ""
259
+ openai.api_key = openai_apikey
260
  # query vector db for topk chunks
261
+ # can't use langchain bc we are using pinecone metadata filtering
262
+ q_embedding = Embedder(question).tolist()
263
+ curations_dict = curation_name2id()
264
+ curation_ids = [curations_dict[name] for name in curation_names]
265
+ results = Pinecone_index.query(vector=q_embedding, filter={'curation_ids': {"$in": curation_ids}}, top_k=5, include_metadata=True)
266
+ #pprint.pprint(results)
267
+ # TODO add filters to langchain's pinecone impl?
268
+ sources = {}
269
+ chunks = []
270
+ for r in results['matches']:
271
+ chunk = r['metadata']['text']
272
+ chunks.append(chunk)
273
+ video_name = r['metadata']['video_name']
274
+ sources.setdefault(video_name, []).append(chunk)
275
+ sources_md = "## Sources\n" + "\n\n".join([f"### {name}\n" + "\n\n---\n\n".join([f'{c}' for c in chunks]) for name, chunks in sources.items()])
276
  # format prompt (textwrap to guarantee length?)
277
+ instr = "Answer the question based on the context below, and if the question can't be answered based on the context, say 'I don't know'.\n\nContext:\n- "
278
+ prompt = instr + "\n- ".join(chunks) + f"\n\nQuestion: {question}\n\nAnswer:"
279
+ #pprint.pprint(prompt)
280
+
281
+ try:
282
+ answer = "## Answer\n" + query_llm(prompt)
283
+ except Exception as e:
284
+ answer = f"Error: {e}"
285
 
286
  # query llm and return output and topk
287
+ return answer, sources_md
288
+
289
+ with gr.Blocks() as demo:
290
+ curations_from_db = get_curation_names()
291
+ refresh_button = gr.Button("Synchronize data (with other users' changes)")
292
+ with gr.Tab("Ask a question"):
293
+ q = gr.Textbox(label="Your question")
294
+ openai_apikey = gr.Textbox(label="OpenAI API Key", type="password")
295
+ curation_names_1 = gr.CheckboxGroup(choices=curations_from_db, label="Curations to query")
296
+ button = gr.Button("Submit")
297
+ answer = gr.Markdown(value="")
298
+ sources = gr.Markdown(value="")
299
+ button.click(ask_question, inputs=[q, openai_apikey, curation_names_1], outputs=[answer, sources])
300
+ with gr.Tab("Browse & Organize Curations"):
301
+ def refresh_curation_accordion():
302
+ output = gen_curation_md()
303
+ return gr.update(value=output)
304
+ #md.change(fn=refresh_curation_accordion, inputs=[curation_names_1], outputs=[md])
305
+ # for name,id in curation_name2id().items():
306
+ # print(id,name,rows)
307
+ # accordions_state[name] = {'gr_obj': gr.Accordion(name), 'rows': []}
308
+ # with accordions_state[name]['gr_obj']:
309
+ # for i,medium in enumerate(row_d[id]):
310
+ # accordions_state[name]['rows'].append(gr.Row(variant='compact'))
311
+ # with accordions_state[name]['rows'][i]:
312
+ # gr.Markdown(medium['video_name'])
313
+ #delete_button = gr.Button("Delete from Curation")
314
+ #delete_button.click(...)
315
+ #refresh_button = gr.Button("Refresh curations")
316
+ md = gr.Markdown(gen_curation_md())
317
+ #refresh_button.click(fn=refresh_curation_accordion, inputs=[], outputs=[md])
318
+ with gr.Tab("Add data to Curations"):
319
+ gr.Markdown("An hour's worth of video seems to take about a minute to upload (ymmv).")
320
+ video_id = gr.Textbox(label="Youtube video id (NOT full url)", placeholder="lvh3g7eszVQ")
321
+ curation_names_2 = gr.CheckboxGroup(choices=curations_from_db,
322
+ #visible=len(Cur_keys) > 0,
323
+ label="Add to existing Curations")
324
+ new_curation = gr.Textbox(label="and/or add to new Curation")
325
+ button = gr.Button("Submit")
326
+ status_field = gr.Markdown()
327
+ # TODO need to undo rdb and vdb state if cancel clicked
328
+ #submit_click = button.click(ingest_video, inputs=[video_id, curation_names_2, new_curation], outputs=[status_field, curation_names_1, curation_names_2, md])
329
+ #cancel_button = gr.Button("Cancel", cancels=[submit_click])
330
+
331
+ def refresh_all_curation_lists():
332
+ return gr.update(choices=get_curation_names()), gr.update(choices=get_curation_names()), gr.update(value=gen_curation_md())
333
+ refresh_button.click(fn=refresh_all_curation_lists, inputs=[], outputs=[curation_names_1, curation_names_2, md])
334
+ demo.launch()