marcgreen commited on
Commit
669b468
·
1 Parent(s): 26ca549

copy insertion logic (+ WIP langchain pinecone impl) from colab

Browse files
Files changed (1) hide show
  1. app.py +198 -0
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Interface for Pinecone vector stores."""
2
+ import uuid
3
+ import pinecone
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any, Callable, Dict, Iterable, List, Optional
6
+
7
+ from langchain.docstore.document import Document
8
+ from langchain.embeddings.base import Embeddings
9
+ from langchain.vectorstores.base import VectorStore
10
+
11
+
12
class Pinecone(VectorStore):
    """Pinecone-backed vector store.

    `add_texts` embeds text chunks and upserts them into a Pinecone index;
    `similarity_search` embeds a query and returns the closest chunks as
    langchain Documents. `from_texts` is still a WIP stub.
    """

    def __init__(
        self, api_key: str, index_name: str, embedding_function: Callable
    ):
        """Initialize with necessary components.

        Args:
            api_key: Pinecone API key.
            index_name: name of an existing Pinecone index.
            embedding_function: maps a list of texts to a list of embedding
                vectors (e.g. ``HuggingFaceEmbeddings().embed_documents``).

        Raises:
            ValueError: if the pinecone client package is not installed.
        """
        try:
            import pinecone
        except ImportError:
            raise ValueError(
                "Could not import pinecone python package. "
                "Please install it with `pip install pinecone-client`."
            )
        self.embedding_function = embedding_function
        self.index_name = index_name
        pinecone.init(
            api_key=api_key,
            environment='us-west1-gcp'  # only option for free tier
        )
        # Keep a handle on the configured module; Index objects are created
        # per call so they always reflect the init() above.
        self.client = pinecone

    def _query(self):
        # WIP stub copied from colab; not used anywhere yet.
        pass

    def add_texts(
        self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
    ) -> None:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: text chunks to embed and upsert.
            metadatas: optional per-chunk metadata dicts, parallel to texts.
        """
        # BUG FIX: the code below needs len() and slicing, which a bare
        # Iterable does not guarantee — materialize once up front.
        texts = list(texts)
        index = self.client.Index(self.index_name)
        batch_size = 16  # Pinecone's recommended upsert limit is 100 vectors
        for i in range(0, len(texts), batch_size):
            i_end = min(i + batch_size, len(texts))
            text_batch = texts[i:i_end]
            metadata_batch = metadatas[i:i_end] if metadatas else [{}] * (i_end - i)
            embedding_batch = self.embedding_function(text_batch)
            to_upsert = [
                (
                    str(uuid.uuid4()),  # ids are currently throwaway
                    # BUG FIX: embedding functions may return numpy arrays
                    # (with .tolist()) or plain python lists — handle both.
                    embedding.tolist() if hasattr(embedding, "tolist") else list(embedding),
                    # if 'text' is also in metadata, the metadata value wins
                    dict({"text": text}, **metadata),
                )
                for text, embedding, metadata in zip(text_batch, embedding_batch, metadata_batch)
            ]
            index.upsert(vectors=to_upsert)

    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: natural-language query text.
            k: number of nearest chunks to return.
        """
        index = self.client.Index(self.index_name)
        # BUG FIX: Pinecone expects an embedding vector, not raw text —
        # embed the query with the same function used at insert time.
        query_embedding = self.embedding_function([query])[0]
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()
        response = index.query(
            #namespace="example-namespace",
            top_k=k,
            include_values=True,
            include_metadata=True,
            vector=query_embedding,
            #filter={
            #    "genre": {"$in": ["comedy", "documentary", "drama"]}
            #}
        )
        # BUG FIX: the hits live under "matches" in the query response;
        # iterating the response object directly does not yield match dicts.
        return [
            Document(page_content=match["metadata"]["text"], metadata=match["metadata"])
            for match in response["matches"]
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any
    ) -> "VectorStore":
        """Return VectorStore initialized from texts and embeddings."""
        # WIP: not implemented yet — currently returns None.
92
+
93
+ # TODO fill out other 2 methods for Pinecone Vector Store and ask if Harrison would be open to a PR
94
+ # TODO account for mpnet's limit of 384 word pieces per chunk (is it done already?)
95
+ # DONE need to check if embeddings exist for given video id before generating embeddings
96
+ # supabase to store index (apparently can't rely on vector db to do it?) and user's curations / popular curations
97
+ # - paused after 1 week inactivity (and i believe pinecone index DELETED after some days of inactivity?!)
98
+ # - - TODO backup both pinecone and supabase daily (this should count as the activity), and make publicly accessible
99
+ # TODO user prefs data model (their curations)
100
+ # - meh not needed at first
101
+ # DONE design main workflows
102
+ # DONE curation data model
103
+ # TODO frontend (discord bot or gradio or)
104
+ # - i also want to be able to give it a yt vid and have it summarize it for me
105
+ # TODO workflow for curating videos into sets (aka Curations)
106
+ # TODO workflow to ask Curations a question
107
+ # - LEFT OFF here
108
+ # TODO support yt playlists in addition to just one-off videos
109
+ # - can i make this really easy to add via a well designed api?
110
+ # TODO finalize deployment strategy
111
+ # - supabase free tier for db + blob storage of transcripts
112
+ # - hf space to host model computations (langchain bits need to run here)
113
+ # - replit or supabase to host edge functions to call hf space
114
+ # TODO gradio session state to track recently asked questions
115
+
116
import json
import os  # BUG FIX: os.environ is read below but `os` was never imported

import gradio as gr

from langchain.text_splitter import SpacyTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from youtube_transcript_api import YouTubeTranscriptApi
from supabase import create_client, Client

# Instantiate the embedding model once (it was previously constructed twice,
# loading the HF model into memory two times for no benefit).
_hf_embeddings = HuggingFaceEmbeddings()
embedder = _hf_embeddings.embed_documents
model_name = _hf_embeddings.model_name

PINECONE_APIKEY: str = os.environ.get("PINECONE_APIKEY")
SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
SUPABASE_KEY: str = os.environ.get("SUPABASE_KEY")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
# NOTE(review): this rebinding shadows the `pinecone` module imported at the
# top of the file with our Pinecone wrapper instance. The class keeps its own
# module handle internally so this works, but consider renaming the variable.
pinecone = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
135
+
136
def transcript2chunks(transcript):
    """Split a raw transcript string into sentence-based text chunks."""
    splitter = SpacyTextSplitter()
    chunks = splitter.split_text(transcript)
    return chunks
138
+
139
def ingest_transcript(transcript, video_id: str = ""):
    """Chunk a transcript and upsert the chunks into the Pinecone index.

    Args:
        transcript: full transcript text of one video.
        video_id: YouTube video id stored as per-chunk metadata.
            BUG FIX: `video_id` was previously an undefined free variable
            (NameError at call time); it is now a defaulted parameter, so
            existing single-argument callers keep working.
    """
    p = Pinecone(PINECONE_APIKEY, 'semantic-curations', embedder)
    chunks = transcript2chunks(transcript)
    p.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))
143
+
144
def already_ingested(yt_video_id: str):
    """Return True if this video id is already recorded in supabase."""
    result = (
        supabase.table("ingested_youtube_videos")
        .select("*", count="estimated")
        .eq('video_id', yt_video_id)
        .execute()
    )
    return result.count > 0
147
+
148
def yt2transcript(video_id):
    """Fetch a YouTube video's transcript and join it into a single string."""
    print(f"\n\nid: {video_id}")
    # Each entry looks like:
    # {'text': 'hey friends welcome to one little coder', 'start': 0.84, 'duration': 4.38}
    entries = YouTubeTranscriptApi.get_transcript(video_id)
    # TODO if there is no transcript (how likely is this?), run through whisper-large on hf (but 30k free characters per month)
    return ' '.join(entry['text'] for entry in entries)
155
+
156
def ingest_video(video_id):
    """Ingest one YouTube video: transcript -> chunks -> pinecone + supabase.

    Returns "dupe" when the video was already ingested, otherwise a short
    status string with the number of chunks upserted.
    """
    # Guard clause: skip videos we have already processed.
    if already_ingested(video_id):
        return "dupe"

    transcript = yt2transcript(video_id)
    chunks = transcript2chunks(transcript)
    pinecone.add_texts(chunks, [{'yt_video_id': video_id}] * len(chunks))

    # Record the video in supabase and pull back the generated row.
    inserted_row_data = supabase.table("ingested_youtube_videos").insert({"video_id": video_id}).execute()
    inserted_row_json = json.loads(inserted_row_data.json())
    print(inserted_row_json)
    inserted_row = inserted_row_json['data'][0]

    # Link the transcript metadata to the inserted video row.
    data = supabase.table("ingested_transcripts").insert(
        {
            'source_id': inserted_row['media_id'],
            'num_chunks': len(chunks),
            'embedding_model': str(embedder),
            'transcribed_by': 'youtube_transcript_api',
        }
    ).execute()
    return f"ingested {len(chunks)} chunks"
173
+
174
+ # this needn't be in hf space, as it will just call out to openai and the db
175
+ # but why not host it here since it's free vs replits 2 cents/day
176
def ask_question(question: str, instruction: str, curation_ids: List[str]):
    """WIP: answer a question against ingested content.

    Currently only fetches the top-k similar chunks and returns None;
    prompt formatting and the LLM call are still to be written.
    """
    # query vector db for topk chunks
    topk = pinecone.similarity_search(question, k=10)

    # format prompt (textwrap to guarantee length?)

    # query llm and return output and topk
184
+
185
+ # this needn't be in hf space, as it will just interact with db's community and user Curations
186
+ # but why not host it here
187
+ # will be updating the pinecone metadata/namespace with the curation ids it's a part of
188
+ # - but that would tie the app to pinecone....
189
+ # - so maybe I do want to use elastic 8,5 since it can do both? need nontrivial specs
190
+ # - using metadata would let us assign multiple curation ids to each video, whereas namespace could only be 1
191
def organize_curations():
    """WIP placeholder: manage community/user Curations (no-op for now)."""
    return None
193
+
194
# Minimal gradio UI: a single text input (a YouTube video id) is passed to
# ingest_video, whose status string is shown as the text output.
demo = gr.Interface(fn=ingest_video, inputs=["text"], outputs=["text"])
#article="\n".join([f"- {k}: " + v.replace("\n"," ") for k,v in instructions.items()]))
# launch() blocks and serves the app (HF Spaces runs this module directly).
demo.launch()
197
+
198
+ #ingest_video(video_id)