bejaeger committed on
Commit
af19ad5
·
0 Parent(s):

Duplicate from bejaeger/sean-carrol-explains

Browse files
Files changed (4) hide show
  1. .gitattributes +31 -0
  2. README.md +15 -0
  3. app.py +213 -0
  4. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sean Carrol Explains
3
+ emoji: 🦾
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: bejaeger/sean-carrol-explains
11
+ ---
12
+
13
+ Curious about how this works? Check out the [article](https://pinecone.io/learn/openai-whisper)!
14
+
15
+ The current version of the app has a very limited video scope. We'd love to add more, so if you'd like to see more content added, feel free to send CSV data, including video title, channel ID, and video ID (at a minimum) to *james\@pinecone.io*. Even better if you could follow a format similar to [this](https://huggingface.co/datasets/jamescalam/channel-metadata).
app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pinecone
3
+ from sentence_transformers import SentenceTransformer
4
+ import logging
5
+ import openai
6
+
7
# API credentials are read from Streamlit's secrets store.
PINECONE_KEY = st.secrets["PINECONE_KEY"] # app.pinecone.io
OPENAI_KEY = st.secrets["OPENAI_KEY"]
# Name of the Pinecone index opened by init_pinecone().
INDEX_ID = 'sean-carrol-biggest-ideas-of-the-universe'
10
+
11
@st.experimental_singleton
def init_openai():
    """Set the ``openai`` module's API key from ``OPENAI_KEY``."""
    openai.api_key = OPENAI_KEY
14
+
15
@st.experimental_singleton
def init_pinecone():
    """Initialise the Pinecone client and return a handle to ``INDEX_ID``."""
    pinecone.init(environment="us-west1-gcp", api_key=PINECONE_KEY)
    index_handle = pinecone.Index(INDEX_ID)
    return index_handle
19
+
20
@st.experimental_singleton
def init_retriever():
    """Load and return the sentence-embedding model used to encode queries."""
    model_name = "multi-qa-mpnet-base-dot-v1"
    return SentenceTransformer(model_name)
23
+
24
def make_query(query, retriever, top_k=3, include_values=True, include_metadata=True, filter=None):
    """Embed ``query`` and search the Pinecone index held in session state.

    The index query is attempted up to three times. After each failed attempt
    the failure is logged, the Pinecone client is re-initialised, and the
    handle in ``st.session_state.index`` is replaced before the next try.

    Args:
        query: Text of the user's question.
        retriever: Object whose ``encode`` method takes a list of strings and
            returns a value exposing ``tolist()``.
        top_k: Forwarded to the index's ``query`` call.
        include_values: Forwarded to the index's ``query`` call.
        include_metadata: Forwarded to the index's ``query`` call.
        filter: Forwarded to the index's ``query`` call. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        The ``'matches'`` entry of the query response, or an empty list when
        every attempt failed.
    """
    max_attempts = 3
    xq = retriever.encode([query]).tolist()
    logging.info(f"Query: {query}")
    matches = []
    for attempt in range(1, max_attempts + 1):
        try:
            xc = st.session_state.index.query(
                xq,
                top_k=top_k,
                include_values=include_values,
                include_metadata=include_metadata,
                filter=filter,
            )
            matches = xc['matches']
            break
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit (and any other BaseException
            # used for control flow) are not swallowed by the retry loop.
            logging.exception(
                f"Pinecone query attempt {attempt}/{max_attempts} failed"
            )
            # force reload
            pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
            st.session_state.index = pinecone.Index(INDEX_ID)
    if not matches:
        logging.error("Query failed")
    return matches
48
+
49
def get_prompt(matches, question=None, limit=3750):
    """Build the completion prompt from retrieved contexts and a question.

    Contexts are taken from ``match['metadata']['text']`` in the order given
    and joined with ``"\\n\\n--\\n\\n"``. Only the longest leading run of
    contexts whose joined length stays below ``limit`` characters is used, so
    the result is well defined for zero, one, or many matches.

    Args:
        matches: Iterable of mappings, each with a ``'metadata'`` mapping
            containing a ``'text'`` string.
        question: Question to place at the end of the prompt. When omitted,
            the module-level ``query`` variable is used, which is what
            existing ``get_prompt(matches)`` callers rely on.
        limit: Exclusive upper bound, in characters, on the joined context
            text.

    Returns:
        The prompt string: instructions, the kept contexts, then the question.
    """
    if question is None:
        # Backward compatible fallback to the script-level global.
        question = query
    separator = "\n\n--\n\n"
    contexts = [x['metadata']['text'] for x in matches]
    prompt_start = (
        "Answer the question based on the context below.\n\n"
        "Context:\n"
    )
    prompt_end = f"\n\nQuestion: {question}\nAnswer:"

    kept = []
    joined_length = 0
    for text in contexts:
        # Length the joined string would have if this context were appended.
        added = len(text) + (len(separator) if kept else 0)
        if joined_length + added >= limit:
            break
        kept.append(text)
        joined_length += added
    return prompt_start + separator.join(kept) + prompt_end
77
+
78
# Set up the OpenAI key, the Pinecone index handle, and the embedding model.
# The index handle is stored in session state; make_query() replaces it there
# after a failed query.
init_openai()
st.session_state.index = init_pinecone()
retriever = init_retriever()
81
+
82
def _format_timestamp(seconds):
    """Format a second count as ``MM:SS``; minutes keep all their digits."""
    mins, secs = divmod(int(seconds), 60)
    return f"{mins:02d}:{secs:02d}"


def card(thumbnail: str, title: str, urls: list, contexts: list, starts: list, ends: list):
    """Render the results for one video as an HTML card via ``st.markdown``.

    The parallel lists ``urls``, ``contexts``, ``starts`` and ``ends`` describe
    the matched snippets. Snippets are ordered by end time; a snippet that
    starts strictly inside the previous snippet's time range is treated as a
    continuation, and the previous text is cut where the new one begins.

    Args:
        thumbnail: URL of the image shown on the card.
        title: Heading text for the card.
        urls: Link target for each snippet; ``urls[0]`` is also the image link.
        contexts: Text of each snippet.
        starts: Start of each snippet, in seconds.
        ends: End of each snippet, in seconds.

    Returns:
        Whatever ``st.markdown`` returns for the rendered HTML.
    """
    # Local import: the file does not import ``html`` at module level.
    from html import escape

    meta = sorted(zip(ends, starts, urls, contexts))
    text_content = []
    current_start = 0
    current_end = 0
    for end, start, url, context in meta:
        timestamp = _format_timestamp(start)
        if current_start < start < current_end:
            # this means it is a continuation of the previous sentence
            overlap_marker = context[:10]
            if overlap_marker:
                # str.split raises ValueError on an empty separator, so the
                # trim is skipped for an empty context.
                text_content[-1][0] = text_content[-1][0].split(overlap_marker)[0]
            text_content.append([f"[{timestamp}] {context.capitalize()}", url])
        else:
            text_content.append(["xxLINEBREAKxx", ""])
            text_content.append([f"[{timestamp}] {context}", url])
        current_start = start
        current_end = end

    pieces = []
    for text, url in text_content:
        if text == "xxLINEBREAKxx":
            pieces.append("<br>")
        else:
            # Attribute values are quoted and all interpolated text is escaped
            # because the markup is rendered with unsafe_allow_html=True.
            pieces.append(
                f'<small><a href="{escape(url)}">'
                f"{escape(text.strip(), quote=False)}... </a></small>"
            )
    html_text = "".join(pieces)

    # NOTE(review): the container, row and last <div> are left unclosed, as in
    # the original markup - confirm whether that is intentional.
    card_html = (
        "\n"
        '<div class="container-fluid">\n'
        '<div class="row align-items-start">\n'
        '<div class="col-md-4 col-sm-4">\n'
        '<div class="position-relative">\n'
        f'<a href="{escape(urls[0])}"><img src="{escape(thumbnail)}" '
        'class="img-fluid" style="width: 192px; height: 106px"></a>\n'
        "</div>\n"
        "</div>\n"
        '<div class="col-md-8 col-sm-8">\n'
        f"<h2>{escape(title, quote=False)}</h2>\n"
        "</div>\n"
        "<div>\n"
        f"{html_text}\n"
        "<br><br>\n"
    )
    return st.markdown(card_html, unsafe_allow_html=True)
126
+
127
# Display name -> channel ID lookup (presumably YouTube channel IDs - confirm).
# NOTE(review): only the commented-out channel filter below refers to this
# mapping; nothing in the active code reads it.
channel_map = {
    'James Briggs': 'UCv83tO5cePwHMt1952IVVHw',
    'Daniel Bourke': 'UCr8O8l5cCX85Oem1d18EezQ',
    'Yannic Kilcher': 'UCZHmQk67mSJgfCCTn7xBfew',
    'AI Coffee Break with Letitia': 'UCobqgqE4i5Kf7wrxRxhToQA',
    'sentdex': 'UCfzlCWGWYyIQ0aLC5w48gBQ'
}

# Page heading.
st.write("""
# Sean Carroll Explains
""")

# Short description shown under the heading.
# NOTE(review): "James Brigg's" in the text below looks like a typo for
# "James Briggs's"; left untouched here because it is user-facing text.
st.info("""
Ask any question about Sean Carroll's video series 'The Biggest Ideas in the Universe'.
The search is built using OpenAI's Whisper, SentenceTransformer, GPT-3, and Pinecone, and is built off of James Brigg's [example](https://pinecone.io/learn/openai-whisper)!
""")

# Pull in the Bootstrap stylesheet; card() emits markup using its class names.
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

# Free-text question; an empty string means nothing has been asked yet.
query = st.text_input("Ask about the universe...", "")

# The checkbox state is read later as st.session_state.summarize.
st.checkbox("Generate summary with GPT-3?", key="summarize")
# with st.expander("Advanced Options"):
#     channel_options = st.multiselect(
#         'Channels to Search',
#         ['James Briggs', 'Daniel Bourke', 'Yannic Kilcher', 'AI Coffee Break with Letitia', 'sentdex'],
#         ['James Briggs', 'Daniel Bourke', 'Yannic Kilcher', 'AI Coffee Break with Letitia', 'sentdex']
#     )
157
+
158
# Run the search (and optional summary) once the user has entered a question.
if query != "":
    # channels = [channel_map[name] for name in channel_options]
    print(f"query: {query}")
    matches = make_query(
        query, retriever, top_k=5,
        # filter={
        #     'channel_id': {'$in': channels}
        # }
    )
    # Summarise only when the search returned something: with no matches
    # there are no contexts to build a prompt from.
    if st.session_state.summarize and matches:
        prompt = get_prompt(matches)
        res = openai.Completion.create(
            engine='text-davinci-003',
            prompt=prompt,
            temperature=0,
            max_tokens=300,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=".",
        )
        summary = res['choices'][0]['text'].strip()
        st.info(f"Summary:\n{summary}")

    # Group the matches by video. Dicts keep insertion order, so the cards
    # below appear in the order each video was first seen in the results.
    results = {}
    for context in matches:
        metadata = context['metadata']
        # The last path segment of the URL is used as the video ID.
        video_id = metadata['url'].split('/')[-1]
        start = int(metadata['start'])
        entry = results.setdefault(video_id, {
            'title': metadata['title'],
            'urls': [],
            'contexts': [],
            'starts': [],
            'ends': [],
        })
        entry['urls'].append(f"{metadata['url']}?t={start}")
        entry['contexts'].append(metadata['text'])
        entry['starts'].append(start)
        entry['ends'].append(int(metadata['end']))

    # now display cards
    for video_id, entry in results.items():
        card(
            thumbnail=f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            title=entry['title'],
            urls=entry['urls'],
            contexts=entry['contexts'],
            starts=entry['starts'],
            ends=entry['ends'],
        )
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ sentence-transformers
3
+ pinecone-client
4
+ click==8.0
5
+ openai