Spaces:
Runtime error
Runtime error
Commit ·
af19ad5
0
Parent(s):
Duplicated from bejaeger/sean-carrol-explains
Browse files- .gitattributes +31 -0
- README.md +15 -0
- app.py +213 -0
- requirements.txt +5 -0
.gitattributes
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Sean Carrol Explains
|
| 3 |
+
emoji: 🦾
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.10.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
duplicated_from: bejaeger/sean-carrol-explains
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
Curious about how this works? Check out the [article](https://pinecone.io/learn/openai-whisper)!
|
| 14 |
+
|
| 15 |
+
The current version of the app has a very limited video scope. We'd love to add more, so if you'd like to see more content added, feel free to send CSV data, including video title, channel ID, and video ID (at a minimum) to *james\@pinecone.io*. Even better if you could follow a format similar to [this](https://huggingface.co/datasets/jamescalam/channel-metadata).
|
app.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import pinecone
from sentence_transformers import SentenceTransformer
import logging
import openai

# Secrets are configured in the Streamlit / HF Spaces dashboard.
PINECONE_KEY = st.secrets["PINECONE_KEY"]  # app.pinecone.io
OPENAI_KEY = st.secrets["OPENAI_KEY"]
# Name of the pre-built Pinecone index holding the video transcripts.
INDEX_ID = 'sean-carrol-biggest-ideas-of-the-universe'
| 11 |
+
@st.experimental_singleton
def init_openai():
    """Set the OpenAI API key once per session (cached as a singleton)."""
    openai.api_key = OPENAI_KEY
|
| 14 |
+
|
| 15 |
+
@st.experimental_singleton
def init_pinecone():
    """Initialise the Pinecone client and return a handle to the app's index."""
    pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
    index = pinecone.Index(INDEX_ID)
    return index
|
| 19 |
+
|
| 20 |
+
@st.experimental_singleton
def init_retriever():
    """Load the sentence-transformer used to embed queries (cached across reruns)."""
    return SentenceTransformer("multi-qa-mpnet-base-dot-v1")
|
| 23 |
+
|
| 24 |
+
def make_query(query, retriever, top_k=3, include_values=True, include_metadata=True, filter=None):
    """Embed *query* with *retriever* and search the Pinecone index.

    Retries up to 3 times, re-initialising the Pinecone connection on
    failure (connections can go stale between Streamlit reruns).

    Args:
        query: free-text question from the user.
        retriever: SentenceTransformer used to encode the query.
        top_k: number of matches to return.
        include_values / include_metadata: passed through to the index query.
        filter: optional Pinecone metadata filter dict (name kept for
            backward compatibility although it shadows the builtin).

    Returns:
        The list of Pinecone matches; empty if every attempt failed.
    """
    xq = retriever.encode([query]).tolist()
    logging.info(f"Query: {query}")
    matches = []
    for attempt in range(3):
        try:
            xc = st.session_state.index.query(
                xq,
                top_k=top_k,
                include_values=include_values,
                include_metadata=include_metadata,
                filter=filter
            )
            matches = xc['matches']
            break
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; keep the best-effort retry.
        except Exception:
            logging.exception("Pinecone query failed (attempt %d); re-initialising", attempt + 1)
            # force reload of the (possibly stale) connection
            pinecone.init(api_key=PINECONE_KEY, environment="us-west1-gcp")
            st.session_state.index = pinecone.Index(INDEX_ID)
    if not matches:
        logging.error("Query failed")
    return matches
|
| 48 |
+
|
| 49 |
+
def get_prompt(matches, query_text=None):
    """Build a GPT-3 prompt from retrieved transcript contexts.

    Joins as many context passages as fit under the character limit,
    always keeping at least one when any are available. The original
    version raised UnboundLocalError when fewer than two contexts were
    retrieved (its loop started at index 1 and `prompt` was never bound).

    Args:
        matches: Pinecone matches; each must carry metadata['text'].
        query_text: question to embed in the prompt. Defaults to the
            module-level `query` (the Streamlit text input) so existing
            callers are unaffected.

    Returns:
        The assembled prompt string.
    """
    if query_text is None:
        # Backward-compatible fallback to the module-level Streamlit input.
        query_text = query
    contexts = [x['metadata']['text'] for x in matches]
    prompt_start = (
        "Answer the question based on the context below.\n\n" +
        "Context:\n"
    )
    prompt_end = f"\n\nQuestion: {query_text}\nAnswer:"
    limit = 3750  # rough character budget for the context section

    # Take the largest prefix of contexts that fits under the limit
    # (at minimum one context, matching the original intent).
    joined = ""
    for i in range(len(contexts), 0, -1):
        joined = "\n\n--\n\n".join(contexts[:i])
        if len(joined) < limit or i == 1:
            break
    return prompt_start + joined + prompt_end
|
| 77 |
+
|
# Module-level initialisation: each helper is singleton-cached, so these
# run real work only once per Streamlit session.
init_openai()
st.session_state.index = init_pinecone()
retriever = init_retriever()
|
| 81 |
+
|
| 82 |
+
def card(thumbnail: str, title: str, urls: list, contexts: list, starts: list, ends: list):
    """Render one video result card (thumbnail, title, timestamped links).

    The parallel lists urls/contexts/starts/ends describe the matched
    transcript segments for a single video. Segments are rendered in
    chronological order; a segment overlapping the previous one is merged
    into it rather than starting a new paragraph.

    Returns the st.markdown element.
    """
    meta = [(e, s, u, c) for e, s, u, c in zip(ends, starts, urls, contexts)]
    meta.sort(reverse=False)  # chronological (keyed on end time first)
    text_content = []
    current_start = 0
    current_end = 0
    for end, start, url, context in meta:
        # Reformat seconds to an MM:SS timestamp. divmod replaces the old
        # string-slicing arithmetic, which truncated past 100 minutes and
        # could emit ":60" through float rounding.
        mins, secs = divmod(int(start), 60)
        timestamp = f"{mins:02d}:{secs:02d}"
        if start < current_end and start > current_start:
            # Continuation of the previous segment: trim the overlapping
            # tail off the previous entry before appending this one.
            text_content[-1][0] = text_content[-1][0].split(context[:10])[0]
            text_content.append([f"[{timestamp}] {context.capitalize()}", url])
        else:
            # Marker expanded to a <br> below, separating segment groups.
            text_content.append(["xxLINEBREAKxx", ""])
            text_content.append([f"[{timestamp}] {context}", url])
        current_start = start
        current_end = end
    html_text = ""
    for text, url in text_content:
        if text == "xxLINEBREAKxx":
            html_text += "<br>"
        else:
            html_text += f"<small><a href={url}>{text.strip()}... </a></small>"
            print(text)
    html = f"""
    <div class="container-fluid">
        <div class="row align-items-start">
            <div class="col-md-4 col-sm-4">
                <div class="position-relative">
                    <a href={urls[0]}><img src={thumbnail} class="img-fluid" style="width: 192px; height: 106px"></a>
                </div>
            </div>
            <div class="col-md-8 col-sm-8">
                <h2>{title}</h2>
            </div>
            <div>
                {html_text}
            <br><br>
    """
    return st.markdown(html, unsafe_allow_html=True)
|
| 126 |
+
|
| 127 |
+
# Leftover from the multi-channel template this Space was duplicated from;
# currently unused (the channel filter UI was disabled).
channel_map = {
    'James Briggs': 'UCv83tO5cePwHMt1952IVVHw',
    'Daniel Bourke': 'UCr8O8l5cCX85Oem1d18EezQ',
    'Yannic Kilcher': 'UCZHmQk67mSJgfCCTn7xBfew',
    'AI Coffee Break with Letitia': 'UCobqgqE4i5Kf7wrxRxhToQA',
    'sentdex': 'UCfzlCWGWYyIQ0aLC5w48gBQ'
}

st.write("""
# Sean Carroll Explains
""")

st.info("""
Ask any question about Sean Carroll's video series 'The Biggest Ideas in the Universe'.
The search is built using OpenAI's Whisper, SentenceTransformer, GPT-3, and Pinecone, and is built off of James Brigg's [example](https://pinecone.io/learn/openai-whisper)!
""")

# Pull in Bootstrap so the result cards render with grid classes.
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

query = st.text_input("Ask about the universe...", "")

st.checkbox("Generate summary with GPT-3?", key="summarize")

if query != "":
    print(f"query: {query}")
    matches = make_query(query, retriever, top_k=5)

    if st.session_state.summarize:
        # Optional GPT-3 summary built from the retrieved contexts.
        prompt = get_prompt(matches)
        res = openai.Completion.create(
            engine='text-davinci-003',
            prompt=prompt,
            temperature=0,
            max_tokens=300,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=".",
        )
        summary = res['choices'][0]['text'].strip()
        st.info(f"Summary:\n{summary}")

    # Group matches by video, keeping first-seen order for display.
    results = {}
    order = []
    for match in matches:
        meta = match['metadata']
        video_id = meta['url'].split('/')[-1]
        start = int(meta['start'])
        timestamped_url = f"{meta['url']}?t={start}"
        if video_id not in results:
            results[video_id] = {
                'title': meta['title'],
                'urls': [timestamped_url],
                'contexts': [meta['text']],
                'starts': [start],
                'ends': [int(meta['end'])]
            }
            order.append(video_id)
        else:
            entry = results[video_id]
            entry['urls'].append(timestamped_url)
            entry['contexts'].append(meta['text'])
            entry['starts'].append(start)
            entry['ends'].append(int(meta['end']))

    # now display cards
    for video_id in order:
        info = results[video_id]
        card(
            thumbnail=f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
            title=info['title'],
            urls=info['urls'],
            contexts=info['contexts'],
            starts=info['starts'],
            ends=info['ends']
        )
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers
|
| 2 |
+
sentence-transformers
|
| 3 |
+
pinecone-client
|
| 4 |
+
click==8.0
|
| 5 |
+
openai
|