Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import openai
|
| 3 |
+
import concurrent.futures
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def convert_to_dict(text):
    """Parse transcript lines like "6.08 seconds - Yeah, ..." into a dict.

    Args:
        text: Multiline string where each relevant line has the form
            "<float> seconds - <message>".

    Returns:
        dict[float, str] mapping each timestamp (in seconds) to its message.
        Lines that do not match the expected pattern are skipped.
    """
    pattern = re.compile(r"(\d+\.\d+) seconds - (.*)")
    parsed = {}
    for line in text.splitlines():
        hit = pattern.match(line)
        if hit is None:
            continue
        parsed[float(hit.group(1))] = hit.group(2)
    return parsed
| 28 |
+
def process_dict(text, batch_size=20):
    """Parse *text* and merge consecutive transcript lines into batches.

    Fixes the original docstring, which documented a nonexistent ``dict_in``
    parameter (the real parameter is ``text``) and hard-coded "20 elements"
    even though ``batch_size`` is configurable.

    Args:
        text: Raw transcript string; see convert_to_dict for the line format.
        batch_size: Number of consecutive lines merged into one string.

    Returns:
        A dict mapping the timestamp of the first line in each batch to the
        space-joined text of every line in that batch. A trailing partial
        batch (fewer than batch_size lines) is kept.
    """
    dict_in = convert_to_dict(text)
    result = {}
    batch = []
    batch_key = None  # timestamp of the first line in the current batch
    for seconds, chunk in dict_in.items():
        if batch_key is None:
            batch_key = seconds
        batch.append(chunk)
        if len(batch) == batch_size:
            result[batch_key] = " ".join(batch)
            batch = []
            batch_key = None
    # Flush the final partial batch, if any.
    if batch:
        result[batch_key] = " ".join(batch)
    return result
def call3(chunk):
    """Summarize one transcript chunk into at most 3 bullet-point topics.

    Sends *chunk* to the gpt-3.5-turbo chat completion endpoint with
    temperature 0 (deterministic output) and returns the assistant's
    reply text.
    """
    conversation = [
        {"role": "system", "content": "You are a podcast chunk summarizer. You will be given a random chunk from a podcast transcript. you will return 3 most important topics (or less if necessary) from that chunk as bulleted point as output. Make the bullet points as concise and informative as possible."},
        {"role": "user", "content": str(chunk)},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=conversation,
    )
    return response["choices"][0]["message"]["content"]
def run_gpt_3(dict_in, function=call3):
    """Apply *function* to every value of *dict_in* concurrently.

    Args:
        dict_in: Mapping whose values are each passed to *function*.
        function: Callable invoked once per value on a worker thread.

    Returns:
        A dict with the same keys as *dict_in*, where each value is the
        result of calling *function* on the corresponding original value
        (input order is preserved).
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # map() preserves input order, matching the original submit/collect loop.
        outputs = list(pool.map(function, dict_in.values()))
    return dict(zip(dict_in.keys(), outputs))
def call4(chunk):
    """Turn a concatenated podcast gist into formatted show notes.

    Sends *chunk* to gpt-3.5-turbo (temperature 0) with a system prompt
    asking for a Hook, a short summary, and a topics section, and returns
    the assistant's reply text.

    NOTE(review): the exact internal whitespace of the multi-line system
    prompt below is uncertain from this source — confirm against the
    original file before relying on byte-exact prompt text.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature= 0,
        messages=[
            {"role": "system", "content": """You are a podcast summarizer. You will be given the gist of a long podcast, and you will output this format.
Hook: (Begin your podcast show notes with a gripping quote, anecdote, or question.)
Ex.One serendipitous relationship led him to start a company & change his life forever.
Give a Short Summary: Include main talking points and key phrases that will appeal to your
ideal listener.
Topics discussed in this episode: For this part, You will act as Youtube Video sectioning algorithm, and output similarly, using the given info.

and, Lastly, remember to output in an easily parsable format.

"""},
            {"role": "user", "content": str(chunk)}
        ]
    )
    return response['choices'][0]['message']['content']
def clean_and_concatenate_dict_values(dict_in):
    """Flatten a {key: summary} dict into one "key: value" line per entry.

    Each value is stripped of surrounding whitespace and has bullet markers
    ("- ") removed before being prefixed with its key. Intended to prepare
    the per-chunk summaries before the final summarization call.

    Args:
        dict_in: Mapping of keys to text values.

    Returns:
        A single string with one newline-terminated "key: value" line
        per dictionary entry.
    """
    lines = []
    for key, raw in dict_in.items():
        cleaned = raw.strip().replace("- ", "")
        lines.append(f"{key}: {cleaned}\n")
    return "".join(lines)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# text = """
|
| 150 |
+
# 6.08 seconds - Yeah, the Jack Carr one was pretty fun.
|
| 151 |
+
# 11.32 seconds - He's super nice.
|
| 152 |
+
# 16.56 seconds - I'm really enjoying this book.
|
| 153 |
+
# 21.80 seconds - I can't wait to see what happens next.
|
| 154 |
+
# 27.04 seconds - This is a great read.
|
| 155 |
+
# 32.28 seconds - I highly recommend it to anyone who enjoys thrillers.
|
| 156 |
+
# """
|
| 157 |
+
|
| 158 |
+
# result = convert_to_dict(text)
|
| 159 |
+
# new_result = process_dict(result)
|
| 160 |
+
|
| 161 |
+
# # print(list(new_result.values())[7])
|
| 162 |
+
|
| 163 |
+
# new_result
|