Spaces:
Sleeping
Sleeping
| import os | |
| import requests | |
| import json | |
| import re | |
| import gradio as gr | |
| from pytube import YouTube | |
| import whisper | |
| import time | |
| import pickle | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.schema.document import Document | |
| from langchain.chains.mapreduce import MapReduceChain | |
| from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain | |
| from langchain.chat_models import ChatOpenAI | |
| from langchain.chains.llm import LLMChain | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains.combine_documents.stuff import StuffDocumentsChain | |
| from PIL import Image | |
| from io import BytesIO | |
# Placeholder for the OpenAI API key; the real key is typed into the
# Gradio textbox by the user and threaded through get_title() at runtime.
openai_api_key = ""
def youtube_text(link):
    """Download a YouTube video's audio, transcribe it with Whisper, and
    chunk the transcript for map-reduce summarization.

    Args:
        link: YouTube video URL.

    Returns:
        (split_docs, full_docs): the chunked transcript as a list of
        langchain Documents, and the full transcript string.
    """
    # Audio-only stream; the fixed filename is overwritten on every call.
    yt = YouTube(link)
    yt.streams.filter(only_audio=True).first().download(output_path=".", filename="test.mp3")

    start = time.time()
    model = whisper.load_model("base")
    text = model.transcribe("test.mp3")
    end = time.time()

    print(text["text"])
    print(f"{end - start:.2f}sec")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=50,
        length_function=len,
    )
    full_docs = text["text"]
    # split_text already yields <=2000-char chunks; the original then ran
    # split_documents over these Documents again, which was a no-op.
    split_docs = [Document(page_content=x) for x in text_splitter.split_text(full_docs)]

    # The original open() crashed with FileNotFoundError when temp/ was
    # missing; create it first.
    os.makedirs("temp", exist_ok=True)
    with open("temp/split_example_small.pkl", "wb") as f:
        pickle.dump(split_docs, f)
    return split_docs, full_docs
def youtube_sum(split_docs, full_docs, API_KEY):
    """Summarize the chunked transcript with a map-reduce LLM chain and ask
    the model for one keyword.

    Args:
        split_docs: list of Documents from youtube_text().
        full_docs: full transcript string (not used here; kept so the
            existing call signature stays intact).
        API_KEY: OpenAI API key entered in the UI.

    Returns:
        The raw LLM output, expected to contain "Summary:" and "Keyword:".
    """
    llm = ChatOpenAI(temperature=0.7, openai_api_key=API_KEY)

    # Map prompt: theme extraction per chunk.
    map_template = """The following is a set of documents
{docs}
Based on this list of Video subtitles , please identify the main themes
Helpful Answer:"""
    map_prompt = PromptTemplate.from_template(map_template)

    # Reduce prompt: merge per-chunk themes into one summary + keyword.
    reduce_template = """The following is set of summaries:
{doc_summaries}
You need to output two things from the above Video Subtitles.
1. Write an executive summary
Read the following subtitles and write a summary that integrates them to quickly identify the main topics of the Video.
Your summary should.
- Must be written in Korean
- Be a 1~2 paragraph
- Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
- There are no more than three main topics in the video.
- Please also briefly describe the overall content of the video
2. Choose your keyword
The keywords have the following conditions
- Must be written in Korean
- Must be a single word
- Must be a noun
- Must be a word that appears in the Video
- Must be a word that is not a stopword
- Must be a word that is not a proper noun
- Must be a word that is not a number
- Must be a word that is not a verb
- Must be a word that is not a pronoun
- Must be a word that is not a preposition
- Must be a word that is not a conjunction
- Must be a word that is not an interjection
- Must be a word that is not an adjective
- Must be a word that is not an adverb
- Must be a word that is not a determiner
- Must be a word that is not a particle
- Must be a word that is not a numeral
- Output only one keyword
Here is an example of the final output
Summary: Summary of The video
Keyword: keyword
Don't output any other text outside of the given format
Helpful Answer:"""
    reduce_prompt = PromptTemplate.from_template(reduce_template)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )
    # Combines and iteratively reduces the mapped documents.
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`.
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=4000,
    )

    map_chain = LLMChain(llm=llm, prompt=map_prompt)
    # Map a chain over the documents, then combine the results.
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in.
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    result = map_reduce_chain.run(split_docs)
    print(result)

    # The result is Korean text: write it as UTF-8 explicitly (the
    # original relied on the platform default encoding), and make sure
    # temp/ exists first.
    os.makedirs("temp", exist_ok=True)
    with open("temp/result.txt", "w", encoding="utf-8") as f:
        f.write(result)
    return result
def text_to_arr(result):
    """Extract the single keyword from the LLM summary output.

    Tries the English "Keyword:" label first, then the Korean label.

    Args:
        result: raw LLM output from youtube_sum().

    Returns:
        The keyword string.

    Raises:
        ValueError: if neither label is present (the original crashed with
            AttributeError on `None.group(1)` in that case).
    """
    match = re.search(r"Keyword:\s*(\w+)", result)
    if match is None:
        # Fallback: Korean label variant of the expected output format.
        match = re.search(r"ํค์๋:\s*(\w+)", result)
    if match is None:
        raise ValueError("No keyword found in summary output")
    keyword = match.group(1)
    print("Keyword:", keyword)
    return keyword
def aladin_api(keyword, selected_option):
    """Search the Aladin book API for the keyword within the selected
    category.

    Args:
        keyword: search keyword extracted from the video summary.
        selected_option: category label chosen in the UI dropdown.

    Returns:
        JSON string: a one-element list with the API response, or an empty
        list when the category label is unknown (matches the original
        fall-through behavior).
    """
    # NOTE(review): API key is hard-coded; should be moved to an
    # environment variable / Space secret.
    aladin_key = 'ttbkangmj08250027001'
    # Category label -> Aladin CategoryId. The original duplicated the
    # whole request in four branches differing only in this number.
    category_ids = {"์ฌํ": 798, "๊ณผํ": 987, "์์ค": 1, "๊ธ์ต": 170}
    all_data = []

    category_id = category_ids.get(selected_option)
    if category_id is not None:
        print(keyword)
        url = (
            f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}"
            f"&Query={keyword}&QueryType=Keyword&Cover=Big&MaxResults=5"
            "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101"
            f"&CategoryId={category_id}&outofStockFilter=1"
        )
        response = requests.get(url)
        all_data.append(json.loads(response.text))

    all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
    # Make sure the cache directory exists before writing.
    os.makedirs("temp", exist_ok=True)
    with open("temp/book.json", "wb") as f:
        f.write(all_data.encode("utf-8"))
    print(type(all_data))
    print(all_data)
    return all_data
def book_output(book_json):
    """Turn the Aladin API JSON into (title, cover image, link) values for
    the first three results, padding missing slots with placeholders.

    Args:
        book_json: JSON string returned by aladin_api().

    Returns:
        title1, image1, title2, image2, title3, image3, link1, link2, link3
    """
    data = json.loads(book_json)

    def _slot(index):
        # One result slot. The original indexed data[0]['item'][index]
        # unconditionally and crashed with IndexError when the API
        # returned fewer than three books; any missing level now falls
        # back to the "No Data" placeholder instead.
        try:
            item = data[0]['item'][index]
            title = item['title']
            link = item['link']
            cover = requests.get(item['cover'])
            image = Image.open(BytesIO(cover.content))
        except (IndexError, KeyError):
            title = "No Data"
            link = "No Data"
            image = Image.open("NO DATA.jpeg")
        return title, image, link

    title1, image1, book_link1 = _slot(0)
    title2, image2, book_link2 = _slot(1)
    title3, image3, book_link3 = _slot(2)
    return title1, image1, title2, image2, title3, image3, book_link1, book_link2, book_link3
def get_title(API_KEY, link, selected_option):
    """Full pipeline behind the button: transcribe, summarize, extract a
    keyword, query Aladin, and unpack the top three book results."""
    chunked_docs, transcript = youtube_text(link)
    summary = youtube_sum(chunked_docs, transcript, API_KEY)
    keyword = text_to_arr(summary)
    books_json = aladin_api(keyword, selected_option)
    (title1, image1, title2, image2, title3, image3,
     link1, link2, link3) = book_output(books_json)
    return (summary, title1, image1, title2, image2, title3, image3,
            link1, link2, link3)
# Define the list of options for the Dropdown (book categories passed to
# aladin_api as-is).
options_list = ["์ฌํ", "๊ณผํ", "์์ค", "๊ธ์ต"]

# Gradio UI: key/link/category inputs on top, then summary, three book
# titles, three cover images, and three link slots.
with gr.Blocks() as demo:
    gr.Markdown("Paste your Youtube Link and get the book recommandation")
    with gr.Column():
        with gr.Row():
            inp1 = gr.Textbox(label="Your OpenAI KEY")
            inp2 = gr.Textbox(label="Input Link")
        inp3 = gr.Dropdown(choices=options_list, label="Select a category")
        btn = gr.Button("Find the book")
    with gr.Column():
        out1 = gr.Textbox(label="Summary")
        with gr.Row():
            out2 = gr.Textbox(label="Title1")
            out4 = gr.Textbox(label="Title2")
            out6 = gr.Textbox(label="Title3")
        with gr.Row():
            out3 = gr.Image(label="Image1")
            out5 = gr.Image(label="Image2")
            out7 = gr.Image(label="Image3")
        with gr.Row():
            out8 = gr.HTML(label="Book Link1")
            out9 = gr.HTML(label="Book Link2")
            out10 = gr.HTML(label="Book Link3")
    # Output order must match get_title()'s 10-tuple return.
    btn.click(fn=get_title, inputs=[inp1, inp2, inp3],
              outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10])
demo.launch(share=True)