Spaces:
Sleeping
Sleeping
| import os | |
| import requests | |
| import json | |
| import re | |
| import gradio as gr | |
| from pytube import YouTube | |
| import whisper | |
| import time | |
| import pickle | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.schema.document import Document | |
| from langchain.chains.mapreduce import MapReduceChain | |
| from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain | |
| from langchain.chat_models import ChatOpenAI | |
| from langchain.chains.llm import LLMChain | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains.combine_documents.stuff import StuffDocumentsChain | |
| from PIL import Image | |
| from io import BytesIO | |
# Placeholder for the OpenAI API key; the real key is typed into the
# Gradio textbox by the user and threaded through get_title() at runtime.
openai_api_key = ""
def youtube_text(link):
    """Download a YouTube video's audio, transcribe it with Whisper, and
    chunk the transcript for map-reduce summarization.

    Args:
        link: YouTube video URL.

    Returns:
        (split_docs, full_docs): the chunked transcript as a list of
        langchain Documents, and the full transcript string.
    """
    # Audio-only stream; the fixed filename is overwritten on every call.
    yt = YouTube(link)
    yt.streams.filter(only_audio=True).first().download(output_path=".", filename="test.mp3")

    start = time.time()
    model = whisper.load_model("base")
    text = model.transcribe("test.mp3")
    end = time.time()

    print(text["text"])
    print(f"{end - start:.2f}sec")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=50,
        length_function=len,
    )
    full_docs = text["text"]
    # split_text already yields <=2000-char chunks; the original then ran
    # split_documents over these Documents again, which was a no-op.
    split_docs = [Document(page_content=x) for x in text_splitter.split_text(full_docs)]

    # The original open() crashed with FileNotFoundError when temp/ was
    # missing; create it first.
    os.makedirs("temp", exist_ok=True)
    with open("temp/split_example_small.pkl", "wb") as f:
        pickle.dump(split_docs, f)
    return split_docs, full_docs
def youtube_sum(split_docs, full_docs, API_KEY):
    """Summarize the chunked transcript with a map-reduce LLM chain and ask
    the model for one keyword.

    Args:
        split_docs: list of Documents from youtube_text().
        full_docs: full transcript string (not used here; kept so the
            existing call signature stays intact).
        API_KEY: OpenAI API key entered in the UI.

    Returns:
        The raw LLM output, expected to contain "Summary:" and "Keyword:".
    """
    llm = ChatOpenAI(temperature=0.7, openai_api_key=API_KEY)

    # Map prompt: theme extraction per chunk.
    map_template = """The following is a set of documents
{docs}
Based on this list of Video subtitles , please identify the main themes
Helpful Answer:"""
    map_prompt = PromptTemplate.from_template(map_template)

    # Reduce prompt: merge per-chunk themes into one summary + keyword.
    reduce_template = """The following is set of summaries:
{doc_summaries}
You need to output two things from the above Video Subtitles.
1. Write an executive summary
Read the following subtitles and write a summary that integrates them to quickly identify the main topics of the Video.
Your summary should.
- Must be written in Korean
- Be a 1~2 paragraph
- Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
- There are no more than three main topics in the video.
- Please also briefly describe the overall content of the video
2. Choose your keyword
The keywords have the following conditions
- Must be written in Korean
- Must be a single word
- Must be a noun
- Must be a word that appears in the Video
- Must be a word that is not a stopword
- Must be a word that is not a proper noun
- Must be a word that is not a number
- Must be a word that is not a verb
- Must be a word that is not a pronoun
- Must be a word that is not a preposition
- Must be a word that is not a conjunction
- Must be a word that is not an interjection
- Must be a word that is not an adjective
- Must be a word that is not an adverb
- Must be a word that is not a determiner
- Must be a word that is not a particle
- Must be a word that is not a numeral
- Output only one keyword
Here is an example of the final output
Summary: Summary of The video
Keyword: keyword
Don't output any other text outside of the given format
Helpful Answer:"""
    reduce_prompt = PromptTemplate.from_template(reduce_template)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )
    # Combines and iteratively reduces the mapped documents.
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`.
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=4000,
    )

    map_chain = LLMChain(llm=llm, prompt=map_prompt)
    # Map a chain over the documents, then combine the results.
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in.
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    result = map_reduce_chain.run(split_docs)
    print(result)

    # The result is Korean text: write it as UTF-8 explicitly (the
    # original relied on the platform default encoding), and make sure
    # temp/ exists first.
    os.makedirs("temp", exist_ok=True)
    with open("temp/result.txt", "w", encoding="utf-8") as f:
        f.write(result)
    return result
def text_to_arr(result):
    """Extract the single keyword from the LLM summary output.

    Tries the English "Keyword:" label first, then the Korean label.

    Args:
        result: raw LLM output from youtube_sum().

    Returns:
        The keyword string.

    Raises:
        ValueError: if neither label is present (the original crashed with
            AttributeError on `None.group(1)` in that case).
    """
    match = re.search(r"Keyword:\s*(\w+)", result)
    if match is None:
        # Fallback: Korean label variant of the expected output format.
        match = re.search(r"ํค์๋:\s*(\w+)", result)
    if match is None:
        raise ValueError("No keyword found in summary output")
    keyword = match.group(1)
    print("Keyword:", keyword)
    return keyword
def aladin_api(keyword, selected_option):
    """Search the Aladin book API for the keyword within the selected
    category.

    Args:
        keyword: search keyword extracted from the video summary.
        selected_option: category label chosen in the UI dropdown.

    Returns:
        JSON string: a one-element list with the API response, or an empty
        list when the category label is unknown (matches the original
        fall-through behavior).
    """
    # NOTE(review): API key is hard-coded; should be moved to an
    # environment variable / Space secret.
    aladin_key = 'ttbkangmj08250027001'
    # Category label -> Aladin CategoryId. The original duplicated the
    # whole request in four branches differing only in this number.
    category_ids = {"์ฌํ": 798, "๊ณผํ": 987, "์์ค": 1, "๊ธ์ต": 170}
    all_data = []

    category_id = category_ids.get(selected_option)
    if category_id is not None:
        print(keyword)
        url = (
            f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}"
            f"&Query={keyword}&QueryType=Keyword&Cover=Big&MaxResults=5"
            "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101"
            f"&CategoryId={category_id}&outofStockFilter=1"
        )
        response = requests.get(url)
        all_data.append(json.loads(response.text))

    all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
    # Make sure the cache directory exists before writing.
    os.makedirs("temp", exist_ok=True)
    with open("temp/book.json", "wb") as f:
        f.write(all_data.encode("utf-8"))
    print(type(all_data))
    print(all_data)
    return all_data
def book_output(book_json):
    """Turn the Aladin API JSON into (title, cover image, link) values for
    the first three results, padding missing slots with placeholders.

    Args:
        book_json: JSON string returned by aladin_api().

    Returns:
        title1, image1, title2, image2, title3, image3, link1, link2, link3
    """
    data = json.loads(book_json)

    def _slot(index):
        # One result slot. The original indexed data[0]['item'][index]
        # unconditionally and crashed with IndexError when the API
        # returned fewer than three books; any missing level now falls
        # back to the "No Data" placeholder instead.
        try:
            item = data[0]['item'][index]
            title = item['title']
            link = item['link']
            cover = requests.get(item['cover'])
            image = Image.open(BytesIO(cover.content))
        except (IndexError, KeyError):
            title = "No Data"
            link = "No Data"
            image = Image.open("NO DATA.jpeg")
        return title, image, link

    title1, image1, book_link1 = _slot(0)
    title2, image2, book_link2 = _slot(1)
    title3, image3, book_link3 = _slot(2)
    return title1, image1, title2, image2, title3, image3, book_link1, book_link2, book_link3
def get_title(API_KEY, link, selected_option):
    """Full pipeline behind the button: transcribe, summarize, extract a
    keyword, query Aladin, and unpack the top three book results."""
    chunked_docs, transcript = youtube_text(link)
    summary = youtube_sum(chunked_docs, transcript, API_KEY)
    keyword = text_to_arr(summary)
    books_json = aladin_api(keyword, selected_option)
    (title1, image1, title2, image2, title3, image3,
     link1, link2, link3) = book_output(books_json)
    return (summary, title1, image1, title2, image2, title3, image3,
            link1, link2, link3)
# Define the list of options for the Dropdown (book categories passed to
# aladin_api as-is).
options_list = ["์ฌํ", "๊ณผํ", "์์ค", "๊ธ์ต"]

# Gradio UI: key/link/category inputs on top, then summary, three book
# titles, three cover images, and three link slots.
with gr.Blocks() as demo:
    gr.Markdown("Paste your Youtube Link and get the book recommandation")
    with gr.Column():
        with gr.Row():
            inp1 = gr.Textbox(label="Your OpenAI KEY")
            inp2 = gr.Textbox(label="Input Link")
        inp3 = gr.Dropdown(choices=options_list, label="Select a category")
        btn = gr.Button("Find the book")
    with gr.Column():
        out1 = gr.Textbox(label="Summary")
        with gr.Row():
            out2 = gr.Textbox(label="Title1")
            out4 = gr.Textbox(label="Title2")
            out6 = gr.Textbox(label="Title3")
        with gr.Row():
            out3 = gr.Image(label="Image1")
            out5 = gr.Image(label="Image2")
            out7 = gr.Image(label="Image3")
        with gr.Row():
            out8 = gr.HTML(label="Book Link1")
            out9 = gr.HTML(label="Book Link2")
            out10 = gr.HTML(label="Book Link3")
    # Output order must match get_title()'s 10-tuple return.
    btn.click(fn=get_title, inputs=[inp1, inp2, inp3],
              outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10])
demo.launch(share=True)