import json
import time

import openai
import spacy
from tqdm import tqdm

from .utils import get_num_tokens, parse_prompt, num_tokens_from_messages, clean_slides, slide_generation_ver2, generate_latex_slide

nlp = spacy.load('en_core_web_sm')


def set_openai_api_key(key: str):
    openai.api_key = key  # use the provided key, not the literal string 'key'
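

# A minimal retry sketch (an addition, not part of the original flow): backs
# off and retries when the API rate-limits, instead of relying only on the
# fixed time.sleep() pauses between requests that the code below uses.
def _chat_with_backoff(max_retries: int = 3, **kwargs):
    for attempt in range(max_retries):
        try:
            return openai.ChatCompletion.create(**kwargs)
        except openai.error.RateLimitError:
            time.sleep(2 ** attempt * 10)  # back off: 10s, 20s, 40s
    raise RuntimeError("OpenAI API kept rate-limiting after retries")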


def generate_slide(json_pth: str):
    model_list = [model['id'] for model in openai.Model.list()['data']]
    gpt4_id = 'gpt-4-0314'
    gpt3_id = 'gpt-3.5-turbo-0301'

    with open(json_pth) as f:
        data = json.load(f)

    title = data['title']
    abstract = data['abstract']
    paper_length = len(data['text'])
    # stitch each section's sentences back together, clipping the end index so
    # it never runs past the end of the paper body
    sections = [
        [
            head['section'],
            ' '.join(
                data['text'][idx]['string']
                for idx in range(head['start'], min(head['end'] + 1, paper_length))
            ),
        ]
        for head in data['headers']
    ]
    figures = [fig['caption'] for fig in data['figures']]  # collected but not used below
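    # Expected JSON layout (inferred from the accesses above, not a documented
    # schema): 'title' and 'abstract' are strings, 'text' is a list of
    # {'string': ...} sentence records, 'headers' is a list of
    # {'section': ..., 'start': ..., 'end': ...} index ranges into 'text', and
    # 'figures' is a list of {'caption': ...} entries.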

    # ! Split long sections into chunks of at most token_limit tokens
    new_sections = []
    toc = ""
    token_limit = 1400
    for section in sections:
        section_title = section[0]
        curr_count = get_num_tokens(section[1])
        toc += section_title + "; "
        if curr_count > token_limit:
            # split the section into sentences and greedily pack them into chunks
            sents = nlp(section[1]).sents
            temp_list = []
            for sent in sents:
                if not temp_list:
                    temp_list.append(sent.text)
                    continue
                curr_count = get_num_tokens(temp_list[-1])
                if curr_count + get_num_tokens(sent.text) < token_limit:
                    temp_list[-1] += " " + sent.text  # keep a space between joined sentences
                else:
                    temp_list.append(sent.text)
            for i in range(len(temp_list)):
                if i == 0:
                    new_sections.append([section_title, temp_list[i]])
                else:
                    new_sections.append([section_title + " (cont.)", temp_list[i]])
        else:
            new_sections.append(section)
    print(f"Total number of sections: {len(new_sections)}")

    # ! Build the initial message: paper metadata plus the first section
    initial_user_message = "Title: " + title + "\nTable of Contents: " + toc + "\nAbstract: " + abstract
    initial_section_title = new_sections[0][0]
    initial_section_content = new_sections[0][1]

    # ! Initial dialogue: generates slides for the first section of the paper
    res = []
    prompt_data = [initial_user_message, initial_section_title, initial_section_content]
    messages = parse_prompt("./dialogue_1.txt", prompt_data)
    token_length = num_tokens_from_messages(messages)
    assert token_length < 2400, f"Message is too long: {token_length}"
    response = openai.ChatCompletion.create(
        model=gpt3_id,
        messages=messages,
        temperature=0.5,
    )
    answer = response["choices"][0]["message"]["content"]
    res.append(answer)
    time.sleep(10)  # crude pause to stay under the API rate limit
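    # (a helper like _chat_with_backoff(model=gpt3_id, messages=messages,
    # temperature=0.5) could replace the create()+sleep pattern here and in
    # the loops below; see the sketch near the top of the file)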

    # ! Following dialogue: generates slides for the remaining sections
    for i, (section_title, section_content) in enumerate(new_sections[1:]):
        print(f"Section {i + 1}: {section_title} is being processed...")
        prompt_data = [section_content]
        messages = parse_prompt("./dialogue_2.txt", prompt_data)
        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"
        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        answer = response["choices"][0]["message"]["content"]
        res.append(answer)
        del messages, token_length, response, answer
        time.sleep(10)  # sleep for 10 seconds to avoid the API rate limit

    # ! Clean slides of comments, empty lines, and other artifacts
    for i in range(len(res)):
        res[i] = clean_slides(res[i])

    # ! Iteratively merge slide chunks until the draft is small enough
    temp_res = res
    prev_cnt = len(temp_res)
    while len(temp_res) > 1:
        temp_num_tokens = get_num_tokens("\n".join(temp_res))
        temp_res = slide_generation_ver2(temp_res, 1800)
        print(f"Merged into {len(temp_res)} chunks; token count before merging was {temp_num_tokens}")
        # stop if the number of chunks did not change
        if len(temp_res) == prev_cnt:
            break
        prev_cnt = len(temp_res)
        # stop if the whole draft already fits in 4000 tokens
        if temp_num_tokens <= 4000:
            break
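    # slide_generation_ver2 is assumed to merge adjacent slide chunks under
    # the given token budget (1800 here); the loop above stops once merging
    # makes no further progress or the combined draft fits in ~4000 tokens.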

    # ! Condense each merged chunk with a rewriting pass
    new_res = []
    for i in tqdm(range(len(temp_res))):
        prompt_data = [temp_res[i]]
        messages = parse_prompt("./dialogue_3.txt", prompt_data)
        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"
        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        temp = response["choices"][0]["message"]["content"]
        temp = clean_slides(temp)
        new_res.append(temp)
        time.sleep(5)  # needed to avoid the API rate limit
    temp_res = new_res
    time.sleep(10)  # needed to avoid the API rate limit

    # ! Final refinement: polish the full draft in one pass, preferring GPT-4 when available
    final_draft = "\n".join(temp_res)
    prompt_data = [final_draft]
    messages = parse_prompt("./dialogue_4.txt", prompt_data)
    print(num_tokens_from_messages(messages))
    response = openai.ChatCompletion.create(
        model=gpt4_id if gpt4_id in model_list else gpt3_id,
        messages=messages,
        temperature=0.5,
    )
    temp = response["choices"][0]["message"]["content"]
    # generate_latex_slide(temp, "test.tex")
    return temp
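

# Example usage (a sketch; the environment variable and file names below are
# illustrative, not part of the original module):
#
#     import os
#     set_openai_api_key(os.environ["OPENAI_API_KEY"])
#     slides = generate_slide("paper.json")
#     generate_latex_slide(slides, "slides.tex")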