| import openai | |
| import gradio as gr | |
| from gradio.components import Audio, Textbox | |
| import os | |
| import re | |
| import tiktoken | |
| from transformers import GPT2Tokenizer | |
| import whisper | |
| import pandas as pd | |
| from datetime import datetime, timezone, timedelta | |
| import notion_df | |
| import concurrent.futures | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
| nltk.download('punkt') | |
| import spacy | |
| from spacy import displacy | |
| from gradio import Markdown | |
| import threading | |
# GPT-2 tokenizer; transcribe() uses it to count and truncate tokens.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Chained assignment: configures the OpenAI client's API key. As a side effect
# `model` is bound to the same key string (it is NOT a model object); the name
# is kept so that any existing reference to it keeps working.
# Raises KeyError at import time if OPENAI_API_KEY is not set.
model = openai.api_key = os.environ["OPENAI_API_KEY"]

# System prompt text and the message that seeds every conversation.
initialt = 'You are a Tutor. Respond with ALWAYS layered "bullet points" (listing rather than sentences) to all input with a fun mneumonics to memorize that list. But you can answer up to 1200 words if the user requests longer response.'
initial_message = {"role": "system", "content": initialt}

# `messages` is the history sent to the chat API (oldest first);
# `messages_rev` is the newest-first copy used for the saved transcript.
messages = [initial_message]
messages_rev = [initial_message]

# Number of answers produced since the last history reset.
answer_count = 0

# Notion API key (read from the API_KEY environment variable; KeyError if unset).
API_KEY = os.environ["API_KEY"]

# spaCy English pipeline used by colorize_text().
nlp = spacy.load("en_core_web_sm")
def process_nlp(system_message):
    """Return the colorized HTML for a chat message.

    Args:
        system_message: Mapping with a ``'content'`` key holding the text.

    Returns:
        The HTML string produced by ``colorize_text`` for that content.
    """
    return colorize_text(system_message['content'])
| from colour import Color | |
| # # define color combinations for different parts of speech | |
| # COLORS = { | |
| # "NOUN": "#000000", # Black | |
| # "VERB": "#ff6936", # Orange | |
| # "ADJ": "#4363d8", # Blue | |
| # "ADV": "#228b22", # Green | |
| # "digit": "#9a45d6", # Purple | |
| # "punct": "#ffcc00", # Yellow | |
| # "quote": "#b300b3" # Magenta | |
| # } | |
| # # define color combinations for individuals with dyslexia and color vision deficiencies | |
| # DYSLEXIA_COLORS = { | |
| # "NOUN": "#000000", | |
| # "VERB": "#ff6936", | |
| # "ADJ": "#4363d8", | |
| # "ADV": "#228b22", | |
| # "digit": "#9a45d6", | |
| # "punct": "#ffcc00", | |
| # "quote": "#b300b3", | |
| # } | |
| # RED_GREEN_COLORS = { | |
| # "NOUN": "#000000", | |
| # "VERB": "#fe642e", # Lighter orange | |
| # "ADJ": "#2e86c1", # Lighter blue | |
| # "ADV": "#82e0aa", # Lighter green | |
| # "digit": "#aa6c39", # Brown | |
| # "punct": "#f0b27a", # Lighter yellow | |
| # "quote": "#9932cc" # Darker magenta | |
| # } | |
| # # define a muted background color | |
| # BACKGROUND_COLOR = "#ffffff" # White | |
| # # define font and size | |
| # FONT = "OpenDyslexic" | |
| # FONT_SIZE = "18px" | |
| # def colorize_text(text, colors=DYSLEXIA_COLORS, background_color=None, font=FONT, font_size=FONT_SIZE): | |
| # if colors is None: | |
| # colors = COLORS | |
| # colorized_text = "" | |
| # lines = text.split("\n") | |
| # # set background color | |
| # if background_color is None: | |
| # background_color = BACKGROUND_COLOR | |
| # # iterate over the lines in the text | |
| # for line in lines: | |
| # # parse the line with the language model | |
| # doc = nlp(line) | |
| # # iterate over the tokens in the line | |
| # for token in doc: | |
| # # check if the token is an entity | |
| # if token.ent_type_: | |
| # # use dyslexia colors for entity if available | |
| # if colors == COLORS: | |
| # color = DYSLEXIA_COLORS.get(token.pos_, None) | |
| # else: | |
| # color = colors.get(token.pos_, None) | |
| # # check if a color is available for the token | |
| # if color is not None: | |
| # colorized_text += ( | |
| # f'<span style="color: {color}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {font}; ' | |
| # f'font-size: {font_size}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' | |
| # f"{token.text}</span>" | |
| # ) | |
| # else: | |
| # colorized_text += ( | |
| # f'<span style="font-family: {font}; ' | |
| # f'font-size: {font_size}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' | |
| # f"{token.text}</span>" | |
| # ) | |
| # else: | |
| # # check if a color is available for the token | |
| # color = colors.get(token.pos_, None) | |
| # if color is not None: | |
| # colorized_text += ( | |
| # f'<span style="color: {color}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {font}; ' | |
| # f'font-size: {font_size}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' | |
| # f"{token.text}</span>" | |
| # ) | |
| # elif token.is_digit: | |
| # colorized_text += ( | |
| # f'<span style="color: {colors["digit"]}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {font}; ' | |
| # f'font-size: {font_size}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' | |
| # f"{token.text}</span>" | |
| # ) | |
| # elif token.is_punct: | |
| # colorized_text += ( | |
| # f'<span style="color: {colors["punct"]}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {font}; ' | |
| # f'font-size: {font_size}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' | |
| # f"{token.text}</span>" | |
| # ) | |
| # elif token.is_quote: | |
| # colorized_text += ( | |
| # f'<span style="color: {colors["quote"]}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {font}; ' | |
| # f'font-size: {font_size}; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' | |
| # f"{token.text}</span>" | |
| # ) | |
| # else: | |
| # # use larger font size for specific parts of speech, such as nouns and verbs | |
| # font_size = FONT_SIZE | |
| # if token.pos_ in ["NOUN", "VERB"]: | |
| # font_size = "22px" | |
| # colorized_text += ( | |
| # f'<span style="font-family: {font}; ' | |
| # f'font-size: {font_size}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' | |
| # f"{token.text}</span>" | |
| # ) | |
| # colorized_text += "<br>" | |
| # return colorized_text | |
| # # define color combinations for different parts of speech | |
| # COLORS = { | |
| # "NOUN": "#5e5e5e", # Dark gray | |
| # "VERB": "#ff6936", # Orange | |
| # "ADJ": "#4363d8", # Blue | |
| # "ADV": "#228b22", # Green | |
| # "digit": "#9a45d6", # Purple | |
| # "punct": "#ffcc00", # Yellow | |
| # "quote": "#b300b3" # Magenta | |
| # } | |
| # # define color combinations for individuals with dyslexia | |
| # DYSLEXIA_COLORS = { | |
| # "NOUN": "#5e5e5e", | |
| # "VERB": "#ff6936", | |
| # "ADJ": "#4363d8", | |
| # "ADV": "#228b22", | |
| # "digit": "#9a45d6", | |
| # "punct": "#ffcc00", | |
| # "quote": "#b300b3" | |
| # } | |
| # # define a muted background color | |
| # BACKGROUND_COLOR = "#f5f5f5" # Light gray | |
| # # define font and size | |
| # FONT = "Arial" | |
| # FONT_SIZE = "14px" | |
| # # load the English language model | |
| # nlp = spacy.load('en_core_web_sm') | |
| # def colorize_text(text, colors=DYSLEXIA_COLORS, background_color=None): | |
| # if colors is None: | |
| # colors = COLORS | |
| # colorized_text = "" | |
| # lines = text.split("\n") | |
| # # set background color | |
| # if background_color is None: | |
| # background_color = BACKGROUND_COLOR | |
| # # iterate over the lines in the text | |
| # for line in lines: | |
| # # parse the line with the language model | |
| # doc = nlp(line) | |
| # # iterate over the tokens in the line | |
| # for token in doc: | |
| # # check if the token is an entity | |
| # if token.ent_type_: | |
| # # use dyslexia colors for entity if available | |
| # if colors == COLORS: | |
| # color = DYSLEXIA_COLORS.get(token.pos_, None) | |
| # else: | |
| # color = colors.get(token.pos_, None) | |
| # # check if a color is available for the token | |
| # if color is not None: | |
| # colorized_text += ( | |
| # f'<span style="color: {color}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {FONT}; ' | |
| # f'font-size: {FONT_SIZE}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' # Add space between tokens | |
| # f"{token.text}</span>" | |
| # ) | |
| # else: | |
| # colorized_text += ( | |
| # f'<span style="font-family: {FONT}; ' | |
| # f'font-size: {FONT_SIZE}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' # Add space between tokens | |
| # f"{token.text}</span>" | |
| # ) | |
| # else: | |
| # # check if a color is available for the token | |
| # color = colors.get(token.pos_, None) | |
| # if color is not None: | |
| # colorized_text += ( | |
| # f'<span style="color: {color}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {FONT}; ' | |
| # f'font-size: {FONT_SIZE}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' # Add space between tokens | |
| # f"{token.text}</span>" | |
| # ) | |
| # elif token.is_digit: | |
| # colorized_text += ( | |
| # f'<span style="color: {colors["digit"]}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {FONT}; ' | |
| # f'font-size: {FONT_SIZE}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' # Add space between tokens | |
| # f"{token.text}</span>" | |
| # ) | |
| # elif token.is_punct: | |
| # colorized_text += ( | |
| # f'<span style="color: {colors["punct"]}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {FONT}; ' | |
| # f'font-size: {FONT_SIZE}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' # Add space between tokens | |
| # f"{token.text}</span>" | |
| # ) | |
| # elif token.is_quote: | |
| # colorized_text += ( | |
| # f'<span style="color: {colors["quote"]}; ' | |
| # f'background-color: {background_color}; ' | |
| # f'font-family: {FONT}; ' | |
| # f'font-size: {FONT_SIZE}; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' # Add space between tokens | |
| # f"{token.text}</span>" | |
| # ) | |
| # else: | |
| # colorized_text += ( | |
| # f'<span style="font-family: {FONT}; ' | |
| # f'font-size: {FONT_SIZE}; ' | |
| # f'font-weight: bold; ' | |
| # f'text-decoration: none; ' | |
| # f'padding-right: 0.5em;">' # Add space between tokens | |
| # f"{token.text}</span>" | |
| # ) | |
| # colorized_text += "<br>" | |
| # return colorized_text | |
# Default colors, keyed by spaCy coarse part-of-speech tag ("NOUN", "VERB",
# "ADJ", "ADV") plus three token-kind fallbacks ("digit", "punct", "quote").
COLORS = {
    "NOUN": "#FF3300",
    "VERB": "#008000",
    "ADJ": "#1E90FF",
    "ADV": "#FF8C00",
    "digit": "#FF1493",
    "punct": "#8B0000",
    "quote": "#800080",
}

# Alternative palette intended for readers with dyslexia; same keys as COLORS.
DYSLEXIA_COLORS = {
    "NOUN": "#1E90FF",
    "VERB": "#006400",
    "ADJ": "#00CED1",
    "ADV": "#FF8C00",
    "digit": "#FF1493",
    "punct": "#A0522D",
    "quote": "#800080",
}

# Muted background applied behind colored tokens.
BACKGROUND_COLOR = "#EAEAEA"

# Font family and size applied to every token.
FONT = "Georgia"
FONT_SIZE = "18px"
def colorize_text(text, colors=None, background_color=None):
    """Render *text* as HTML with one styled ``<span>`` per spaCy token.

    Each input line is parsed with the module-level ``nlp`` pipeline. Every
    token becomes an underlined span in ``FONT`` / ``FONT_SIZE``, followed by
    a single space; every input line is terminated with ``<br>``.

    Color selection per token:
      * Named-entity tokens look up their part of speech in
        ``DYSLEXIA_COLORS`` when the active palette equals ``COLORS``,
        otherwise in the supplied palette.
      * Other tokens look up their part of speech in the palette, then fall
        back to the ``"digit"``, ``"punct"`` and ``"quote"`` entries
        (checked in that order).
      * Tokens with no color get the font styling only (no color/background).

    Args:
        text: The string to colorize; split on ``"\\n"``.
        colors: Optional palette mapping; defaults to ``COLORS``. A palette
            missing a fallback key simply leaves those tokens uncolored.
        background_color: Optional CSS color; defaults to ``BACKGROUND_COLOR``.

    Returns:
        The generated HTML as a single string.
    """
    # Function-scope import: token text is escaped before being embedded in
    # markup so that "<", ">" or "&" in the input cannot break or inject HTML.
    import html

    if colors is None:
        colors = COLORS
    if background_color is None:
        background_color = BACKGROUND_COLOR

    # Styling shared by every span, colored or not.
    font_style = (
        f'font-family: {FONT}; '
        f'font-size: {FONT_SIZE}; '
        f'text-decoration: underline;'
    )

    def token_color(token):
        """Return the CSS color for *token*, or None when it has none."""
        if token.ent_type_:
            palette = DYSLEXIA_COLORS if colors == COLORS else colors
            return palette.get(token.pos_)
        color = colors.get(token.pos_)
        if color is not None:
            return color
        if token.is_digit:
            return colors.get("digit")
        if token.is_punct:
            return colors.get("punct")
        if token.is_quote:
            return colors.get("quote")
        return None

    # Collect fragments and join once instead of repeated string +=.
    parts = []
    for line in text.split("\n"):
        for token in nlp(line):
            color = token_color(token)
            if color is None:
                style = font_style
            else:
                style = (
                    f'color: {color}; '
                    f'background-color: {background_color}; '
                    f'{font_style}'
                )
            parts.append(
                f'<span style="{style}">{html.escape(token.text)}</span> '
            )
        parts.append("<br>")
    return "".join(parts)
def colorize_and_update(system_message, submit_update):
    """Colorize a message and push the HTML through *submit_update*.

    Args:
        system_message: Mapping with a ``'content'`` key holding the text.
        submit_update: Callable taking ``(text_output, html_output)``; it is
            called with ``None`` for the text and the colorized HTML second.
    """
    colorized_system_message = colorize_text(system_message['content'])
    submit_update(None, colorized_system_message)
def update_text_output(system_message, submit_update):
    """Push a message's plain text through *submit_update*.

    Args:
        system_message: Mapping with a ``'content'`` key holding the text.
        submit_update: Callable taking ``(text_output, html_output)``; it is
            called with the message content first and ``None`` for the HTML.
    """
    submit_update(system_message['content'], None)
def train(text):
    """Upload *text* to the Notion page as a one-cell table.

    The upload is titled with the current time formatted as
    ``MM-DD-YY HH:MM``.

    Args:
        text: The content to store; wrapped in a single-row DataFrame.
    """
    # NOTE: fixed UTC-4 offset; it does not follow daylight-saving changes.
    now_et = datetime.now(timezone(timedelta(hours=-4)))
    published_date = now_et.strftime('%m-%d-%y %H:%M')
    df = pd.DataFrame([text])
    notion_df.upload(
        df,
        'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4',
        title=str(published_date),
        api_key=API_KEY,
    )
# Limits used by transcribe().
_MAX_INPUT_TOKENS = 1440    # tokens kept from one typed message
_MAX_HISTORY_TOKENS = 2096  # history size that triggers archive-and-reset
_NOTION_PAGE_URL = 'https://www.notion.so/US-62e861a0b35f43da8ef9a7789512b8c2?pvs=4'


def _truncate_to_token_limit(text, limit=_MAX_INPUT_TOKENS):
    """Return *text* cut down to at most *limit* GPT-2 tokens."""
    kept = []
    for index, sentence in enumerate(re.split(r"(?<=[.!?]) +", text)):
        # re.split drops the spaces between sentences; put one back so the
        # decoded text does not glue sentences together.
        piece = sentence if index == 0 else " " + sentence
        tokens = tokenizer.encode(piece)
        room = limit - len(kept)
        if len(tokens) < room:
            kept.extend(tokens)
        else:
            # This sentence reaches the limit: keep what fits and stop.
            kept.extend(tokens[:room])
            break
    return tokenizer.decode(kept)


def _timestamp():
    """Return the current time (fixed UTC-4 offset) as ``MM-DD-YY HH:MM``."""
    return datetime.now(timezone(timedelta(hours=-4))).strftime('%m-%d-%y %H:%M')


def _upload_to_notion(content, title):
    """Upload *content* to the Notion page as a one-cell table named *title*."""
    notion_df.upload(pd.DataFrame([content]), _NOTION_PAGE_URL, title=title, api_key=API_KEY)


def _format_history(history, num_tokens):
    """Format the non-system messages of *history* plus a token-count footer."""
    body = "\n\n".join(
        f"[ANSWER {answer_count}]{message['role']}: {message['content']}"
        for message in history
        if message['role'] != 'system'
    )
    return body + f"\n\nNumber of tokens used: {num_tokens}\n\n"


def transcribe(audio, text, submit_update=None):
    """Handle one user turn: build the prompt, query the model, log the chat.

    If the first word of *text* is ``COLORIZE`` (case-insensitive), the text
    is uploaded via ``train`` and returned colorized without calling the
    chat model. Otherwise any audio is transcribed, the typed text is
    truncated to the input token budget, the combined message is appended to
    the global history, and a chat completion is requested. When the history
    exceeds the history token budget it is archived to Notion and reset
    before the request. The newest-first transcript is appended to
    ``conversation_history.txt`` and uploaded to Notion.

    Args:
        audio: Path to a recorded audio file, or None.
        text: Typed message, or None.
        submit_update: Optional callable ``(text_output, html_output)`` used
            to push intermediate results; skipped when falsy.

    Returns:
        A ``(plain_text, html)`` tuple: the input text and its colorized form
        for the COLORIZE shortcut, otherwise the model reply and its
        colorized form.
    """
    global messages
    global answer_count

    # COLORIZE shortcut: store and colorize the note, no model call.
    if text and text.split("\n")[0].split(" ")[0].strip().upper() == "COLORIZE":
        train(text)
        return text, colorize_text(text)

    # Transcribe the audio if provided; the file is closed once uploaded.
    spoken_text = ''
    if audio is not None:
        with open(audio, "rb") as audio_file:
            spoken_text = openai.Audio.transcribe("whisper-1", audio_file, language="en")["text"]

    # Always a string, so the concatenations below cannot fail on None input.
    typed_text = _truncate_to_token_limit(text) if text is not None else ''

    user_content = spoken_text + typed_text
    messages.append({"role": "user", "content": user_content})

    num_tokens = sum(len(tokenizer.encode(message["content"])) for message in messages)
    if num_tokens > _MAX_HISTORY_TOKENS:
        # Archive the full history, then restart from the system prompt.
        _upload_to_notion(_format_history(messages, num_tokens), str(_timestamp() + 'FULL'))
        messages = [initial_message]
        messages.append({"role": "user", "content": initialt})
        answer_count = 0
        # Re-add the complete current turn (spoken + typed) so that an audio
        # question is not lost by the reset.
        messages.append({"role": "user", "content": user_content})
    else:
        answer_count += 1

    system_message = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=2000
    )["choices"][0]["message"]

    if submit_update:
        update_text_output(system_message, submit_update)

    messages.append(system_message)
    # messages_rev is kept newest-first: the reply, then the turn that caused it.
    messages_rev.insert(0, system_message)
    messages_rev.insert(0, {"role": "user", "content": typed_text + spoken_text})

    # Colorization is slow; push it from a separate thread when a callback exists.
    if submit_update:
        colorize_thread = threading.Thread(target=colorize_and_update, args=(system_message, submit_update))
        colorize_thread.start()

    chat_transcript = _format_history(messages_rev, num_tokens)

    # Explicit encoding: replies may contain non-ASCII characters.
    with open("conversation_history.txt", "a", encoding="utf-8") as f:
        f.write(chat_transcript)

    _upload_to_notion(chat_transcript, str(_timestamp()))

    return system_message['content'], colorize_text(system_message['content'])
# Input components: microphone recording (delivered as a file path) and a text box.
audio_input = Audio(source="microphone", type="filepath", label="Record your message")
text_input = Textbox(label="Type your message", max_length=4096)

# Output components: the plain reply and its colorized HTML rendering.
output_text = Textbox(label="Text Output")
output_html = Markdown()
# Not wired into the interface below; kept so the module-level name still exists.
output_audio = Audio()

# Gradio interface: transcribe(audio, text) -> (plain text, HTML).
iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, text_input],
    outputs=[output_text, output_html],
    title="Hold On, Pain Ends (HOPE)",
    description="Talk to Your USMLE Tutor HOPE. \n If you want to colorize your note, type COLORIZE in the first line of your input.",
    theme="compact",
    layout="vertical",
    allow_flagging=False
)

# Start the web app as soon as the module is executed.
iface.launch()