Spaces:

svijayanand
/

Podcast_Oracle

Build error

App Files Files Community

Vijayanand Sankarasubramanian committed on Jun 2, 2024

Commit

240ad82

1 Parent(s): 0dbead4

updated UI to get constants

Browse files

Files changed (9) hide show

.gitignore +1 -2
app.py +73 -32
helpers/import_envs.py +1 -4
helpers/model_utils.py +34 -19
helpers/utils.py +3 -3
{helpers → tools}/__init__.py +0 -0
answer_bot.py → tools/answer_bot.py +6 -6
aspect_and_sentiment_extraction.py → tools/aspect_and_sentiment_extraction.py +4 -5
{helpers → tools}/summarize.py +39 -9

.gitignore CHANGED Viewed

@@ -178,7 +178,6 @@ cython_debug/
 #downloaded deb files
 *.deb
-#cached embeddings
 cache
 flagged

 #downloaded deb files
 *.deb
 cache
 flagged
+*.rtf

app.py CHANGED Viewed

@@ -1,67 +1,108 @@
 import gradio as gr
-from helpers.utils import load_rtf_document_and_chunk
-from helpers.summarize import summarize_with_map_reduce
-from helpers.model_utils import get_model
-from aspect_and_sentiment_extraction import extract_aspects_and_sentiment
-from answer_bot import answer_question
-transcript_file_name = None
-def summarize(transcript_file_name):
-    chunked_docs = load_rtf_document_and_chunk(transcript_file_name)
-    llm = get_model("OPENAI")
-    return summarize_with_map_reduce(chunked_docs, llm)
-def extract_aspects(transcript_file_name):
-    # Implement your aspect extraction and sentiment analysis logic here
-    return extract_aspects_and_sentiment(transcript_file_name)
-def get_answer_for(user_question):
     if transcript_file_name is None:
         return "No Transcript Uploaded, Upload RTF File First", ""
     # Answer the user's question using the question-answering model
     if user_question.strip():  # Ensure there is a question provided
-        answer_text = answer_question(question=user_question)
     else:
         answer_text = "No question asked."
-    return answer_text.lstrip()
-def process_transcript(uploaded_file):
     if transcript_file_name is None:
         return "No Transcript Uploaded, Upload RTF File First", ""
     # Summarize the content
-    summary = summarize(transcript_file_name=transcript_file_name).lstrip()
     # Aspect-Based Sentiment Analysis
-    sentiment = extract_aspects(transcript_file_name=transcript_file_name).lstrip()
-    return summary, sentiment
-def setup_rtf_file_handle(uploaded_file):
     transcript_file_name = uploaded_file.name
-    print(f"Transcript File Name :{transcript_file_name}")
 with gr.Blocks() as demo:
     with gr.Group("Upload RTF File"):
         rtf_file = gr.File(label="Podcast Transcript RTF file")
         submit_button = gr.Button("Upload File")
-        submit_button.click(setup_rtf_file_handle)
-    with gr.Group("Aspects and Sentiment of Podcast"):
         summary = gr.Textbox(label="Summary of Podcast")
         sentiment = gr.Textbox(label="Aspect Based Sentiments")
         submit_button = gr.Button("Generate Aspects and Summary")
-        submit_button.click(process_transcript, inputs=rtf_file, outputs=[summary, sentiment])
     with gr.Group("Question/Answer"):
         gr.Markdown("Question/Answer")
         question = gr.Textbox(label="Question")
         answer = gr.Textbox(label="Answer")
         answer_button = gr.Button("Answer Question")
-        answer_button.click(get_answer_for, inputs= question, outputs=answer)
 demo.launch()

 import gradio as gr
+from helpers.model_utils import GPT3, GPT4, LLAMA3, ANTHROPIC, set_question_answer_llm, set_sentiment_analysis_llm, set_summarization_llm
+from tools.summarize import MAPREDUCE, STUFF, summarize_podcast
+from tools.answer_bot import answer_question
+from tools.aspect_and_sentiment_extraction import extract_aspects_and_sentiment
+def get_answer_for(user_question, transcript_file_name, llm_choice):
+    """Answer a question about the uploaded transcript.
+
+    Every path returns the 3-tuple ``(answer_text, transcript_file_name, llm_choice)``
+    so the result always matches the three outputs wired to the "Answer Question"
+    button. The state values are passed through unchanged.
+    """
     if transcript_file_name is None:
+        return "No Transcript Uploaded, Upload RTF File First", transcript_file_name, llm_choice
+    if user_question is None:
+        return "Question Not Given", transcript_file_name, llm_choice
     # Answer the user's question using the question-answering model
     if user_question.strip():  # Ensure there is a question provided
+        answer_text = answer_question(question=user_question, transcript_file_name=transcript_file_name, llm_choice=llm_choice)
     else:
         answer_text = "No question asked."
+    return answer_text.lstrip(), transcript_file_name, llm_choice
+def summarize(uploaded_file, transcript_file_name, summarization_method, llm_choice):
+    """Summarize the uploaded transcript.
+
+    Every path returns the 4-tuple
+    ``(summary_text, transcript_file_name, summarization_method, llm_choice)`` so the
+    result always matches the four outputs wired to the "Generate Summary" button.
+    ``uploaded_file`` is accepted because the button passes it, but it is not used.
+    """
     if transcript_file_name is None:
+        return "No Transcript Uploaded, Upload RTF File First", transcript_file_name, summarization_method, llm_choice
     # Summarize the content
+    summary = summarize_podcast(transcript_file_name=transcript_file_name, summarization_method=summarization_method, llm_choice=llm_choice).lstrip()
+    return summary, transcript_file_name, summarization_method, llm_choice
+def generate_aspects_and_sentiments(uploaded_file, transcript_file_name, llm_choice):
+    """Run aspect-based sentiment analysis on the uploaded transcript.
+
+    Every path returns the 3-tuple ``(sentiment_text, transcript_file_name, llm_choice)``
+    so the result always matches the three outputs wired to the button.
+    ``uploaded_file`` is accepted because the button passes it, but it is not used.
+    """
+    if transcript_file_name is None:
+        return "No Transcript Uploaded, Upload RTF File First", transcript_file_name, llm_choice
     # Aspect-Based Sentiment Analysis
+    sentiment = extract_aspects_and_sentiment(transcript_file_name=transcript_file_name, llm_choice=llm_choice).lstrip()
+    return sentiment, transcript_file_name, llm_choice
+def setup_rtf_file_handle(uploaded_file, transcript_file_name):
+    """Return the uploaded file's ``name``, or None when nothing was uploaded.
+
+    The incoming ``transcript_file_name`` value is ignored; the returned value
+    replaces it.
+    """
+    return uploaded_file.name if uploaded_file else None
+def setup_summarization_llm(choice, llm_choice):
+    """Handle a new selection on the "Summarization LLM" radio.
+
+    Calls ``set_summarization_llm(choice)`` and returns ``choice`` twice: once for
+    the display textbox and once as the new state value. The previous
+    ``llm_choice`` is discarded.
+    """
+    set_summarization_llm(choice)
+    return choice, choice
+def setup_sentiment_analysis_llm(choice, llm_choice):
+    """Apply a new sentiment-analysis LLM selection.
+
+    Calls ``set_sentiment_analysis_llm(choice)`` and returns ``choice`` twice; the
+    previous ``llm_choice`` is discarded.
+    """
+    set_sentiment_analysis_llm(choice)
+    return choice, choice
+def setup_question_answer_llm(choice, llm_choice):
+    """Apply a new question/answer LLM selection.
+
+    Calls ``set_question_answer_llm(choice)`` and returns ``choice`` twice; the
+    previous ``llm_choice`` is discarded.
+    """
+    set_question_answer_llm(choice)
+    return choice, choice
+def setup_summarization_method(choice, summarization_method):
+    """Return the newly selected summarization method twice.
+
+    The previous ``summarization_method`` is discarded.
+    """
+    return choice, choice
+llm_choices = [GPT3, GPT4, LLAMA3, ANTHROPIC]
+summarize_method_choices = [MAPREDUCE, STUFF]
 with gr.Blocks() as demo:
+    # Per-session state. Each task keeps its own LLM choice so that changing one
+    # radio does not change the model used by the other two tasks. Initial values
+    # mirror the defaults shown by the radios below.
+    transcript_file_name = gr.State()
+    summarization_method = gr.State(MAPREDUCE)
+    summarization_llm_choice = gr.State(GPT3)
+    sentiment_llm_choice = gr.State(GPT3)
+    question_answer_llm_choice = gr.State(GPT3)
+    # NOTE(review): gr.Group is given a positional string throughout this layout;
+    # confirm the installed Gradio version accepts that (its parameters may be keyword-only).
     with gr.Group("Upload RTF File"):
         rtf_file = gr.File(label="Podcast Transcript RTF file")
         submit_button = gr.Button("Upload File")
+        submit_button.click(setup_rtf_file_handle, inputs=[rtf_file, transcript_file_name], outputs=transcript_file_name)
+    with gr.Group("LLM Selection"):
+        with gr.Row():
+            choice = gr.Radio(label="Summarization LLM", choices=llm_choices, value=GPT3)
+            output = gr.Textbox(label="", value=GPT3)
+            choice.change(setup_summarization_llm, inputs=[choice, summarization_llm_choice], outputs=[output, summarization_llm_choice])
+        with gr.Row():
+            choice = gr.Radio(label="Sentiment Analysis LLM", choices=llm_choices, value=GPT3)
+            output = gr.Textbox(label="", value=GPT3)
+            # Was wired to setup_summarization_llm (copy-paste slip).
+            choice.change(setup_sentiment_analysis_llm, inputs=[choice, sentiment_llm_choice], outputs=[output, sentiment_llm_choice])
+        with gr.Row():
+            choice = gr.Radio(label="Question/Answer LLM", choices=llm_choices, value=GPT3)
+            output = gr.Textbox(label="", value=GPT3)
+            # Was wired to setup_summarization_llm (copy-paste slip).
+            choice.change(setup_question_answer_llm, inputs=[choice, question_answer_llm_choice], outputs=[output, question_answer_llm_choice])
+    with gr.Group("Summarization Method"):
+        choice = gr.Radio(label="Summarization Method", choices=summarize_method_choices, value=MAPREDUCE)
+        output = gr.Textbox(label="", value=MAPREDUCE)
+        choice.change(setup_summarization_method, inputs=[choice, summarization_method], outputs=[output, summarization_method])
+    with gr.Group("Summarize Podcast"):
         summary = gr.Textbox(label="Summary of Podcast")
+        submit_button = gr.Button("Generate Summary")
+        submit_button.click(summarize, inputs=[rtf_file, transcript_file_name, summarization_method, summarization_llm_choice], outputs=[summary, transcript_file_name, summarization_method, summarization_llm_choice])
+    with gr.Group("Aspects and Sentiment of Podcast"):
         sentiment = gr.Textbox(label="Aspect Based Sentiments")
         submit_button = gr.Button("Generate Aspects and Summary")
+        submit_button.click(generate_aspects_and_sentiments, inputs=[rtf_file, transcript_file_name, sentiment_llm_choice], outputs=[sentiment, transcript_file_name, sentiment_llm_choice])
     with gr.Group("Question/Answer"):
         gr.Markdown("Question/Answer")
         question = gr.Textbox(label="Question")
         answer = gr.Textbox(label="Answer")
         answer_button = gr.Button("Answer Question")
+        answer_button.click(get_answer_for, inputs=[question, transcript_file_name, question_answer_llm_choice], outputs=[answer, transcript_file_name, question_answer_llm_choice])
 demo.launch()

helpers/import_envs.py CHANGED Viewed

@@ -1,9 +1,6 @@
 import os
 from dotenv import load_dotenv
-llm_model="OPENAI"
-# llm_model="LLAMA3"
-# llm_model_NAME="CLAUDE"
-rtf_file = "./data/Tim_O_Reilly_Podcast_text.rtf"
 index_name = "podcast_oracle_index"
 index_file = f"./{index_name}/index.faiss"

 import os
 from dotenv import load_dotenv
 index_name = "podcast_oracle_index"
 index_file = f"./{index_name}/index.faiss"

helpers/model_utils.py CHANGED Viewed

@@ -1,26 +1,41 @@
-from helpers.import_envs import llm_model, openai_api_key, anthropic_api_key
 from langchain_openai import OpenAI
 from langchain_anthropic import ChatAnthropic
 from langchain_community.llms import Ollama
-default_model = OpenAI(temperature=0, api_key=openai_api_key)
-def get_model(model_override = None):
-    if model_override is not None:
-        model_str = model_override
     else:
-        model_str = llm_model
-    if model_str == "OPENAI":
-        llm = default_model
-        print(f"Model Name: {llm.model_name}");
-    elif model_str == "CLAUDE":
-        llm = ChatAnthropic(model_name="claude-2.1", anthropic_api_key=anthropic_api_key)
-        print(f"Model Name: {llm.model}");
-    elif model_str == "LLAMA3":
-        # Now you can use `llm` for generating responses, etc.
-        llm = Ollama(model="llama3")
-        print(f"Model Name: {llm.model}");
-    else:
-        llm = default_model
-        print(f"Model Name: {llm.model_name}");
     return llm

 from langchain_openai import OpenAI
 from langchain_anthropic import ChatAnthropic
 from langchain_community.llms import Ollama
+from helpers.import_envs import openai_api_key, anthropic_api_key
+GPT3 = "gpt-3.5"
+GPT3_INSTRUCT = "gpt-3.5-instruct"
+GPT4 = "gpt-4o"
+LLAMA3 = "Llama3"
+ANTHROPIC = "Claude2"
+def _set_llm_based_on_choice(choice):
+    """Build the LLM client for a model choice.
+
+    ``choice`` is compared against the module constants GPT3_INSTRUCT, GPT3, GPT4,
+    ANTHROPIC and LLAMA3. ``None`` or any unrecognised value falls back to
+    "gpt-3.5-turbo".
+    """
+    if choice == ANTHROPIC:
+        # Model id fixed: it was misspelled "clause-2.1".
+        return ChatAnthropic(model_name="claude-2.1", anthropic_api_key=anthropic_api_key)
+    if choice == LLAMA3:
+        return Ollama(model="llama3")
+    # NOTE(review): GPT3 and GPT4 map to chat model names but are built with the
+    # ``OpenAI`` class here; confirm that class supports them or switch to a chat client.
+    openai_model_names = {
+        GPT3_INSTRUCT: "gpt-3.5-turbo-instruct",
+        GPT3: "gpt-3.5-turbo",
+        GPT4: "gpt-4o",
+    }
+    model_name = openai_model_names.get(choice, "gpt-3.5-turbo")
+    return OpenAI(model=model_name, temperature=0, api_key=openai_api_key)
+def set_summarization_llm(choice = None):
+    """Return the LLM client for summarization, built from ``choice``."""
+    summarization_llm = _set_llm_based_on_choice(choice)
+    return summarization_llm
+def set_sentiment_analysis_llm(choice = None):
+    """Return the LLM client for sentiment analysis, built from ``choice``."""
+    sentiment_analysis_llm = _set_llm_based_on_choice(choice)
+    return sentiment_analysis_llm
+def set_question_answer_llm(choice = None):
+    """Return the LLM client for question answering, built from ``choice``."""
+    question_answer_llm = _set_llm_based_on_choice(choice)
+    return question_answer_llm

helpers/utils.py CHANGED Viewed

@@ -4,7 +4,7 @@ from langchain_openai import OpenAIEmbeddings
 from langchain.storage import LocalFileStore
 from langchain.embeddings import CacheBackedEmbeddings
 from langchain_community.vectorstores import FAISS
-from helpers.import_envs import openai_api_key, rtf_file, index_file, index_name
 import pypandoc
 def load_rtf_document(file_path):
@@ -46,8 +46,8 @@ def embed_chunks(chunked_docs):
     vector_store.save_local(folder_path=index_name)
     return vector_store
-def create_or_load_vectore_store():
-    chunked_docs = load_rtf_document_and_chunk(file_path=rtf_file)
     embedding_model = OpenAIEmbeddings(
         model="text-embedding-3-large", api_key=openai_api_key

 from langchain.storage import LocalFileStore
 from langchain.embeddings import CacheBackedEmbeddings
 from langchain_community.vectorstores import FAISS
+from helpers.import_envs import openai_api_key, index_file, index_name
 import pypandoc
 def load_rtf_document(file_path):
     vector_store.save_local(folder_path=index_name)
     return vector_store
+def create_or_load_vectore_store(transcript_file_name):
+    chunked_docs = load_rtf_document_and_chunk(file_path=transcript_file_name)
     embedding_model = OpenAIEmbeddings(
         model="text-embedding-3-large", api_key=openai_api_key

{helpers → tools}/__init__.py RENAMED Viewed

File without changes

answer_bot.py → tools/answer_bot.py RENAMED Viewed

@@ -4,10 +4,13 @@ from langchain_openai import ChatOpenAI
 from langchain.schema import StrOutputParser
 from langchain_core.runnables.passthrough import RunnablePassthrough
 from langchain.prompts import ChatPromptTemplate
-def answer_question(question):
     # Specify the path to the file you want to check
-    vector_store = create_or_load_vectore_store()
     # create a prompt template to send to our LLM that will incorporate the documents from our retriever with the
     # question we ask the chat model
@@ -20,7 +23,7 @@ def answer_question(question):
     # create a chat model / LLM
     chat_model = ChatOpenAI(
-        model="gpt-3.5-turbo", temperature=0, api_key=openai_api_key
     )
     # create a parser to parse the output of our LLM
@@ -38,6 +41,3 @@ def answer_question(question):
     answer = runnable_chain.invoke(question)
     print(answer)
     return answer
-# question = "What is the opinion of the speaker on open source?"
-# answer_question(question)

 from langchain.schema import StrOutputParser
 from langchain_core.runnables.passthrough import RunnablePassthrough
 from langchain.prompts import ChatPromptTemplate
+from helpers.model_utils import set_question_answer_llm
+def answer_question(question, transcript_file_name, llm_choice=None):
+    question_answer_llm = set_question_answer_llm(llm_choice)
     # Specify the path to the file you want to check
+    vector_store = create_or_load_vectore_store(transcript_file_name=transcript_file_name)
     # create a prompt template to send to our LLM that will incorporate the documents from our retriever with the
     # question we ask the chat model
     # create a chat model / LLM
     chat_model = ChatOpenAI(
+        model=question_answer_llm.model_name, temperature=0, api_key=openai_api_key
     )
     # create a parser to parse the output of our LLM
     answer = runnable_chain.invoke(question)
     print(answer)
     return answer

aspect_and_sentiment_extraction.py → tools/aspect_and_sentiment_extraction.py RENAMED Viewed

@@ -1,9 +1,9 @@
 from helpers.import_envs import openai_api_key
-from helpers.import_envs import rtf_file
 from langchain.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
 from langchain_core.runnables.passthrough import RunnablePassthrough
 from langchain.schema import StrOutputParser
 import re
 # Define the function to clean and extract text from RTF content
@@ -14,7 +14,8 @@ def extract_text_from_rtf(rtf_str):
     plain_text = plain_text.replace('\n', ' ').replace('\r', '')
     return plain_text
-def extract_aspects_and_sentiment(transcript_file_name):
     # Read the RTF file content
     with open(transcript_file_name, 'r') as file:
         rtf_content = file.read()
@@ -33,7 +34,7 @@ def extract_aspects_and_sentiment(transcript_file_name):
     # create a chat model / LLM
     chat_model = ChatOpenAI(
-        model="gpt-3.5-turbo", temperature=0, api_key=openai_api_key
     )
     # create a parser to parse the output of our LLM
@@ -50,5 +51,3 @@ def extract_aspects_and_sentiment(transcript_file_name):
     answer = runnable_chain.invoke(document_text)
     print(answer)
     return answer
-# extract_aspects_and_sentiment(rtf_file)

 from helpers.import_envs import openai_api_key
 from langchain.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
 from langchain_core.runnables.passthrough import RunnablePassthrough
 from langchain.schema import StrOutputParser
+from helpers.model_utils import set_sentiment_analysis_llm, GPT3
 import re
 # Define the function to clean and extract text from RTF content
     plain_text = plain_text.replace('\n', ' ').replace('\r', '')
     return plain_text
+def extract_aspects_and_sentiment(transcript_file_name, llm_choice = None):
+    sentiment_analysis_llm = set_sentiment_analysis_llm(llm_choice)
     # Read the RTF file content
     with open(transcript_file_name, 'r') as file:
         rtf_content = file.read()
     # create a chat model / LLM
     chat_model = ChatOpenAI(
+        model=sentiment_analysis_llm.model_name, temperature=0, api_key=openai_api_key
     )
     # create a parser to parse the output of our LLM
     answer = runnable_chain.invoke(document_text)
     print(answer)
     return answer

{helpers → tools}/summarize.py RENAMED Viewed

@@ -1,5 +1,11 @@
 from langchain.chains.summarize import load_summarize_chain
 from helpers.prompts import BULLET_POINT_PROMPT
 """
 This method involves an initial prompt on each chunk of data * ( for summarization tasks, this could be a summary
@@ -17,8 +23,17 @@ def run_chain(chain, docs):
     summary = output['output_text']
     print(summary)
     return summary
-def summarize_with_map_reduce(docs, llm):
     chain = load_summarize_chain(llm=llm, chain_type="map_reduce", verbose=False)
     # prompt used by the chain for summarizing each part
@@ -29,24 +44,26 @@ def summarize_with_map_reduce(docs, llm):
     # print("prompt used by the chain for combining the parts:")
     # print(chain.combine_document_chain.llm_chain.prompt.template)
-    return run_chain(chain=chain, docs=docs)
-def summarize_with_map_reduce_and_bullet_point_prompt(docs, llm):
     chain = load_summarize_chain(
-        llm,
         chain_type="map_reduce",
         map_prompt=BULLET_POINT_PROMPT,
         combine_prompt=BULLET_POINT_PROMPT,
     )
-    return run_chain(chain=chain, docs=docs)
 """
 Stuffing is the simplest method, whereby you simply stuff all the related data into the prompt as context to pass to
 the language model. This is implemented in LangChain as the StuffDocumentsChain.
 Pros: Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.
-Cons: Most LLMs have a context length, and for large documents (or many documents) this will not work as it will
         result in a prompt larger than the context length.
 The main downside of this method is that it only works on smaller pieces of data. Once you are working with many
@@ -54,9 +71,22 @@ pieces of data, this approach is no longer feasible. The next two approaches are
 """
-def summarize_with_stuff_chain(docs, llm):
-    chain = load_summarize_chain(llm, chain_type="stuff")
     return run_chain(chain=chain, docs=docs)
-    # chain = load_summarize_chain(llm, chain_type="stuff", prompt=BULLET_POINT_PROMPT)
     # run_chain(chain=chain, docs=docs)

 from langchain.chains.summarize import load_summarize_chain
 from helpers.prompts import BULLET_POINT_PROMPT
+from helpers.utils import load_rtf_document_and_chunk, load_rtf_document
+from helpers.model_utils import set_summarization_llm, GPT3_INSTRUCT
+MAPREDUCE="map-reduce"
+STUFF="stuff"
+summarization_method = MAPREDUCE
 """
 This method involves an initial prompt on each chunk of data * ( for summarization tasks, this could be a summary
     summary = output['output_text']
     print(summary)
     return summary
+    # prompt used by the chain for summarizing each part
+    # print("prompt used by the chain for summarizing each part:")
+    # print(chain.llm_chain.prompt.template)
+    # prompt used by the chain for combining the parts
+    # print("prompt used by the chain for combining the parts:")
+    # print(chain.combine_document_chain.llm_chain.prompt.template)
+def _summarize_with_map_reduce(transcript_file_name, llm):
+    chunked_docs = load_rtf_document_and_chunk(transcript_file_name)
     chain = load_summarize_chain(llm=llm, chain_type="map_reduce", verbose=False)
     # prompt used by the chain for summarizing each part
     # print("prompt used by the chain for combining the parts:")
     # print(chain.combine_document_chain.llm_chain.prompt.template)
+    return run_chain(chain=chain, docs=chunked_docs)
+def _summarize_with_map_reduce_and_bullet_point_prompt(transcript_file_name, llm):
+    """Summarize the transcript with a map-reduce chain that uses BULLET_POINT_PROMPT.
+
+    The same prompt is passed as both the map prompt and the combine prompt.
+    """
+    transcript_chunks = load_rtf_document_and_chunk(transcript_file_name)
+    chain_options = {
+        "chain_type": "map_reduce",
+        "map_prompt": BULLET_POINT_PROMPT,
+        "combine_prompt": BULLET_POINT_PROMPT,
+    }
+    bullet_point_chain = load_summarize_chain(llm=llm, **chain_options)
+    return run_chain(chain=bullet_point_chain, docs=transcript_chunks)
 """
 Stuffing is the simplest method, whereby you simply stuff all the related data into the prompt as context to pass to
 the language model. This is implemented in LangChain as the StuffDocumentsChain.
 Pros: Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.
+Cons: Most LLMs have a context length, and for large documents (or many documents) this will not work as it will
         result in a prompt larger than the context length.
 The main downside of this method is that it only works on smaller pieces of data. Once you are working with many
 """
+def _summarize_with_stuff_chain(transcript_file_name, llm):
+    """Summarize the transcript with a "stuff" chain.
+
+    The document comes from ``load_rtf_document`` rather than the chunking loader.
+    """
+    transcript_docs = load_rtf_document(transcript_file_name)
+    stuff_chain = load_summarize_chain(llm=llm, chain_type="stuff")
+    return run_chain(chain=stuff_chain, docs=transcript_docs)
+    # chain = load_summarize_chain(llm=llm, chain_type="stuff", prompt=BULLET_POINT_PROMPT)
     # run_chain(chain=chain, docs=docs)
+def summarize_podcast(transcript_file_name, summarization_method = None, llm_choice = None):
+    """Summarize the transcript with the requested chain type.
+
+    The caller's ``llm_choice`` is ignored: the model is always overridden with
+    GPT3_INSTRUCT. STUFF selects the stuff chain; any other method, including
+    None, uses map-reduce.
+    """
+    # override model
+    llm = set_summarization_llm(GPT3_INSTRUCT)
+    if summarization_method == STUFF:
+        summarizer = _summarize_with_stuff_chain
+    else:
+        summarizer = _summarize_with_map_reduce
+    return summarizer(transcript_file_name=transcript_file_name, llm=llm)