Spaces:
Build error
Build error
Vijayanand Sankarasubramanian committed on
Commit ·
240ad82
1
Parent(s): 0dbead4
updated UI to get constants
Browse files- .gitignore +1 -2
- app.py +73 -32
- helpers/import_envs.py +1 -4
- helpers/model_utils.py +34 -19
- helpers/utils.py +3 -3
- {helpers → tools}/__init__.py +0 -0
- answer_bot.py → tools/answer_bot.py +6 -6
- aspect_and_sentiment_extraction.py → tools/aspect_and_sentiment_extraction.py +4 -5
- {helpers → tools}/summarize.py +39 -9
.gitignore
CHANGED
|
@@ -178,7 +178,6 @@ cython_debug/
|
|
| 178 |
#downloaded deb files
|
| 179 |
*.deb
|
| 180 |
|
| 181 |
-
#cached embeddings
|
| 182 |
cache
|
| 183 |
-
|
| 184 |
flagged
|
|
|
|
|
|
| 178 |
#downloaded deb files
|
| 179 |
*.deb
|
| 180 |
|
|
|
|
| 181 |
cache
|
|
|
|
| 182 |
flagged
|
| 183 |
+
*.rtf
|
app.py
CHANGED
|
@@ -1,67 +1,108 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from helpers.
|
| 3 |
-
from
|
| 4 |
-
from
|
| 5 |
-
from aspect_and_sentiment_extraction import extract_aspects_and_sentiment
|
| 6 |
-
from answer_bot import answer_question
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def summarize(transcript_file_name):
|
| 11 |
-
chunked_docs = load_rtf_document_and_chunk(transcript_file_name)
|
| 12 |
-
|
| 13 |
-
llm = get_model("OPENAI")
|
| 14 |
-
return summarize_with_map_reduce(chunked_docs, llm)
|
| 15 |
-
|
| 16 |
-
def extract_aspects(transcript_file_name):
|
| 17 |
-
# Implement your aspect extraction and sentiment analysis logic here
|
| 18 |
-
return extract_aspects_and_sentiment(transcript_file_name)
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def get_answer_for(user_question):
|
| 22 |
if transcript_file_name is None:
|
| 23 |
return "No Transcript Uploaded, Upload RTF File First", ""
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# Answer the user's question using the question-answering model
|
| 26 |
if user_question.strip(): # Ensure there is a question provided
|
| 27 |
-
answer_text = answer_question(question=user_question)
|
| 28 |
else:
|
| 29 |
answer_text = "No question asked."
|
| 30 |
|
| 31 |
-
return answer_text.lstrip()
|
| 32 |
|
| 33 |
-
def
|
| 34 |
if transcript_file_name is None:
|
| 35 |
return "No Transcript Uploaded, Upload RTF File First", ""
|
| 36 |
|
| 37 |
# Summarize the content
|
| 38 |
-
summary =
|
|
|
|
|
|
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# Aspect-Based Sentiment Analysis
|
| 41 |
-
sentiment =
|
| 42 |
|
| 43 |
-
return
|
| 44 |
|
| 45 |
-
def setup_rtf_file_handle(uploaded_file):
|
|
|
|
|
|
|
| 46 |
transcript_file_name = uploaded_file.name
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
with gr.Blocks() as demo:
|
|
|
|
|
|
|
|
|
|
| 50 |
with gr.Group("Upload RTF File"):
|
| 51 |
rtf_file = gr.File(label="Podcast Transcript RTF file")
|
| 52 |
submit_button = gr.Button("Upload File")
|
| 53 |
-
submit_button.click(setup_rtf_file_handle)
|
| 54 |
-
with gr.Group("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
summary = gr.Textbox(label="Summary of Podcast")
|
|
|
|
|
|
|
|
|
|
| 56 |
sentiment = gr.Textbox(label="Aspect Based Sentiments")
|
| 57 |
submit_button = gr.Button("Generate Aspects and Summary")
|
| 58 |
-
submit_button.click(
|
| 59 |
-
|
| 60 |
with gr.Group("Question/Answer"):
|
| 61 |
gr.Markdown("Question/Answer")
|
| 62 |
question = gr.Textbox(label="Question")
|
| 63 |
answer = gr.Textbox(label="Answer")
|
| 64 |
answer_button = gr.Button("Answer Question")
|
| 65 |
-
answer_button.click(get_answer_for, inputs=
|
| 66 |
|
| 67 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from helpers.model_utils import GPT3, GPT4, LLAMA3, ANTHROPIC, set_question_answer_llm, set_sentiment_analysis_llm, set_summarization_llm
|
| 3 |
+
from tools.summarize import MAPREDUCE, STUFF, summarize_podcast
|
| 4 |
+
from tools.answer_bot import answer_question
|
| 5 |
+
from tools.aspect_and_sentiment_extraction import extract_aspects_and_sentiment
|
|
|
|
| 6 |
|
| 7 |
+
def get_answer_for(user_question, transcript_file_name, llm_choice):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
if transcript_file_name is None:
|
| 9 |
return "No Transcript Uploaded, Upload RTF File First", ""
|
| 10 |
+
if user_question is None:
|
| 11 |
+
return "Question Not Given"
|
| 12 |
|
| 13 |
# Answer the user's question using the question-answering model
|
| 14 |
if user_question.strip(): # Ensure there is a question provided
|
| 15 |
+
answer_text = answer_question(question=user_question, transcript_file_name=transcript_file_name, llm_choice=llm_choice)
|
| 16 |
else:
|
| 17 |
answer_text = "No question asked."
|
| 18 |
|
| 19 |
+
return answer_text.lstrip(), transcript_file_name, llm_choice
|
| 20 |
|
| 21 |
+
def summarize(uploaded_file, transcript_file_name, summarization_method, llm_choice):
|
| 22 |
if transcript_file_name is None:
|
| 23 |
return "No Transcript Uploaded, Upload RTF File First", ""
|
| 24 |
|
| 25 |
# Summarize the content
|
| 26 |
+
summary = summarize_podcast(transcript_file_name=transcript_file_name, summarization_method=summarization_method, llm_choice=llm_choice).lstrip()
|
| 27 |
+
|
| 28 |
+
return summary, transcript_file_name, summarization_method, llm_choice
|
| 29 |
|
| 30 |
+
def generate_aspects_and_sentiments(uploaded_file, transcript_file_name, llm_choice):
|
| 31 |
+
if transcript_file_name is None:
|
| 32 |
+
return "No Transcript Uploaded, Upload RTF File First", ""
|
| 33 |
+
|
| 34 |
# Aspect-Based Sentiment Analysis
|
| 35 |
+
sentiment = extract_aspects_and_sentiment(transcript_file_name=transcript_file_name, llm_choice=llm_choice).lstrip()
|
| 36 |
|
| 37 |
+
return sentiment, transcript_file_name, llm_choice
|
| 38 |
|
| 39 |
+
def setup_rtf_file_handle(uploaded_file, transcript_file_name):
|
| 40 |
+
if not uploaded_file:
|
| 41 |
+
return None
|
| 42 |
transcript_file_name = uploaded_file.name
|
| 43 |
+
return transcript_file_name
|
| 44 |
+
|
| 45 |
+
def setup_summarization_llm(choice, llm_choice):
|
| 46 |
+
set_summarization_llm(choice)
|
| 47 |
+
llm_choice = choice
|
| 48 |
+
return choice, llm_choice
|
| 49 |
+
|
| 50 |
+
def setup_sentiment_analysis_llm(choice, llm_choice):
|
| 51 |
+
set_sentiment_analysis_llm(choice)
|
| 52 |
+
llm_choice = choice
|
| 53 |
+
return choice, llm_choice
|
| 54 |
+
|
| 55 |
+
def setup_question_answer_llm(choice, llm_choice):
|
| 56 |
+
set_question_answer_llm(choice)
|
| 57 |
+
llm_choice = choice
|
| 58 |
+
return choice, llm_choice
|
| 59 |
+
|
| 60 |
+
def setup_summarization_method(choice, summarization_method):
|
| 61 |
+
summarization_method = choice
|
| 62 |
+
return choice, summarization_method
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
llm_choices = [GPT3, GPT4, LLAMA3, ANTHROPIC]
|
| 66 |
+
summarize_method_choices = [MAPREDUCE, STUFF]
|
| 67 |
|
| 68 |
with gr.Blocks() as demo:
|
| 69 |
+
transcript_file_name = gr.State()
|
| 70 |
+
summarization_method = gr.State()
|
| 71 |
+
llm_choice = gr.State()
|
| 72 |
with gr.Group("Upload RTF File"):
|
| 73 |
rtf_file = gr.File(label="Podcast Transcript RTF file")
|
| 74 |
submit_button = gr.Button("Upload File")
|
| 75 |
+
submit_button.click(setup_rtf_file_handle, inputs=[rtf_file, transcript_file_name], outputs=transcript_file_name)
|
| 76 |
+
with gr.Group("LLM Selection"):
|
| 77 |
+
with gr.Row():
|
| 78 |
+
choice = gr.Radio(label="Summarization LLM", choices=llm_choices, value=GPT3)
|
| 79 |
+
output = gr.Textbox(label="", value=GPT3)
|
| 80 |
+
choice.change(setup_summarization_llm, inputs=[choice,llm_choice], outputs=[output,llm_choice])
|
| 81 |
+
with gr.Row():
|
| 82 |
+
choice = gr.Radio(label="Sentiment Analysis LLM", choices=llm_choices, value=GPT3)
|
| 83 |
+
output = gr.Textbox(label="", value=GPT3)
|
| 84 |
+
choice.change(setup_summarization_llm, inputs=[choice,llm_choice], outputs=[output,llm_choice])
|
| 85 |
+
with gr.Row():
|
| 86 |
+
choice = gr.Radio(label="Question/Answer LLM", choices=llm_choices, value=GPT3)
|
| 87 |
+
output = gr.Textbox(label="", value=GPT3)
|
| 88 |
+
choice.change(setup_summarization_llm, inputs=[choice,llm_choice], outputs=[output,llm_choice])
|
| 89 |
+
with gr.Group("Summarization Method"):
|
| 90 |
+
choice = gr.Radio(label="Summarization Method", choices=summarize_method_choices, value=MAPREDUCE)
|
| 91 |
+
output = gr.Textbox(label="", value=MAPREDUCE)
|
| 92 |
+
choice.change(setup_summarization_method, inputs=[choice, summarization_method], outputs=[output, summarization_method])
|
| 93 |
+
with gr.Group("Summarize Podcast"):
|
| 94 |
summary = gr.Textbox(label="Summary of Podcast")
|
| 95 |
+
submit_button = gr.Button("Generate Summary")
|
| 96 |
+
submit_button.click(summarize, inputs=[rtf_file, transcript_file_name, summarization_method, llm_choice], outputs=[summary, transcript_file_name, summarization_method, llm_choice])
|
| 97 |
+
with gr.Group("Aspects and Sentiment of Podcast"):
|
| 98 |
sentiment = gr.Textbox(label="Aspect Based Sentiments")
|
| 99 |
submit_button = gr.Button("Generate Aspects and Summary")
|
| 100 |
+
submit_button.click(generate_aspects_and_sentiments, inputs=[rtf_file, transcript_file_name, llm_choice], outputs=[sentiment, transcript_file_name, llm_choice])
|
|
|
|
| 101 |
with gr.Group("Question/Answer"):
|
| 102 |
gr.Markdown("Question/Answer")
|
| 103 |
question = gr.Textbox(label="Question")
|
| 104 |
answer = gr.Textbox(label="Answer")
|
| 105 |
answer_button = gr.Button("Answer Question")
|
| 106 |
+
answer_button.click(get_answer_for, inputs=[question, transcript_file_name, llm_choice], outputs=[answer, transcript_file_name, llm_choice])
|
| 107 |
|
| 108 |
demo.launch()
|
helpers/import_envs.py
CHANGED
|
@@ -1,9 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
-
|
| 4 |
-
# llm_model="LLAMA3"
|
| 5 |
-
# llm_model_NAME="CLAUDE"
|
| 6 |
-
rtf_file = "./data/Tim_O_Reilly_Podcast_text.rtf"
|
| 7 |
index_name = "podcast_oracle_index"
|
| 8 |
index_file = f"./{index_name}/index.faiss"
|
| 9 |
|
|
|
|
| 1 |
import os
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
+
|
|
|
|
|
|
|
|
|
|
| 4 |
index_name = "podcast_oracle_index"
|
| 5 |
index_file = f"./{index_name}/index.faiss"
|
| 6 |
|
helpers/model_utils.py
CHANGED
|
@@ -1,26 +1,41 @@
|
|
| 1 |
-
from helpers.import_envs import llm_model, openai_api_key, anthropic_api_key
|
| 2 |
from langchain_openai import OpenAI
|
| 3 |
from langchain_anthropic import ChatAnthropic
|
| 4 |
from langchain_community.llms import Ollama
|
|
|
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
def
|
| 9 |
-
if
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
else:
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
llm = default_model
|
| 15 |
-
print(f"Model Name: {llm.model_name}");
|
| 16 |
-
elif model_str == "CLAUDE":
|
| 17 |
-
llm = ChatAnthropic(model_name="claude-2.1", anthropic_api_key=anthropic_api_key)
|
| 18 |
-
print(f"Model Name: {llm.model}");
|
| 19 |
-
elif model_str == "LLAMA3":
|
| 20 |
-
# Now you can use `llm` for generating responses, etc.
|
| 21 |
-
llm = Ollama(model="llama3")
|
| 22 |
-
print(f"Model Name: {llm.model}");
|
| 23 |
-
else:
|
| 24 |
-
llm = default_model
|
| 25 |
-
print(f"Model Name: {llm.model_name}");
|
| 26 |
return llm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from langchain_openai import OpenAI
|
| 2 |
from langchain_anthropic import ChatAnthropic
|
| 3 |
from langchain_community.llms import Ollama
|
| 4 |
+
from helpers.import_envs import openai_api_key, anthropic_api_key
|
| 5 |
|
| 6 |
+
GPT3 = "gpt-3.5"
|
| 7 |
+
GPT3_INSTRUCT = "gpt-3.5-instruct"
|
| 8 |
+
GPT4 = "gpt-4o"
|
| 9 |
+
LLAMA3 = "Llama3"
|
| 10 |
+
ANTHROPIC = "Claude2"
|
| 11 |
|
| 12 |
+
def _set_llm_based_on_choice(choice):
|
| 13 |
+
if choice == GPT3_INSTRUCT:
|
| 14 |
+
model_name = "gpt-3.5-turbo-instruct"
|
| 15 |
+
llm = OpenAI(model=model_name, temperature=0, api_key=openai_api_key)
|
| 16 |
+
elif choice == GPT3:
|
| 17 |
+
model_name = "gpt-3.5-turbo"
|
| 18 |
+
llm = OpenAI(model=model_name, temperature=0, api_key=openai_api_key)
|
| 19 |
+
elif choice == GPT4:
|
| 20 |
+
model_name = "gpt-4o"
|
| 21 |
+
llm = OpenAI(model=model_name, temperature=0, api_key=openai_api_key)
|
| 22 |
+
elif choice == ANTHROPIC:
|
| 23 |
+
model_name = "clause-2.1"
|
| 24 |
+
llm = ChatAnthropic(model_name=model_name, anthropic_api_key=anthropic_api_key)
|
| 25 |
+
elif choice == LLAMA3:
|
| 26 |
+
model_name = "llama3"
|
| 27 |
+
llm = Ollama(model=model_name)
|
| 28 |
else:
|
| 29 |
+
model_name = "gpt-3.5-turbo"
|
| 30 |
+
llm = OpenAI(model=model_name, temperature=0, api_key=openai_api_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
return llm
|
| 32 |
+
|
| 33 |
+
def set_summarization_llm(choice = None):
|
| 34 |
+
return _set_llm_based_on_choice(choice)
|
| 35 |
+
|
| 36 |
+
def set_sentiment_analysis_llm(choice = None):
|
| 37 |
+
return _set_llm_based_on_choice(choice)
|
| 38 |
+
|
| 39 |
+
def set_question_answer_llm(choice = None):
|
| 40 |
+
return _set_llm_based_on_choice(choice)
|
| 41 |
+
|
helpers/utils.py
CHANGED
|
@@ -4,7 +4,7 @@ from langchain_openai import OpenAIEmbeddings
|
|
| 4 |
from langchain.storage import LocalFileStore
|
| 5 |
from langchain.embeddings import CacheBackedEmbeddings
|
| 6 |
from langchain_community.vectorstores import FAISS
|
| 7 |
-
from helpers.import_envs import openai_api_key,
|
| 8 |
import pypandoc
|
| 9 |
|
| 10 |
def load_rtf_document(file_path):
|
|
@@ -46,8 +46,8 @@ def embed_chunks(chunked_docs):
|
|
| 46 |
vector_store.save_local(folder_path=index_name)
|
| 47 |
return vector_store
|
| 48 |
|
| 49 |
-
def create_or_load_vectore_store():
|
| 50 |
-
chunked_docs = load_rtf_document_and_chunk(file_path=
|
| 51 |
|
| 52 |
embedding_model = OpenAIEmbeddings(
|
| 53 |
model="text-embedding-3-large", api_key=openai_api_key
|
|
|
|
| 4 |
from langchain.storage import LocalFileStore
|
| 5 |
from langchain.embeddings import CacheBackedEmbeddings
|
| 6 |
from langchain_community.vectorstores import FAISS
|
| 7 |
+
from helpers.import_envs import openai_api_key, index_file, index_name
|
| 8 |
import pypandoc
|
| 9 |
|
| 10 |
def load_rtf_document(file_path):
|
|
|
|
| 46 |
vector_store.save_local(folder_path=index_name)
|
| 47 |
return vector_store
|
| 48 |
|
| 49 |
+
def create_or_load_vectore_store(transcript_file_name):
|
| 50 |
+
chunked_docs = load_rtf_document_and_chunk(file_path=transcript_file_name)
|
| 51 |
|
| 52 |
embedding_model = OpenAIEmbeddings(
|
| 53 |
model="text-embedding-3-large", api_key=openai_api_key
|
{helpers → tools}/__init__.py
RENAMED
|
File without changes
|
answer_bot.py → tools/answer_bot.py
RENAMED
|
@@ -4,10 +4,13 @@ from langchain_openai import ChatOpenAI
|
|
| 4 |
from langchain.schema import StrOutputParser
|
| 5 |
from langchain_core.runnables.passthrough import RunnablePassthrough
|
| 6 |
from langchain.prompts import ChatPromptTemplate
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
def answer_question(question):
|
| 9 |
# Specify the path to the file you want to check
|
| 10 |
-
vector_store = create_or_load_vectore_store()
|
| 11 |
|
| 12 |
# create a prompt template to send to our LLM that will incorporate the documents from our retriever with the
|
| 13 |
# question we ask the chat model
|
|
@@ -20,7 +23,7 @@ def answer_question(question):
|
|
| 20 |
|
| 21 |
# create a chat model / LLM
|
| 22 |
chat_model = ChatOpenAI(
|
| 23 |
-
model=
|
| 24 |
)
|
| 25 |
|
| 26 |
# create a parser to parse the output of our LLM
|
|
@@ -38,6 +41,3 @@ def answer_question(question):
|
|
| 38 |
answer = runnable_chain.invoke(question)
|
| 39 |
print(answer)
|
| 40 |
return answer
|
| 41 |
-
|
| 42 |
-
# question = "What is the opinion of the speaker on open source?"
|
| 43 |
-
# answer_question(question)
|
|
|
|
| 4 |
from langchain.schema import StrOutputParser
|
| 5 |
from langchain_core.runnables.passthrough import RunnablePassthrough
|
| 6 |
from langchain.prompts import ChatPromptTemplate
|
| 7 |
+
from helpers.model_utils import set_question_answer_llm
|
| 8 |
+
|
| 9 |
+
def answer_question(question, transcript_file_name, llm_choice=None):
|
| 10 |
+
question_answer_llm = set_question_answer_llm(llm_choice)
|
| 11 |
|
|
|
|
| 12 |
# Specify the path to the file you want to check
|
| 13 |
+
vector_store = create_or_load_vectore_store(transcript_file_name=transcript_file_name)
|
| 14 |
|
| 15 |
# create a prompt template to send to our LLM that will incorporate the documents from our retriever with the
|
| 16 |
# question we ask the chat model
|
|
|
|
| 23 |
|
| 24 |
# create a chat model / LLM
|
| 25 |
chat_model = ChatOpenAI(
|
| 26 |
+
model=question_answer_llm.model_name, temperature=0, api_key=openai_api_key
|
| 27 |
)
|
| 28 |
|
| 29 |
# create a parser to parse the output of our LLM
|
|
|
|
| 41 |
answer = runnable_chain.invoke(question)
|
| 42 |
print(answer)
|
| 43 |
return answer
|
|
|
|
|
|
|
|
|
aspect_and_sentiment_extraction.py → tools/aspect_and_sentiment_extraction.py
RENAMED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
from helpers.import_envs import openai_api_key
|
| 2 |
-
from helpers.import_envs import rtf_file
|
| 3 |
from langchain.prompts import PromptTemplate
|
| 4 |
from langchain_openai import ChatOpenAI
|
| 5 |
from langchain_core.runnables.passthrough import RunnablePassthrough
|
| 6 |
from langchain.schema import StrOutputParser
|
|
|
|
| 7 |
import re
|
| 8 |
|
| 9 |
# Define the function to clean and extract text from RTF content
|
|
@@ -14,7 +14,8 @@ def extract_text_from_rtf(rtf_str):
|
|
| 14 |
plain_text = plain_text.replace('\n', ' ').replace('\r', '')
|
| 15 |
return plain_text
|
| 16 |
|
| 17 |
-
def extract_aspects_and_sentiment(transcript_file_name):
|
|
|
|
| 18 |
# Read the RTF file content
|
| 19 |
with open(transcript_file_name, 'r') as file:
|
| 20 |
rtf_content = file.read()
|
|
@@ -33,7 +34,7 @@ def extract_aspects_and_sentiment(transcript_file_name):
|
|
| 33 |
|
| 34 |
# create a chat model / LLM
|
| 35 |
chat_model = ChatOpenAI(
|
| 36 |
-
model=
|
| 37 |
)
|
| 38 |
|
| 39 |
# create a parser to parse the output of our LLM
|
|
@@ -50,5 +51,3 @@ def extract_aspects_and_sentiment(transcript_file_name):
|
|
| 50 |
answer = runnable_chain.invoke(document_text)
|
| 51 |
print(answer)
|
| 52 |
return answer
|
| 53 |
-
|
| 54 |
-
# extract_aspects_and_sentiment(rtf_file)
|
|
|
|
| 1 |
from helpers.import_envs import openai_api_key
|
|
|
|
| 2 |
from langchain.prompts import PromptTemplate
|
| 3 |
from langchain_openai import ChatOpenAI
|
| 4 |
from langchain_core.runnables.passthrough import RunnablePassthrough
|
| 5 |
from langchain.schema import StrOutputParser
|
| 6 |
+
from helpers.model_utils import set_sentiment_analysis_llm, GPT3
|
| 7 |
import re
|
| 8 |
|
| 9 |
# Define the function to clean and extract text from RTF content
|
|
|
|
| 14 |
plain_text = plain_text.replace('\n', ' ').replace('\r', '')
|
| 15 |
return plain_text
|
| 16 |
|
| 17 |
+
def extract_aspects_and_sentiment(transcript_file_name, llm_choice = None):
|
| 18 |
+
sentiment_analysis_llm = set_sentiment_analysis_llm(llm_choice)
|
| 19 |
# Read the RTF file content
|
| 20 |
with open(transcript_file_name, 'r') as file:
|
| 21 |
rtf_content = file.read()
|
|
|
|
| 34 |
|
| 35 |
# create a chat model / LLM
|
| 36 |
chat_model = ChatOpenAI(
|
| 37 |
+
model=sentiment_analysis_llm.model_name, temperature=0, api_key=openai_api_key
|
| 38 |
)
|
| 39 |
|
| 40 |
# create a parser to parse the output of our LLM
|
|
|
|
| 51 |
answer = runnable_chain.invoke(document_text)
|
| 52 |
print(answer)
|
| 53 |
return answer
|
|
|
|
|
|
{helpers → tools}/summarize.py
RENAMED
|
@@ -1,5 +1,11 @@
|
|
| 1 |
from langchain.chains.summarize import load_summarize_chain
|
| 2 |
from helpers.prompts import BULLET_POINT_PROMPT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
"""
|
| 5 |
This method involves running an initial prompt on each chunk of data (for summarization tasks, this could be a summary
|
|
@@ -17,8 +23,17 @@ def run_chain(chain, docs):
|
|
| 17 |
summary = output['output_text']
|
| 18 |
print(summary)
|
| 19 |
return summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
def
|
|
|
|
| 22 |
chain = load_summarize_chain(llm=llm, chain_type="map_reduce", verbose=False)
|
| 23 |
|
| 24 |
# prompt used by the chain for summarizing each part
|
|
@@ -29,24 +44,26 @@ def summarize_with_map_reduce(docs, llm):
|
|
| 29 |
# print("prompt used by the chain for combining the parts:")
|
| 30 |
# print(chain.combine_document_chain.llm_chain.prompt.template)
|
| 31 |
|
| 32 |
-
return run_chain(chain=chain, docs=
|
| 33 |
|
| 34 |
-
def
|
|
|
|
| 35 |
chain = load_summarize_chain(
|
| 36 |
-
llm,
|
| 37 |
chain_type="map_reduce",
|
| 38 |
map_prompt=BULLET_POINT_PROMPT,
|
| 39 |
combine_prompt=BULLET_POINT_PROMPT,
|
| 40 |
)
|
| 41 |
|
| 42 |
-
return run_chain(chain=chain, docs=
|
| 43 |
|
| 44 |
|
| 45 |
"""
|
| 46 |
Stuffing is the simplest method, whereby you simply stuff all the related data into the prompt as context to pass to
|
| 47 |
the language model. This is implemented in LangChain as the StuffDocumentsChain.
|
| 48 |
Pros: Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.
|
| 49 |
-
Cons: Most LLMs have a context length, and for large documents (or many documents) this will not work as it will
|
|
|
|
| 50 |
result in a prompt larger than the context length.
|
| 51 |
|
| 52 |
The main downside of this method is that it only works on smaller pieces of data. Once you are working with many
|
|
@@ -54,9 +71,22 @@ pieces of data, this approach is no longer feasible. The next two approaches are
|
|
| 54 |
"""
|
| 55 |
|
| 56 |
|
| 57 |
-
def
|
| 58 |
-
|
|
|
|
| 59 |
return run_chain(chain=chain, docs=docs)
|
| 60 |
|
| 61 |
-
# chain = load_summarize_chain(llm, chain_type="stuff", prompt=BULLET_POINT_PROMPT)
|
| 62 |
# run_chain(chain=chain, docs=docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from langchain.chains.summarize import load_summarize_chain
|
| 2 |
from helpers.prompts import BULLET_POINT_PROMPT
|
| 3 |
+
from helpers.utils import load_rtf_document_and_chunk, load_rtf_document
|
| 4 |
+
from helpers.model_utils import set_summarization_llm, GPT3_INSTRUCT
|
| 5 |
+
|
| 6 |
+
MAPREDUCE="map-reduce"
|
| 7 |
+
STUFF="stuff"
|
| 8 |
+
summarization_method = MAPREDUCE
|
| 9 |
|
| 10 |
"""
|
| 11 |
This method involves running an initial prompt on each chunk of data (for summarization tasks, this could be a summary
|
|
|
|
| 23 |
summary = output['output_text']
|
| 24 |
print(summary)
|
| 25 |
return summary
|
| 26 |
+
# prompt used by the chain for summarizing each part
|
| 27 |
+
# print("prompt used by the chain for summarizing each part:")
|
| 28 |
+
# print(chain.llm_chain.prompt.template)
|
| 29 |
+
|
| 30 |
+
# prompt used by the chain for combining the parts
|
| 31 |
+
# print("prompt used by the chain for combining the parts:")
|
| 32 |
+
# print(chain.combine_document_chain.llm_chain.prompt.template)
|
| 33 |
+
|
| 34 |
|
| 35 |
+
def _summarize_with_map_reduce(transcript_file_name, llm):
|
| 36 |
+
chunked_docs = load_rtf_document_and_chunk(transcript_file_name)
|
| 37 |
chain = load_summarize_chain(llm=llm, chain_type="map_reduce", verbose=False)
|
| 38 |
|
| 39 |
# prompt used by the chain for summarizing each part
|
|
|
|
| 44 |
# print("prompt used by the chain for combining the parts:")
|
| 45 |
# print(chain.combine_document_chain.llm_chain.prompt.template)
|
| 46 |
|
| 47 |
+
return run_chain(chain=chain, docs=chunked_docs)
|
| 48 |
|
| 49 |
+
def _summarize_with_map_reduce_and_bullet_point_prompt(transcript_file_name, llm):
|
| 50 |
+
chunked_docs = load_rtf_document_and_chunk(transcript_file_name)
|
| 51 |
chain = load_summarize_chain(
|
| 52 |
+
llm=llm,
|
| 53 |
chain_type="map_reduce",
|
| 54 |
map_prompt=BULLET_POINT_PROMPT,
|
| 55 |
combine_prompt=BULLET_POINT_PROMPT,
|
| 56 |
)
|
| 57 |
|
| 58 |
+
return run_chain(chain=chain, docs=chunked_docs)
|
| 59 |
|
| 60 |
|
| 61 |
"""
|
| 62 |
Stuffing is the simplest method, whereby you simply stuff all the related data into the prompt as context to pass to
|
| 63 |
the language model. This is implemented in LangChain as the StuffDocumentsChain.
|
| 64 |
Pros: Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.
|
| 65 |
+
Cons: Most LLMs have a context length, and for large documents (or many documents)
|
| 66 |
+
this will not work as it will
|
| 67 |
result in a prompt larger than the context length.
|
| 68 |
|
| 69 |
The main downside of this method is that it only works on smaller pieces of data. Once you are working with many
|
|
|
|
| 71 |
"""
|
| 72 |
|
| 73 |
|
| 74 |
+
def _summarize_with_stuff_chain(transcript_file_name, llm):
|
| 75 |
+
docs = load_rtf_document(transcript_file_name)
|
| 76 |
+
chain = load_summarize_chain(llm=llm, chain_type="stuff")
|
| 77 |
return run_chain(chain=chain, docs=docs)
|
| 78 |
|
| 79 |
+
# chain = load_summarize_chain(llm=llm, chain_type="stuff", prompt=BULLET_POINT_PROMPT)
|
| 80 |
# run_chain(chain=chain, docs=docs)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def summarize_podcast(transcript_file_name, summarization_method = None, llm_choice = None):
|
| 84 |
+
# override model
|
| 85 |
+
llm_choice = GPT3_INSTRUCT
|
| 86 |
+
llm = set_summarization_llm(llm_choice)
|
| 87 |
+
if summarization_method == MAPREDUCE:
|
| 88 |
+
return _summarize_with_map_reduce(transcript_file_name=transcript_file_name, llm=llm)
|
| 89 |
+
elif summarization_method == STUFF:
|
| 90 |
+
return _summarize_with_stuff_chain(transcript_file_name=transcript_file_name, llm=llm)
|
| 91 |
+
else:
|
| 92 |
+
return _summarize_with_map_reduce(transcript_file_name=transcript_file_name, llm=llm)
|