# RAG_MLM/ragMLM.py
import os
from operator import itemgetter

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI

from . import embedder as ed
from . import utility as ut
# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Fail fast if the key is missing, without printing the secret itself
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY is not set; add it to your .env file.")

chatgpt = ChatOpenAI(model="gpt-4o", temperature=0)
def multimodal_prompt_function(data_dict):
    """
    Create a multimodal prompt with both text and image context.

    This function formats the provided context from `data_dict`, which contains
    text, tables, and base64-encoded images. It joins the text (and table)
    portions and prepares the image(s) as base64-encoded `image_url` entries
    for inclusion in a message.

    The formatted text and images (context), along with the user question, are
    used to construct a prompt for GPT-4o.
    """
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []

    # Adding image(s) to the messages if present
    if data_dict["context"]["images"]:
        for image in data_dict["context"]["images"]:
            image_message = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
            messages.append(image_message)

    # Adding the text for analysis
    text_message = {
        "type": "text",
        "text": (
            f"""You are an analyst tasked with understanding detailed information
and trends from text documents, data tables, and charts and graphs in images.
You will be given context information below which will be a mix of text,
tables, and images usually of charts or graphs.
Use this information to provide answers related to the user question.
Do not make up answers, use the provided context documents below and
answer the question to the best of your ability.

User question:
{data_dict['question']}

Context documents:
{formatted_texts}

Answer:
"""
        ),
    }
    messages.append(text_message)
    return [HumanMessage(content=messages)]
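
# A minimal sketch of the input shape this function expects and what it
# returns; the sample values are hypothetical (in the real chain, the dict
# is assembled by the retrieval pipeline below):
#
#   multimodal_prompt_function({
#       "context": {"texts": ["Revenue grew 12% YoY."], "images": ["<base64>"]},
#       "question": "How did revenue change?",
#   })
#   # -> [HumanMessage(content=[{"type": "image_url", ...}, {"type": "text", ...}])]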
# Create RAG chain
multimodal_rag = (
    {
        "context": itemgetter("context"),
        "question": itemgetter("input"),
    }
    | RunnableLambda(multimodal_prompt_function)
    | chatgpt
    | StrOutputParser()
)
# Pass input query to retriever and get context document elements
retrieve_docs = (
    itemgetter("input")
    | ed.retriever_multi_vector
    | RunnableLambda(ut.split_image_text_types)
)
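
# Note: `ut.split_image_text_types` is expected to separate the retrieved
# elements into the dict shape the prompt function consumes, i.e.
# {"texts": [...], "images": [<base64>, ...]} (inferred from how
# `multimodal_prompt_function` above reads its "context" key).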
# Below, we chain `.assign` calls. This takes a dict and successively adds
# keys ("context", then "answer"), where the value for each key is computed
# by a Runnable (a function or chain executed at runtime). This keeps the
# retrieved context alongside the answer generated by GPT-4o.
multimodal_rag_w_sources = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=multimodal_rag
)
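
# The resulting chain returns a dict carrying the original input plus the two
# assigned keys, roughly (values hypothetical):
#   {"input": <question string>,
#    "context": {"texts": [...], "images": [...]},
#    "answer": <GPT-4o answer string>}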
# ------ direct testing -------
# response = multimodal_rag_w_sources.invoke({'input': query})
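
# A minimal usage sketch, assuming the multi-vector retriever in `embedder`
# has already been built and indexed. Because this module uses relative
# imports, run it as a module (e.g. `python -m RAG_MLM.ragMLM`). The question
# below is a hypothetical example.
if __name__ == "__main__":
    query = "What trends do the charts in the report show?"  # hypothetical
    response = multimodal_rag_w_sources.invoke({"input": query})
    print(response["answer"])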