RaghuCourage9605 committed on
Commit 4df1eba · verified · 1 Parent(s): a993fae

Upload 5 files

Files changed (5)
  1. 1_Document_QA.py +76 -0
  2. 2_Image_QA.py +160 -0
  3. 3_Video_QA.py +200 -0
  4. 4_Audio_QA.py +0 -0
  5. app.py +261 -0
1_Document_QA.py ADDED
@@ -0,0 +1,76 @@
+ import streamlit as st
+ from langchain.schema import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import (
+     PythonLoader,
+     PyMuPDFLoader,
+     WikipediaLoader,
+ )
+ from langchain.vectorstores import FAISS
+ from langchain_community.docstore import InMemoryDocstore
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_cerebras import ChatCerebras
+ from langchain_mistralai import ChatMistralAI
+ from langchain_core.messages import HumanMessage
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema import StrOutputParser
+ from uuid import uuid4
+ import numpy as np
+ import faiss
+ import whisper
+ import torch
+ import os
+ from dotenv import load_dotenv
+ import logging
+ import base64
+ import asyncio
+ from concurrent.futures import ThreadPoolExecutor
+
+ if "models" not in st.session_state:
+     st.session_state.models = {
+         "Gemini": ChatGoogleGenerativeAI(
+             model="gemini-2.0-flash-exp",
+             temperature=0.8,
+             verbose=True,
+             api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY")
+         ),
+         "Mistral": ChatMistralAI(
+             model_name="open-mistral-nemo",
+             temperature=0.8,
+             verbose=True
+         ),
+         "Llama": ChatCerebras(
+             model="llama-3.3-70b",
+             temperature=0.8,
+             verbose=True,
+             api_key=os.getenv("CEREBRAS_API_KEY")
+         )
+     }
+ if "embeddings" not in st.session_state:
+     model_name = "sentence-transformers/all-mpnet-base-v2"
+     model_kwargs = {'device': 'cpu'}
+     encode_kwargs = {'normalize_embeddings': False}
+     st.session_state.embeddings = HuggingFaceEmbeddings(
+         model_name=model_name,
+         model_kwargs=model_kwargs,
+         encode_kwargs=encode_kwargs
+     )
+
+ st.header("1. 🗂️ ✨ Document Question Answering")
+ st.write("""
+ In this section, you can upload a document file and query its contents for answers.
+ The system will process the document and let you ask specific questions about its text.
+
+ Supported document types for chatting with your doc:
+
+ - Text Files (.txt)
+ - PDF Files (.pdf)
+ - Word Documents (.doc, .docx)
+ - arXiv Papers (.pdf)
+ """)
+
+ uploaded_doc = st.file_uploader("Upload a document (.pdf, .docx):", type=["pdf", "docx"])
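Note that the file as committed stops at the uploader: nothing consumes `uploaded_doc`, and `PyMuPDFLoader` is imported but never used. A minimal sketch of the missing processing step, following the FAISS pattern the other pages use (the temp-file handling and the `doc_retriever` name are assumptions, not the author's code):

```python
import os
import tempfile
from uuid import uuid4

import faiss
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.docstore import InMemoryDocstore

if uploaded_doc:
    # PyMuPDFLoader needs a path on disk, so persist the upload first.
    suffix = os.path.splitext(uploaded_doc.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_doc.read())
        tmp_path = tmp.name

    # Load and chunk the document (PDF path shown; a .docx upload would
    # need a different loader, e.g. Docx2txtLoader).
    docs = PyMuPDFLoader(tmp_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    # Index the chunks the same way the image and video pages do.
    index = faiss.IndexFlatL2(len(st.session_state.embeddings.embed_query("hello world")))
    vector_store = FAISS(
        embedding_function=st.session_state.embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    vector_store.add_documents(documents=chunks, ids=[str(uuid4()) for _ in chunks])
    doc_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 6})
```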
2_Image_QA.py ADDED
@@ -0,0 +1,160 @@
+ import streamlit as st
+ from langchain.schema import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import FAISS
+ from langchain_community.docstore import InMemoryDocstore
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_core.messages import HumanMessage
+ from langchain_cerebras import ChatCerebras
+ from langchain_mistralai import ChatMistralAI
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema import StrOutputParser
+ from uuid import uuid4
+ import faiss
+ import os
+ from dotenv import load_dotenv
+ import logging
+ import httpx
+ import base64
+ import asyncio
+
+ # Initialize environment variables and logging
+ load_dotenv()
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Run a blocking chain.invoke call off the event loop's main thread
+ async def async_invoke_chain(chain, input_data):
+     loop = asyncio.get_event_loop()
+     return await loop.run_in_executor(None, chain.invoke, input_data)
+
+ # Initialize session state for messages and models
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ if "models" not in st.session_state:
+     st.session_state.models = {
+         "Gemini": ChatGoogleGenerativeAI(
+             model="gemini-2.0-flash-exp",
+             temperature=0.8,
+             verbose=True,
+             api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY")
+         ),
+         "Mistral": ChatMistralAI(
+             model_name="open-mistral-nemo",
+             temperature=0.8,
+             verbose=True
+         ),
+         "Llama": ChatCerebras(
+             model="llama-3.3-70b",
+             temperature=0.8,
+             verbose=True,
+             api_key=os.getenv("CEREBRAS_API_KEY")
+         )
+     }
+
+ # Initialize embeddings model
+ if "embeddings" not in st.session_state:
+     model_name = "sentence-transformers/all-mpnet-base-v2"
+     model_kwargs = {'device': 'cpu'}
+     encode_kwargs = {'normalize_embeddings': False}
+     st.session_state.embeddings = HuggingFaceEmbeddings(
+         model_name=model_name,
+         model_kwargs=model_kwargs,
+         encode_kwargs=encode_kwargs
+     )
+
+ st.header("📸📈📊 ֎ Image Content Analysis and Question Answering")
+
+ # Brief overview for image content analysis
+ description = """
+ Upload an image, and the AI will analyze its content and answer your questions.
+ It can interpret various types of images, including:
+ - General imagery (objects, people, scenes)
+ - Diagrams, graphs, and data visualizations
+ - Scientific and medical images
+ - Text-based images (documents, screenshots)
+ """
+
+ # Display the brief description
+ st.write(description)
+
+ # File upload and URL input
+ st.header("Upload Image for Question Answering")
+ uploaded_file = st.file_uploader("Upload an image (.jpeg, .jpg, .png, etc.):", type=["jpeg", "jpg", "png"])
+
+ st.header("Or Enter the Image URL:")
+ image_url = st.text_input("Enter the image URL")
+
+ image_data = None
+
+ if uploaded_file:
+     st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
+     image_data = base64.b64encode(uploaded_file.read()).decode("utf-8")
+ elif image_url:
+     try:
+         with httpx.Client() as client:
+             response = client.get(image_url)
+             response.raise_for_status()
+             st.image(response.content, caption="Image from URL", use_column_width=True)
+             image_data = base64.b64encode(response.content).decode("utf-8")
+     except Exception as e:
+         st.error(f"Error fetching image from URL: {e}")
+
+ if image_data:
+     message = HumanMessage(content=[{
+         "type": "text", "text": "Describe what is in the image in detail."
+     }, {
+         "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}
+     }])
+
+     # Generate a detailed description of the image from the vision model
+     response = asyncio.run(async_invoke_chain(st.session_state.models["Gemini"], [message]))
+     knowledge = [Document(page_content=response.content)]
+
+     # Split text into chunks for indexing
+     text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n"], chunk_size=1500, chunk_overlap=200)
+     chunks = text_splitter.split_documents(knowledge)
+
+     # Create a FAISS IndexFlatL2 sized to the embedding dimension
+     index = faiss.IndexFlatL2(len(st.session_state.embeddings.embed_query("hello world")))
+
+     # Create FAISS vector store for document retrieval
+     vector_store = FAISS(
+         embedding_function=st.session_state.embeddings,
+         index=index,
+         docstore=InMemoryDocstore(),
+         index_to_docstore_id={},
+     )
+
+     # Generate unique IDs and add documents to the store
+     ids = [str(uuid4()) for _ in range(len(chunks))]
+     vector_store.add_documents(documents=chunks, ids=ids)
+
+     # Update the mapping between FAISS index and document IDs
+     for idx, doc_id in enumerate(ids):
+         vector_store.index_to_docstore_id[idx] = doc_id
+
+     # Create image retriever with the FAISS index
+     image_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 6})
+
+     def get_retrieved_context(query):
+         retrieved_documents = image_retriever.get_relevant_documents(query)
+         return "\n".join(doc.page_content for doc in retrieved_documents)
+
+     # User query for image QA
+     user_input = st.chat_input("Ask a question about the image:")
+
+     prompt = ChatPromptTemplate.from_messages([(
+         "system", "You are an expert in analyzing images. Use the context: {context} to answer the query."
+     ), ("human", "{question}")])
+
+     if user_input:
+         st.session_state.messages.append({"role": "user", "content": user_input})
+         qa_chain = prompt | st.session_state.models["Mistral"] | StrOutputParser()
+         context = get_retrieved_context(user_input)
+         response_message = asyncio.run(async_invoke_chain(qa_chain, {"question": user_input, "context": context}))
+         st.session_state.messages.append({"role": "assistant", "content": response_message})
+         for message in st.session_state.messages:
+             st.chat_message(message["role"]).markdown(message["content"])
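The describe-then-index wiring above is duplicated verbatim in `3_Video_QA.py` (and in the commented-out draft in `app.py`). A small shared helper could collapse it; this is a sketch, and `build_retriever` is not a name from the commit:

```python
from uuid import uuid4

import faiss
from langchain.vectorstores import FAISS
from langchain_community.docstore import InMemoryDocstore


def build_retriever(embeddings, chunks, k=6):
    # Size the L2 index from a probe embedding, as the pages above do.
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    # FAISS.add_documents maintains index_to_docstore_id itself, so the
    # manual re-mapping loop in the pages above is redundant.
    vector_store.add_documents(documents=chunks, ids=[str(uuid4()) for _ in chunks])
    return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})
```

Usage would then be a one-liner per page, e.g. `image_retriever = build_retriever(st.session_state.embeddings, chunks, k=6)`.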
3_Video_QA.py ADDED
@@ -0,0 +1,200 @@
+ import streamlit as st
+ from langchain.schema import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import FAISS
+ from langchain_community.docstore import InMemoryDocstore
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_cerebras import ChatCerebras
+ from langchain_mistralai import ChatMistralAI
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema import StrOutputParser
+ from uuid import uuid4
+ import faiss
+ import whisper
+ import yt_dlp
+ import torch
+ import os
+ from dotenv import load_dotenv
+ import logging
+ import asyncio
+
+ load_dotenv()
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ async def async_invoke_chain(chain, input_data):
+     loop = asyncio.get_event_loop()
+     return await loop.run_in_executor(None, chain.invoke, input_data)
+
+ if "models" not in st.session_state:
+     st.session_state.models = {
+         "Gemini": ChatGoogleGenerativeAI(
+             model="gemini-2.0-flash-exp",
+             temperature=0.8,
+             verbose=True,
+             api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY"),
+         ),
+         "Mistral": ChatMistralAI(
+             model_name="open-mistral-nemo", temperature=0.8, verbose=True
+         ),
+         "Llama": ChatCerebras(
+             model="llama-3.3-70b",
+             temperature=0.8,
+             verbose=True,
+             api_key=os.getenv("CEREBRAS_API_KEY"),
+         ),
+     }
+
+ # Initialize embeddings
+ if "embeddings" not in st.session_state:
+     model_name = "sentence-transformers/all-mpnet-base-v2"
+     model_kwargs = {"device": "cpu"}
+     encode_kwargs = {"normalize_embeddings": False}
+     st.session_state.embeddings = HuggingFaceEmbeddings(
+         model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+     )
+
+ st.header("3. 🎬📽️🎞️֎ Video Question Answering")
+ st.write("""
+ In this section, you can upload a video file or provide a YouTube URL and ask questions about its content.
+ The system transcribes the video and answers based on the transcription and analysis of the video content.
+ """)
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # Transcribe a video with Whisper, run off the main thread
+ async def process_video(video_path):
+     st.info("Transcribing video...")
+     model = whisper.load_model("small")
+     model = model.to(device="cuda" if torch.cuda.is_available() else "cpu")
+     loop = asyncio.get_event_loop()
+     result = await loop.run_in_executor(None, model.transcribe, video_path)
+     st.success("Transcription complete")
+     return result["text"]
+
+
+ # Download a video from YouTube using yt-dlp
+ def download_video(video_url, save_path="./"):
+     try:
+         ydl_opts = {
+             "outtmpl": f"{save_path}/%(title)s.%(ext)s",  # Save path and file name
+         }
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([video_url])
+         print("Download complete!")
+     except Exception as e:
+         print(f"An error occurred: {e}")
+
+
+ # Streamlit UI
+ st.header("Upload Video 📤")
+ uploaded_file = st.file_uploader(
+     "Upload a video file (.mp4, .webm, .mkv):", type=["mp4", "webm", "mkv"]
+ )
+
+ st.header("Provide YouTube Video URL 🔗")
+ uploaded_video_url = st.text_input("Enter the URL of the video you want to chat with")
+ st.warning(
+     "Please note that processing a YouTube URL may take some time, depending on the video's length and the current load."
+ )
+
+
+ if st.button("Process Video"):
+     if uploaded_video_url:
+         save_path = "./downloads"
+         os.makedirs(save_path, exist_ok=True)
+         download_video(uploaded_video_url, save_path)  # Download the video from YouTube
+         video_file = next(
+             (f for f in os.listdir(save_path) if f.endswith((".mp4", ".webm", ".mkv"))),
+             None,
+         )
+         if video_file:
+             video_path = os.path.join(save_path, video_file)
+             transcription = asyncio.run(
+                 process_video(video_path)
+             )  # Fetch transcription text
+             st.session_state.transcription = transcription
+             st.success("Transcription complete!")
+         else:
+             st.error("No video file found after download.")
+     elif uploaded_file:
+         save_path = "./uploads"
+         os.makedirs(save_path, exist_ok=True)
+         video_path = os.path.join(save_path, uploaded_file.name)
+         with open(video_path, "wb") as f:
+             f.write(uploaded_file.read())
+         st.session_state.transcription = asyncio.run(
+             process_video(video_path)
+         )  # Fetch transcription text
+         st.success("Transcription complete!")
+     else:
+         st.error("Please upload a video file or provide a video URL.")
+
+ # Handle LLM response after transcription
+ if "transcription" in st.session_state:
+     transcription = st.session_state.transcription
+     # Pass the transcription to Gemini; its response becomes the knowledge base
+     response = asyncio.run(async_invoke_chain(st.session_state.models["Gemini"], transcription))
+     knowledge = [Document(page_content=response.content)]
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         separators=["\n\n"], chunk_size=1500, chunk_overlap=200
+     )
+     chunks = text_splitter.split_documents(knowledge)
+     index = faiss.IndexFlatL2(
+         len(st.session_state.embeddings.embed_query("hello world"))
+     )
+
+     # Create FAISS vector store for document retrieval
+     vector_store = FAISS(
+         embedding_function=st.session_state.embeddings,
+         index=index,
+         docstore=InMemoryDocstore(),
+         index_to_docstore_id={},
+     )
+
+     # Generate unique IDs and add documents to the store
+     ids = [str(uuid4()) for _ in range(len(chunks))]
+     vector_store.add_documents(documents=chunks, ids=ids)
+
+     # Update the mapping between FAISS index and document IDs
+     for idx, doc_id in enumerate(ids):
+         vector_store.index_to_docstore_id[idx] = doc_id
+
+     # Create video retriever with the FAISS index
+     video_retriever = vector_store.as_retriever(
+         search_type="similarity", search_kwargs={"k": 4}
+     )
+
+     def get_retrieved_context(query):
+         retrieved_documents = video_retriever.get_relevant_documents(query)
+         return "\n".join(doc.page_content for doc in retrieved_documents)
+
+     # User query for video QA
+     user_input = st.chat_input("Ask a question about the video:")
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             (
+                 "system",
+                 "You are an expert in analyzing videos. Use the context: {context} to answer the query.",
+             ),
+             ("human", "{question}"),
+         ]
+     )
+
+     if user_input:
+         st.session_state.messages.append({"role": "user", "content": user_input})
+         qa_chain = prompt | st.session_state.models["Mistral"] | StrOutputParser()
+         context = get_retrieved_context(user_input)
+         response_message = asyncio.run(
+             async_invoke_chain(qa_chain, {"question": user_input, "context": context})
+         )
+         st.session_state.messages.append(
+             {"role": "assistant", "content": response_message}
+         )
+         for message in st.session_state.messages:
+             st.chat_message(message["role"]).markdown(message["content"])
+ else:
+     st.error("No transcription available. Please upload or process a video first.")
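`download_video` fetches the full video even though Whisper only consumes the audio track (it decodes through ffmpeg either way). A sketch of an audio-only variant using yt-dlp's `format` option; the `download_audio` name and the `bestaudio/best` choice are suggestions, not what the commit does:

```python
import yt_dlp


def download_audio(video_url, save_path="./downloads"):
    # Fetch only the audio stream, which is all Whisper needs; this cuts
    # download size and time for long videos.
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": f"{save_path}/%(title)s.%(ext)s",
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
```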
4_Audio_QA.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,261 @@
+ # import streamlit as st
+ # from langchain.schema import Document
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # from langchain.vectorstores import FAISS
+ # from langchain_community.docstore import InMemoryDocstore
+ # from langchain_huggingface import HuggingFaceEmbeddings
+ # from langchain_cerebras import ChatCerebras
+ # from langchain_mistralai import ChatMistralAI
+ # from langchain_core.messages import HumanMessage
+ # from langchain_google_genai import ChatGoogleGenerativeAI
+ # from langchain.prompts import ChatPromptTemplate
+ # from langchain.schema import StrOutputParser
+ # from uuid import uuid4
+ # import numpy as np
+ # import faiss
+ # import whisper
+ # import torch
+ # import os
+ # from dotenv import load_dotenv
+ # import logging
+ # import base64
+ # import asyncio
+ # from concurrent.futures import ThreadPoolExecutor
+
+ # # Load environment variables
+ # load_dotenv()
+ # logging.basicConfig(level=logging.INFO)
+ # logger = logging.getLogger(__name__)
+
+ # st.set_page_config(
+ #     page_title = "Multi-Modal RAG",
+ #     page_icon = ":red RED",
+ #     layout="wide"
+ # )
+
+ # if "models" not in st.session_state:
+ #     st.session_state.models = {
+ #         "Gemini": ChatGoogleGenerativeAI(
+ #             model="gemini-2.0-flash-exp",
+ #             temperature=0.8,
+ #             verbose=True,
+ #             api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY")
+ #         ),
+ #         "Mistral": ChatMistralAI(
+ #             model_name="open-mistral-nemo",
+ #             temperature=0.8,
+ #             verbose=True
+ #         ),
+ #         "Llama": ChatCerebras(
+ #             model="llama-3.3-70b",
+ #             temperature=0.8,
+ #             verbose=True,
+ #             api_key=os.getenv("CEREBRAS_API_KEY")
+ #         )
+ #     }
+
+ # if "embeddings" not in st.session_state:
+ #     model_name = "sentence-transformers/all-mpnet-base-v2"
+ #     model_kwargs = {'device': 'cpu'}
+ #     encode_kwargs = {'normalize_embeddings': False}
+ #     st.session_state.embeddings = HuggingFaceEmbeddings(
+ #         model_name=model_name,
+ #         model_kwargs=model_kwargs,
+ #         encode_kwargs=encode_kwargs
+ #     )
+
+ # # Initialize Streamlit app
+ # st.title("Multi-Modal Retrieval-Augmented Generation (RAG)")
+ # st.sidebar.title("Options")
+ # st.sidebar.write("""
+ # 1. Video Question Answering RAG (.mp4, .mp3, etc.)
+ # 2. Image Question Answering RAG (.jpeg, .jpg, .png, etc.)
+ # 3. Document Question Answering RAG (.pdf, .docx, etc.)
+ # """)
+
+ # # Initialize session state for conversation history
+ # if "messages" not in st.session_state:
+ #     st.session_state.messages = []
+
+ # # Function to process LLM calls asynchronously
+ # async def async_invoke_chain(chain, input_data):
+ #     loop = asyncio.get_event_loop()
+ #     return await loop.run_in_executor(None, chain.invoke, input_data)
+
+ # async def process_video(video_path):
+ #     st.info("Transcribing video...")
+ #     model = whisper.load_model("small")
+ #     model = model.to(device="cuda" if torch.cuda.is_available() else "cpu")
+ #     loop = asyncio.get_event_loop()
+ #     result = await loop.run_in_executor(None, model.transcribe, video_path)
+ #     return result["text"]
+
+ # # Handle video processing
+ # # Correct the variable name mismatch and properly handle video retrieval.
+ # if modal_choice == "Video":
+ #     st.header("Step 1: Upload or Provide Video")
+ #     uploaded_file = st.file_uploader("Upload a video file (.mp4, .webm, etc.):", type=["mp4", "webm", "mkv"])
+
+ #     if st.button("Process Video"):
+ #         if uploaded_file:
+ #             save_path = "./uploads"
+ #             os.makedirs(save_path, exist_ok=True)
+ #             video_path = os.path.join(save_path, uploaded_file.name)
+ #             with open(video_path, "wb") as f:
+ #                 f.write(uploaded_file.read())
+ #             transcription = asyncio.run(process_video(video_path))  # Fetch transcription text
+ #             st.session_state.transcription = transcription
+ #             st.success("Transcription complete!")
+ #         else:
+ #             st.error("Please upload a video file.")
+
+ #     # Handle LLM response after transcription
+ #     if "transcription" in st.session_state:
+ #         response = asyncio.run(async_invoke_chain(st.session_state.models["Gemini"], transcription))  # Correcting the input here
+ #         knowledge = [Document(page_content=response.content)]
+ #         text_splitter = RecursiveCharacterTextSplitter(separators="\n\n", chunk_size=1500, chunk_overlap=200)
+ #         chunks = text_splitter.split_documents(knowledge)
+
+ #         # Creating the FAISS index
+ #         index = faiss.IndexFlatL2(len(st.session_state.embeddings.embed_query("hello world")))
+
+ #         # Create FAISS vector store for document retrieval
+ #         vector_store = FAISS(
+ #             embedding_function=st.session_state.embeddings,
+ #             index=index,
+ #             docstore=InMemoryDocstore(),
+ #             index_to_docstore_id={},
+ #         )
+
+ #         # Generate unique IDs and add documents to the store
+ #         ids = [str(uuid4()) for _ in range(len(chunks))]
+ #         vector_store.add_documents(documents=chunks, ids=ids)
+
+ #         # Update the mapping between FAISS index and document IDs
+ #         for idx, doc_id in enumerate(ids):
+ #             vector_store.index_to_docstore_id[idx] = doc_id
+
+ #         # Create video retriever with the FAISS index
+ #         video_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
+
+ #         def get_retrieved_context(query):
+ #             retrieved_documents = video_retriever.get_relevant_documents(query)
+ #             return "\n".join(doc.page_content for doc in retrieved_documents)
+
+ #         # User query for video QA
+ #         user_input = st.chat_input("Ask a question about the video:")
+ #         prompt = ChatPromptTemplate.from_messages([(
+ #             "system", "You are an expert in analyzing videos. Use the context: {context} to answer the query."
+ #         ), ("human", "{question}")])
+
+ #         if user_input:
+ #             st.session_state.messages.append({"role": "user", "content": user_input})
+ #             qa_chain = prompt | st.session_state.models["Mistral"] | StrOutputParser()
+ #             context = get_retrieved_context(user_input)
+ #             response_message = asyncio.run(async_invoke_chain(qa_chain, {"question": user_input, "context": context}))
+ #             st.session_state.messages.append({"role": "assistant", "content": response_message})
+ #             for message in st.session_state.messages:
+ #                 st.chat_message(message["role"]).markdown(message["content"])
+
+ # # Handle image processing
+ # elif modal_choice == "Image":
+ #     st.header("Step 2: Upload Image for Question Answering")
+ #     uploaded_file = st.file_uploader("Upload an image (.jpeg, .jpg, .png, etc.):", type=["jpeg", "jpg", "png"])
+
+ #     if uploaded_file:
+ #         st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
+ #         image_data = base64.b64encode(uploaded_file.read()).decode("utf-8")
+
+ #         message = HumanMessage(content=[{
+ #             "type": "text", "text": "Describe what is in the image in detail."
+ #         }, {
+ #             "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}
+ #         }])
+ #         response = asyncio.run(async_invoke_chain(st.session_state.models["Gemini"], [message]))
+ #         knowledge = [Document(page_content=response.content)]
+
+ #         # Split text into chunks for indexing
+ #         text_splitter = RecursiveCharacterTextSplitter(separators="\n\n", chunk_size=1500, chunk_overlap=200)
+ #         chunks = text_splitter.split_documents(knowledge)
+
+ #         # Create FAISS IndexHNSWFlat for indexing image embeddings
+ #         index = faiss.IndexFlatL2(len(st.session_state.embeddings.embed_query("hello world")))
+
+ #         # Create FAISS vector store for document retrieval
+ #         vector_store = FAISS(
+ #             embedding_function=st.session_state.embeddings,
+ #             index=index,
+ #             docstore=InMemoryDocstore(),
+ #             index_to_docstore_id={},
+ #         )
+
+ #         # Generate unique IDs and add documents to the store
+ #         ids = [str(uuid4()) for _ in range(len(chunks))]
+ #         vector_store.add_documents(documents=chunks, ids=ids)
+
+ #         # Update the mapping between FAISS index and document IDs
+ #         for idx, doc_id in enumerate(ids):
+ #             vector_store.index_to_docstore_id[idx] = doc_id
+
+ #         # Create image retriever with the FAISS index
+ #         image_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 6})
+
+ #         def get_retrieved_context(query):
+ #             retrieved_documents = image_retriever.get_relevant_documents(query)
+ #             return "\n".join(doc.page_content for doc in retrieved_documents)
+
+ #         # User query for image QA
+ #         user_input = st.chat_input("Ask a question about the image:")
+ #         prompt = ChatPromptTemplate.from_messages([(
+ #             "system", "You are an expert in analyzing images. Use the context: {context} to answer the query."
+ #         ), ("human", "{question}")])
+
+ #         if user_input:
+ #             st.session_state.messages.append({"role": "user", "content": user_input})
+ #             qa_chain = prompt | st.session_state.models["Mistral"] | StrOutputParser()
+ #             context = get_retrieved_context(user_input)
+ #             response_message = asyncio.run(async_invoke_chain(qa_chain, {"question": user_input, "context": context}))
+ #             st.session_state.messages.append({"role": "assistant", "content": response_message})
+ #             for message in st.session_state.messages:
+ #                 st.chat_message(message["role"]).markdown(message["content"])
+
+ # elif modal_choice == "Document":
+ #     st.header("Step 3: Upload Document for Question Answering")
+ #     uploaded_doc = st.file_uploader("Upload a document (.pdf, .docx, etc.):", type=["pdf", "docx"])
+ #     if uploaded_doc:
+ #         pass
+ import streamlit as st
+
+ # Set up Streamlit page
+ st.set_page_config(
+     page_title="Multi-Modal RAG",
+     page_icon=":red_circle:",
+     initial_sidebar_state="expanded",
+     layout="wide",
+     menu_items={
+         'Get Help': 'https://www.extremelycoolapp.com/help',
+         'Report a bug': "https://www.extremelycoolapp.com/bug",
+         'About': "# This is a header. This is an *extremely* cool app!"
+     }
+ )
+
+ # Title of the web app
+ st.title("Multi-Modal Retrieval-Augmented Generation (RAG)")
+
+ # Explanation for Non-Technical Users
+ st.write("""
+ **Multi-Modal RAG** stands for **Multi-Modal Retrieval-Augmented Generation**: a technique that lets you ask questions about different types of media, such as videos, images, or documents, and get answers powered by artificial intelligence.
+
+ In this app, you can interact with the following features:
+
+ 1. **Video Question Answering**: Upload a video, and the app will transcribe it. You can then ask questions about the video's content.
+ 2. **Image Question Answering**: Upload an image, and the app will describe it. You can ask questions about the contents of the image.
+ 3. **Document Question Answering**: Upload a document (PDF, Word, etc.), and the app will extract the relevant information to answer your questions.
+
+ Here's how it works:
+
+ - **Video QA**: The app first transcribes the video to text. You can then ask any question about the video; based on the transcription, it retrieves relevant information to answer your query.
+
+ - **Image QA**: The app analyzes the uploaded image and describes its contents. You can then ask questions about what's in the image.
+
+ - **Document QA**: The app extracts key information from the uploaded document (such as a PDF or Word file) to answer your questions.
+
+ Each feature combines AI models with retrieval over indexed content to give you the best possible answers.
+ """)
+
+ # Sidebar Options
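The file ends at the `# Sidebar Options` comment with the sidebar not yet written. Judging from the commented-out draft earlier in this file, it would likely look something like this sketch (wording lifted from that draft; a placeholder, not the author's final code):

```python
st.sidebar.title("Options")
st.sidebar.write("""
1. Video Question Answering RAG (.mp4, .webm, etc.)
2. Image Question Answering RAG (.jpeg, .jpg, .png, etc.)
3. Document Question Answering RAG (.pdf, .docx, etc.)
""")
```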