Spaces:

RaghuCourage9605
/

Multi-Modal-RAG

Runtime error

App Files Files Community

RaghuCourage9605 committed on Jan 21, 2025

Commit

511a3eb

verified ·

1 Parent(s): a44bd4f

Upload 4 files

Browse files

Files changed (4) hide show

pages/1_Document_QA.py +76 -0
pages/2_Image_QA.py +160 -0
pages/3_Video_QA.py +200 -0
pages/4_Audio_QA.py +0 -0

pages/1_Document_QA.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import streamlit as st
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import (
+    PythonLoader,
+    PyMuPDFLoader,
+    WikipediaLoader,
+)
+from langchain.vectorstores import FAISS
+from langchain_community.docstore import InMemoryDocstore
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_cerebras import ChatCerebras
+from langchain_mistralai import ChatMistralAI
+from langchain_core.messages import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema import StrOutputParser
+from uuid import uuid4
+import numpy as np
+import faiss
+import whisper
+import torch
+import os
+from dotenv import load_dotenv
+import logging
+import base64
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+if "models" not in st.session_state:
+    st.session_state.models = {
+        "Gemini": ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash-exp",
+            temperature=0.8,
+            verbose=True,
+            api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY")
+        ),
+        "Mistral": ChatMistralAI(
+            model_name="open-mistral-nemo",
+            temperature=0.8,
+            verbose=True
+        ),
+        "Llama": ChatCerebras(
+            model="llama-3.3-70b",
+            temperature=0.8,
+            verbose=True,
+            api_key=os.getenv("CEREBRAS_API_KEY")
+        )
+    }
+if "embeddings" not in st.session_state:
+    model_name = "sentence-transformers/all-mpnet-base-v2"
+    model_kwargs = {'device': 'cpu'}
+    encode_kwargs = {'normalize_embeddings': False}
+    st.session_state.embeddings = HuggingFaceEmbeddings(
+        model_name=model_name,
+        model_kwargs=model_kwargs,
+        encode_kwargs=encode_kwargs
+    )
+st.header(" 1. 🗂️ ✨ Document Question Answering")
+st.write("""
+        In this section, you can upload a document files and query its content for answers.
+        The system will process the document and allow you to ask specific questions about the text in the document.
+Here’s a list of the different types of document files for Chatting with your Doc:
+- Text Files (.txt)
+- PDF Files (.pdf)
+- Word Documents (.doc, .docx)
+- ArXiV Papers (.pdf)
+         """)
+uploaded_doc = st.file_uploader("Upload the required documents (.pdf,.docx,)",type=[".pdf","docx"])

pages/2_Image_QA.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import streamlit as st
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain_community.docstore import InMemoryDocstore
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_core.messages import HumanMessage
+from langchain_cerebras import ChatCerebras
+from langchain_mistralai import ChatMistralAI
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema import StrOutputParser
+from uuid import uuid4
+import faiss
+import os
+from dotenv import load_dotenv
+import logging
+import httpx
+import base64
+import asyncio
+# Initialize environment variables and logging.
+# load_dotenv() runs first so that the os.getenv() calls further down can see
+# values defined in a local .env file; logging is configured at INFO level and
+# a logger named after this module is created.
+load_dotenv()
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Async function to invoke chain
+async def async_invoke_chain(chain, input_data):
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(None, chain.invoke, input_data)
+# Initialize session state for messages and models
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+if "models" not in st.session_state:
+    st.session_state.models = {
+        "Gemini": ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash-exp",
+            temperature=0.8,
+            verbose=True,
+            api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY")
+        ),
+        "Mistral": ChatMistralAI(
+            model_name="open-mistral-nemo",
+            temperature=0.8,
+            verbose=True
+        ),
+        "Llama": ChatCerebras(
+            model="llama-3.3-70b",
+            temperature=0.8,
+            verbose=True,
+            api_key=os.getenv("CEREBRAS_API_KEY")
+        )
+    }
+# Initialize embeddings model
+if "embeddings" not in st.session_state:
+    model_name = "sentence-transformers/all-mpnet-base-v2"
+    model_kwargs = {'device': 'cpu'}
+    encode_kwargs = {'normalize_embeddings': False}
+    st.session_state.embeddings = HuggingFaceEmbeddings(
+        model_name=model_name,
+        model_kwargs=model_kwargs,
+        encode_kwargs=encode_kwargs
+    )
+st.header("📸📈📊 ֎ Image Content Analysis and Question Answering")
+# Brief overview for image content analysis
+description = """
+Upload an image, and the AI will analyze its content and answer your questions.
+It can interpret various types of images including:
+- General imagery (objects, people, scenes)
+- Diagrams, graphs, and data visualizations
+- Scientific and medical images
+- Text-based images (documents, screenshots)
+"""
+# Display the brief description
+st.write(description)
+# File upload and URL input
+st.header("Upload Image for Question Answering")
+uploaded_file = st.file_uploader("Upload an image (.jpeg, .jpg, .png, etc.):", type=["jpeg", "jpg", "png"])
+st.header("Or Enter the Image URL :")
+image_url = st.text_input("Enter the image URL")
+image_data = None
+if uploaded_file:
+    st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
+    image_data = base64.b64encode(uploaded_file.read()).decode("utf-8")
+elif image_url:
+    try:
+        with httpx.Client() as client:
+            response = client.get(image_url)
+            response.raise_for_status()
+            st.image(response.content, caption="Image from URL", use_column_width=True)
+            image_data = base64.b64encode(response.content).decode("utf-8")
+    except Exception as e:
+        st.error(f"Error fetching image from URL: {e}")
+if image_data:
+    message = HumanMessage(content=[{
+            "type": "text", "text": "Describe what is in the image in detail."
+        }, {
+            "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}
+        }])
+    # Generate response from the model
+    response = asyncio.run(async_invoke_chain(st.session_state.models["Gemini"], [message]))
+    knowledge = [Document(page_content=response.content)]
+    # Split text into chunks for indexing
+    text_splitter = RecursiveCharacterTextSplitter(separators="\n\n", chunk_size=1500, chunk_overlap=200)
+    chunks = text_splitter.split_documents(knowledge)
+    # Create FAISS IndexHNSWFlat for indexing image embeddings
+    index = faiss.IndexFlatL2(len(st.session_state.embeddings.embed_query("hello world")))
+    # Create FAISS vector store for document retrieval
+    vector_store = FAISS(
+        embedding_function=st.session_state.embeddings,
+        index=index,
+        docstore=InMemoryDocstore(),
+        index_to_docstore_id={},
+    )
+    # Generate unique IDs and add documents to the store
+    ids = [str(uuid4()) for _ in range(len(chunks))]
+    vector_store.add_documents(documents=chunks, ids=ids)
+    # Update the mapping between FAISS index and document IDs
+    for idx, doc_id in enumerate(ids):
+        vector_store.index_to_docstore_id[idx] = doc_id
+    # Create image retriever with the FAISS index
+    image_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 6})
+    def get_retrieved_context(query):
+        retrieved_documents = image_retriever.get_relevant_documents(query)
+        return "\n".join(doc.page_content for doc in retrieved_documents)
+    # User query for image QA
+    user_input = st.chat_input("Ask a question about the image:")
+    prompt = ChatPromptTemplate.from_messages([(
+            "system", "You are an expert in analyzing images. Use the context: {context} to answer the query."
+        ), ("human", "{question}")])
+    if user_input:
+        st.session_state.messages.append({"role": "user", "content": user_input})
+        qa_chain = prompt | st.session_state.models["Mistral"] | StrOutputParser()
+        context = get_retrieved_context(user_input)
+        response_message = asyncio.run(async_invoke_chain(qa_chain, {"question": user_input, "context": context}))
+        st.session_state.messages.append({"role": "assistant", "content": response_message})
+        for message in st.session_state.messages:
+            st.chat_message(message["role"]).markdown(message["content"])

pages/3_Video_QA.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import streamlit as st
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain_community.docstore import InMemoryDocstore
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_cerebras import ChatCerebras
+from langchain_mistralai import ChatMistralAI
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema import StrOutputParser
+from uuid import uuid4
+import faiss
+import whisper
+import yt_dlp
+import torch
+import os
+from dotenv import load_dotenv
+import logging
+import asyncio
+# Load variables from a local .env file before any os.getenv() call below,
+# then configure INFO-level logging and create a logger named after this module.
+load_dotenv()
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+async def async_invoke_chain(chain, input_data):
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(None, chain.invoke, input_data)
+if "models" not in st.session_state:
+    st.session_state.models = {
+        "Gemini": ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash-exp",
+            temperature=0.8,
+            verbose=True,
+            api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY"),
+        ),
+        "Mistral": ChatMistralAI(
+            model_name="open-mistral-nemo", temperature=0.8, verbose=True
+        ),
+        "Llama": ChatCerebras(
+            model="llama-3.3-70b",
+            temperature=0.8,
+            verbose=True,
+            api_key=os.getenv("CEREBRAS_API_KEY"),
+        ),
+    }
+# Initialize embeddings
+if "embeddings" not in st.session_state:
+    model_name = "sentence-transformers/all-mpnet-base-v2"
+    model_kwargs = {"device": "cpu"}
+    encode_kwargs = {"normalize_embeddings": False}
+    st.session_state.embeddings = HuggingFaceEmbeddings(
+        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+    )
+st.header("1. 🎬📽️🎞️֎  Video Question Answering ")
+st.write("""
+    In this section, you can upload a video file or provide a YouTube URL to ask questions related to its content.
+    The system will process the video and provide relevant answers based on the transcription and analysis of the video content.
+""")
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+# Function to transcribe video using whisper
+async def process_video(video_path):
+    st.info("Transcribing video...")
+    model = whisper.load_model("small")
+    model = model.to(device="cuda" if torch.cuda.is_available() else "cpu")
+    loop = asyncio.get_event_loop()
+    result = await loop.run_in_executor(None, model.transcribe, video_path)
+    st.success("Transcription complete")
+    return result["text"]
+# Function to download video from YouTube using yt-dlp
+def download_video(video_url, save_path="./"):
+    try:
+        ydl_opts = {
+            "outtmpl": f"{save_path}/%(title)s.%(ext)s",    # Save path and file name
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([video_url])
+        print("Download complete!")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+# Streamlit UI
+st.header("Upload Video 📤")
+uploaded_file = st.file_uploader(
+    "Upload a video file (.mp4, .webm, .mkv):", type=["mp4", "webm", "mkv"]
+)
+st.header("Provide Youtube Video URL 🔗")
+uploaded_video_url = st.text_input("Enter the URL of the video you want to chat with")
+st.warning(
+    "Please note that processing a YouTube URL may take some time, depending on the video's length and the current load."
+)
+if st.button("Process Video"):
+    if uploaded_video_url:
+        save_path = "./downloads"
+        os.makedirs(save_path, exist_ok=True)
+        download_video(uploaded_video_url, save_path)  # Download the video from YouTube
+        video_file = next(
+            (f for f in os.listdir(save_path) if f.endswith((".mp4", ".webm", ".mkv"))),
+            None,
+        )
+        if video_file:
+            video_path = os.path.join(save_path, video_file)
+            transcription = asyncio.run(
+                process_video(video_path)
+            )  # Fetch transcription text
+            st.session_state.transcription = transcription
+            st.success("Transcription complete!")
+        else:
+            st.error("No video file found after download.")
+    elif uploaded_file:
+        save_path = "./uploads"
+        os.makedirs(save_path, exist_ok=True)
+        video_path = os.path.join(save_path, uploaded_file.name)
+        with open(video_path, "wb") as f:
+            f.write(uploaded_file.read())
+        st.session_state.transcription = asyncio.run(
+            process_video(video_path)
+        )  # Fetch transcription text
+        st.success("Transcription complete!")
+    else:
+        st.error("Please upload a video file or provide a video URL.")
+# Handle LLM response after transcription
+if "transcription" in st.session_state:
+    transcription = st.session_state.transcription
+    response = asyncio.run(async_invoke_chain(st.session_state.models["Gemini"], transcription))
+    knowledge = [Document(page_content=response.content)]
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators="\n\n", chunk_size=1500, chunk_overlap=200
+    )
+    chunks = text_splitter.split_documents(knowledge)
+    index = faiss.IndexFlatL2(
+        len(st.session_state.embeddings.embed_query("hello world"))
+    )
+    # Create FAISS vector store for document retrieval
+    vector_store = FAISS(
+        embedding_function=st.session_state.embeddings,
+        index=index,
+        docstore=InMemoryDocstore(),
+        index_to_docstore_id={},
+    )
+    # Generate unique IDs and add documents to the store
+    ids = [str(uuid4()) for _ in range(len(chunks))]
+    vector_store.add_documents(documents=chunks, ids=ids)
+    # Update the mapping between FAISS index and document IDs
+    for idx, doc_id in enumerate(ids):
+        vector_store.index_to_docstore_id[idx] = doc_id
+    # Create video retriever with the FAISS index
+    video_retriever = vector_store.as_retriever(
+        search_type="similarity", search_kwargs={"k": 4}
+    )
+    def get_retrieved_context(query):
+        retrieved_documents = video_retriever.get_relevant_documents(query)
+        return "\n".join(doc.page_content for doc in retrieved_documents)
+    # User query for video QA
+    user_input = st.chat_input("Ask a question about the video:")
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                "You are an expert in analyzing videos. Use the context: {context} to answer the query.",
+            ),
+            ("human", "{question}"),
+        ]
+    )
+    if user_input:
+        st.session_state.messages.append({"role": "user", "content": user_input})
+        qa_chain = prompt | st.session_state.models["Mistral"] | StrOutputParser()
+        context = get_retrieved_context(user_input)
+        response_message = asyncio.run(
+            async_invoke_chain(qa_chain, {"question": user_input, "context": context})
+        )
+        st.session_state.messages.append(
+            {"role": "assistant", "content": response_message}
+        )
+        for message in st.session_state.messages:
+            st.chat_message(message["role"]).markdown(message["content"])
+else:
+    st.error("No transcription available. Please upload or process a video first.")

pages/4_Audio_QA.py ADDED Viewed

File without changes