File size: 11,402 Bytes
a047f73
 
 
 
 
4d40501
a047f73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
import gradio as gr
import openai
import base64
from PIL import Image
import io
import fitz

# ---------- PDF Text Extraction ----------
def extract_text_from_pdf(pdf_file):
    """Extract all text from a PDF using PyMuPDF.

    Args:
        pdf_file: A path (or path-like object) accepted by ``fitz.open``.

    Returns:
        The concatenated text of every page, or an error-message string
        starting with "Error extracting text from PDF:" on failure.
    """
    try:
        # Context manager guarantees the document handle is closed even if
        # get_text() raises mid-way (the original leaked it in that case).
        with fitz.open(pdf_file) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

# ---------- PDF Quiz Generation ----------
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    """Generate multiple-choice quiz questions from extracted PDF text.

    Args:
        pdf_content: Plain text extracted from the uploaded PDF.
        num_questions: How many questions to ask the model for.
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model name passed straight to the API.

    Returns:
        The model's quiz text, or an error-message string on failure
        (missing key or API error) — callers display either verbatim.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key
    # Slicing is already a no-op when the text is shorter than the cap, so
    # no length check is needed; the cap keeps the prompt within context.
    limited_content = pdf_content[:8000]

    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Create a clear question based on key concepts in the document
2. Provide 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain the correct answer
Document content:
{limited_content}
"""

    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {str(e)}"

# ---------- Image Processing ----------
def generate_image_response(input_text, image, openai_api_key, model_choice):
    """Answer a question about an uploaded image via the OpenAI chat API.

    Args:
        input_text: The user's question about the image.
        image: A PIL image (saved as PNG and sent as a base64 data URL).
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Vision-capable chat model name.

    Returns:
        The model's answer, or an error-message string on failure.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key

    # Serialize the PIL image to PNG bytes, then base64-encode it so it can
    # travel inline as a data URL in the message payload.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": input_text},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_png}"}},
        ],
    }

    try:
        completion = openai.ChatCompletion.create(
            model=model_choice,
            messages=[user_message],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error processing image: {str(e)}"

# ---------- Voice Processing ----------
def process_voice_input(audio_path, openai_api_key, model_choice):
    """Transcribe an audio file with Whisper, then answer the transcript.

    Args:
        audio_path: Filesystem path to the recorded/uploaded audio.
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model used to answer the transcribed prompt.

    Returns:
        A ``(response_text, transcribed_text)`` tuple. On error the first
        element is an error message and the second is "".
    """
    if not openai_api_key:
        # BUG FIX: must return a 2-tuple — the caller unpacks
        # `response, transcribed_text`, so the original bare string here
        # raised ValueError at the call site.
        return "Error: No API key provided.", ""

    try:
        openai.api_key = openai_api_key
        # `with` closes the audio file even when transcription raises
        # (the original leaked the handle on that path).
        with open(audio_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        prompt = transcript["text"]

        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content, prompt
    except Exception as e:
        return f"Error processing voice: {str(e)}", ""

# ---------- Unified Chatbot Handler ----------
def chatbot(input_text, image, pdf_file, audio_file, openai_api_key, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, audio_mode, history):
    """Route one submission to quiz, voice, image, or plain-text handling.

    Returns a tuple matching the submit-button wiring: cleared
    text/image/pdf/audio inputs, the current PDF text state, and the
    updated chat history.
    """
    if history is None:
        history = []

    # Refresh the stored PDF text whenever a new file was just uploaded;
    # otherwise keep whatever state we already had.
    updated_pdf_text = extract_text_from_pdf(pdf_file) if pdf_file is not None else pdf_content

    if pdf_quiz_mode:
        # PDF quiz mode: needs extracted text before a quiz can be built.
        if not updated_pdf_text:
            history.append(("πŸ‘€: [PDF Quiz]", "πŸ€–: Please upload a PDF file first."))
        else:
            quiz = generate_mcq_quiz(updated_pdf_text, int(num_quiz_questions), openai_api_key, model_choice)
            history.append((f"πŸ‘€: [PDF Quiz - {num_quiz_questions} questions]", f"πŸ€–: {quiz}"))
    elif audio_mode:
        # Voice mode: transcribe then answer.
        if audio_file is None:
            history.append(("πŸ‘€: [Audio]", "πŸ€–: Please upload or record an audio file."))
        else:
            answer, heard = process_voice_input(audio_file, openai_api_key, model_choice)
            history.append((f"πŸ‘€ (Voice): {heard}", f"πŸ€–: {answer}"))
    elif image is not None:
        # Image mode: question + picture.
        answer = generate_image_response(input_text, image, openai_api_key, model_choice)
        history.append((f"πŸ‘€: {input_text or '[Image]'}", f"πŸ€–: {answer}"))
    elif input_text:
        # Plain text chat when no attachment of any kind was provided.
        try:
            openai.api_key = openai_api_key
            completion = openai.ChatCompletion.create(
                model=model_choice,
                messages=[{"role": "user", "content": input_text}]
            )
            history.append((f"πŸ‘€: {input_text}", f"πŸ€–: {completion.choices[0].message.content}"))
        except Exception as e:
            history.append((f"πŸ‘€: {input_text}", f"πŸ€–: Error: {str(e)}"))

    return "", None, None, None, updated_pdf_text, history

# ---------- Clear Chat ----------
def clear_history():
    """Reset every input widget, the stored PDF text, and the chat log."""
    cleared_inputs = ("", None, None, None)
    return (*cleared_inputs, "", [])

# ---------- Input Type Toggle ----------
def update_input_type(choice):
    """Show/hide input widgets and set mode flags for the chosen input type.

    Args:
        choice: One of "Image", "PDF(QUIZ)", or "Audio" from the radio group.

    Returns:
        An 8-tuple of ``gr.update`` objects for, in order: input_text,
        image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode,
        audio_mode, model_hint — matching the ``input_type.change`` wiring.
    """
    if choice == "Image":
        hint_text = """
        πŸ’‘ **Image Mode Tips:**
        - Both **o1** and **o3-mini** support image analysis
        - o1 provides more detailed analysis but costs more
        - o3-mini is faster and more cost-effective for simple image questions
        """
        return (
            gr.update(visible=True),   # input_text
            gr.update(visible=True),   # image_input
            gr.update(visible=False),  # pdf_input
            gr.update(visible=False),  # audio_input
            gr.update(visible=False),  # quiz_slider
            gr.update(value=False),    # pdf_quiz_mode
            gr.update(value=False),    # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "PDF(QUIZ)":
        hint_text = """
        πŸ“š **PDF Quiz Mode Tips:**
        - Both models can generate quizzes from PDF content
        - o1 creates more comprehensive and detailed questions
        - o3-mini generates quizzes faster with good quality
        - Large PDFs are automatically limited to first 8000 characters
        """
        return (
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # image_input
            gr.update(visible=True),   # pdf_input
            gr.update(visible=False),  # audio_input
            gr.update(visible=True),   # quiz_slider
            gr.update(value=True),     # pdf_quiz_mode
            gr.update(value=False),    # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "Audio":
        hint_text = """
        🎀 **Audio Mode Tips:**
        - **Important:** Audio transcription uses OpenAI's `whisper-1` model (separate cost)
        - **gpt-4 transcribe**: More sophisticated responses but higher cost per token
        - **gpt-4-mini-transcribe**: Cost-effective for most audio conversations
        - Supports common audio formats (MP3, WAV, M4A, etc.)
        - Maximum audio file size: 25MB
        """

        return (
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # image_input
            gr.update(visible=False),  # pdf_input
            gr.update(visible=True),   # audio_input
            gr.update(visible=False),  # quiz_slider
            gr.update(value=False),    # pdf_quiz_mode
            gr.update(value=True),     # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    # BUG FIX: the original fell through here and implicitly returned None,
    # which violates the 8-output contract of the change handler. Emit no-op
    # updates so an unexpected value leaves the UI unchanged instead of
    # crashing.
    return tuple(gr.update() for _ in range(8))

# ---------- CSS Styling ----------
# Stylesheet injected via gr.Blocks(css=custom_css):
# - .gradio-container: base font and light page background
# - .gradio-header: purple gradient banner used by the Markdown header
# - #submit-btn / #clear-history: gradient buttons matched by elem_id
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
    background-color: #f0f4f8;
}
.gradio-header {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
    color: white;
    padding: 20px;
    border-radius: 8px;
    text-align: center;
}
#submit-btn {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
    color: white;
    border-radius: 8px;
}
#clear-history {
    background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%);
    color: white;
    border-radius: 8px;
}
"""

# ---------- UI Interface ----------
def create_interface():
    """Build and return the Gradio Blocks app.

    Lays out the header, API-key box, mode selector, per-mode inputs, model
    dropdown, and chat log, then wires the mode toggle to
    ``update_input_type`` and the submit/clear buttons to ``chatbot`` /
    ``clear_history``.
    """
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
            <div class="gradio-header">
                <h1>Multimodal Chatbot (Image + PDF Quiz + Voice)</h1>
                <h3>Ask via image, PDF, or voice</h3>
            </div>
        """)

        with gr.Accordion("Instructions", open=False):
            gr.Markdown("""
                - **Image Chat**: Upload an image and ask about it
                - **PDF Quiz**: Upload a PDF and generate MCQs
                - **Audio Chat**: Upload or record audio to chat
                - Always provide your OpenAI API key
            """)

        # State variables
        # Holds the text extracted from the last uploaded PDF across turns.
        pdf_content = gr.State("")

        with gr.Row():
            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...")

        with gr.Row():
            input_type = gr.Radio(["Image", "PDF(QUIZ)", "Audio"], label="Input Type", value="Image")

        # Model-specific hints that appear based on input type
        model_hint = gr.Markdown("", visible=False)

        # Input components row - all in one organized row
        # Visibility of each widget is controlled by update_input_type.
        with gr.Row():
            input_text = gr.Textbox(label="Question (for images)", visible=True)
            image_input = gr.Image(label="Upload Image", type="pil", visible=True)
            pdf_input = gr.File(label="Upload PDF", visible=False)
            audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
            quiz_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Questions", visible=False)

        # Hidden state components for mode control
        pdf_quiz_mode = gr.Checkbox(visible=False, value=False)
        audio_mode = gr.Checkbox(visible=False, value=False)

        with gr.Row():
            # NOTE(review): this dropdown mixes chat models with transcription
            # models (whisper-1, *-transcribe); selecting a transcription model
            # as the chat model will likely fail at the API — confirm intended.
            model_choice = gr.Dropdown(["o1", "o3-mini","whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe",], label="Model", value="o1")
            submit_btn = gr.Button("Submit", elem_id="submit-btn")
            clear_btn = gr.Button("Clear History", elem_id="clear-history")

        chat_history = gr.Chatbot()

        # Event handlers
        # Toggling the mode shows/hides widgets and flips the hidden flags.
        input_type.change(
            update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode, model_hint]
        )

        # Submit routes everything through the unified chatbot handler;
        # output order must match chatbot's return tuple.
        submit_btn.click(
            chatbot,
            inputs=[input_text, image_input, pdf_input, audio_input, openai_api_key, model_choice,
                   pdf_content, quiz_slider, pdf_quiz_mode, audio_mode, chat_history],
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )

        clear_btn.click(
            clear_history,
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )

    return demo

# ---------- Launch ----------
if __name__ == "__main__":
    # Build the UI and start the local Gradio server (blocking call).
    demo = create_interface()
    demo.launch()