File size: 11,402 Bytes
a047f73
 
 
 
 
4d40501
a047f73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
import gradio as gr
import openai
import base64
from PIL import Image
import io
import fitz

# ---------- PDF Text Extraction ----------
def extract_text_from_pdf(pdf_file):
    """Extract all text from a PDF using PyMuPDF.

    Args:
        pdf_file: A path (or path-like object) accepted by ``fitz.open``.

    Returns:
        The concatenated text of every page, or an error-message string
        starting with "Error extracting text from PDF:" on failure.
    """
    try:
        # Context manager guarantees the document handle is closed even if
        # get_text() raises mid-way (the original leaked it in that case).
        with fitz.open(pdf_file) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

# ---------- PDF Quiz Generation ----------
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    """Generate multiple-choice quiz questions from extracted PDF text.

    Args:
        pdf_content: Plain text extracted from the uploaded PDF.
        num_questions: How many questions to ask the model for.
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model name passed straight to the API.

    Returns:
        The model's quiz text, or an error-message string on failure
        (missing key or API error) — callers display either verbatim.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key
    # Slicing is already a no-op when the text is shorter than the cap, so
    # no length check is needed; the cap keeps the prompt within context.
    limited_content = pdf_content[:8000]

    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Create a clear question based on key concepts in the document
2. Provide 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain the correct answer
Document content:
{limited_content}
"""

    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {str(e)}"

# ---------- Image Processing ----------
def generate_image_response(input_text, image, openai_api_key, model_choice):
    """Answer a question about an uploaded image via the OpenAI chat API.

    Args:
        input_text: The user's question about the image.
        image: A PIL image (saved as PNG and sent as a base64 data URL).
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Vision-capable chat model name.

    Returns:
        The model's answer, or an error-message string on failure.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key

    # Serialize the PIL image to PNG bytes, then base64-encode it so it can
    # travel inline as a data URL in the message payload.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": input_text},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_png}"}},
        ],
    }

    try:
        completion = openai.ChatCompletion.create(
            model=model_choice,
            messages=[user_message],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error processing image: {str(e)}"

# ---------- Voice Processing ----------
def process_voice_input(audio_path, openai_api_key, model_choice):
    """Transcribe an audio file with Whisper, then answer the transcript.

    Args:
        audio_path: Filesystem path to the recorded/uploaded audio.
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model used to answer the transcribed prompt.

    Returns:
        A ``(response_text, transcribed_text)`` tuple. On error the first
        element is an error message and the second is "".
    """
    if not openai_api_key:
        # BUG FIX: must return a 2-tuple — the caller unpacks
        # `response, transcribed_text`, so the original bare string here
        # raised ValueError at the call site.
        return "Error: No API key provided.", ""

    try:
        openai.api_key = openai_api_key
        # `with` closes the audio file even when transcription raises
        # (the original leaked the handle on that path).
        with open(audio_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        prompt = transcript["text"]

        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content, prompt
    except Exception as e:
        return f"Error processing voice: {str(e)}", ""

# ---------- Unified Chatbot Handler ----------
def chatbot(input_text, image, pdf_file, audio_file, openai_api_key, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, audio_mode, history):
    """Route one submission to quiz, voice, image, or plain-text handling.

    Returns a tuple matching the submit-button wiring: cleared
    text/image/pdf/audio inputs, the current PDF text state, and the
    updated chat history.
    """
    if history is None:
        history = []

    # Refresh the stored PDF text whenever a new file was just uploaded;
    # otherwise keep whatever state we already had.
    updated_pdf_text = extract_text_from_pdf(pdf_file) if pdf_file is not None else pdf_content

    if pdf_quiz_mode:
        # PDF quiz mode: needs extracted text before a quiz can be built.
        if not updated_pdf_text:
            history.append(("πŸ‘€: [PDF Quiz]", "πŸ€–: Please upload a PDF file first."))
        else:
            quiz = generate_mcq_quiz(updated_pdf_text, int(num_quiz_questions), openai_api_key, model_choice)
            history.append((f"πŸ‘€: [PDF Quiz - {num_quiz_questions} questions]", f"πŸ€–: {quiz}"))
    elif audio_mode:
        # Voice mode: transcribe then answer.
        if audio_file is None:
            history.append(("πŸ‘€: [Audio]", "πŸ€–: Please upload or record an audio file."))
        else:
            answer, heard = process_voice_input(audio_file, openai_api_key, model_choice)
            history.append((f"πŸ‘€ (Voice): {heard}", f"πŸ€–: {answer}"))
    elif image is not None:
        # Image mode: question + picture.
        answer = generate_image_response(input_text, image, openai_api_key, model_choice)
        history.append((f"πŸ‘€: {input_text or '[Image]'}", f"πŸ€–: {answer}"))
    elif input_text:
        # Plain text chat when no attachment of any kind was provided.
        try:
            openai.api_key = openai_api_key
            completion = openai.ChatCompletion.create(
                model=model_choice,
                messages=[{"role": "user", "content": input_text}]
            )
            history.append((f"πŸ‘€: {input_text}", f"πŸ€–: {completion.choices[0].message.content}"))
        except Exception as e:
            history.append((f"πŸ‘€: {input_text}", f"πŸ€–: Error: {str(e)}"))

    return "", None, None, None, updated_pdf_text, history

# ---------- Clear Chat ----------
def clear_history():
    """Reset every input widget, the stored PDF text, and the chat log."""
    cleared_inputs = ("", None, None, None)
    return (*cleared_inputs, "", [])

# ---------- Input Type Toggle ----------
def update_input_type(choice):
    """Show/hide input widgets and set mode flags for the chosen input type.

    Args:
        choice: One of "Image", "PDF(QUIZ)", or "Audio" from the radio group.

    Returns:
        An 8-tuple of ``gr.update`` objects for, in order: input_text,
        image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode,
        audio_mode, model_hint — matching the ``input_type.change`` wiring.
    """
    if choice == "Image":
        hint_text = """
        πŸ’‘ **Image Mode Tips:**
        - Both **o1** and **o3-mini** support image analysis
        - o1 provides more detailed analysis but costs more
        - o3-mini is faster and more cost-effective for simple image questions
        """
        return (
            gr.update(visible=True),   # input_text
            gr.update(visible=True),   # image_input
            gr.update(visible=False),  # pdf_input
            gr.update(visible=False),  # audio_input
            gr.update(visible=False),  # quiz_slider
            gr.update(value=False),    # pdf_quiz_mode
            gr.update(value=False),    # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "PDF(QUIZ)":
        hint_text = """
        πŸ“š **PDF Quiz Mode Tips:**
        - Both models can generate quizzes from PDF content
        - o1 creates more comprehensive and detailed questions
        - o3-mini generates quizzes faster with good quality
        - Large PDFs are automatically limited to first 8000 characters
        """
        return (
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # image_input
            gr.update(visible=True),   # pdf_input
            gr.update(visible=False),  # audio_input
            gr.update(visible=True),   # quiz_slider
            gr.update(value=True),     # pdf_quiz_mode
            gr.update(value=False),    # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "Audio":
        hint_text = """
        🎀 **Audio Mode Tips:**
        - **Important:** Audio transcription uses OpenAI's `whisper-1` model (separate cost)
        - **gpt-4 transcribe**: More sophisticated responses but higher cost per token
        - **gpt-4-mini-transcribe**: Cost-effective for most audio conversations
        - Supports common audio formats (MP3, WAV, M4A, etc.)
        - Maximum audio file size: 25MB
        """

        return (
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # image_input
            gr.update(visible=False),  # pdf_input
            gr.update(visible=True),   # audio_input
            gr.update(visible=False),  # quiz_slider
            gr.update(value=False),    # pdf_quiz_mode
            gr.update(value=True),     # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    # BUG FIX: the original fell through here and implicitly returned None,
    # which violates the 8-output contract of the change handler. Emit no-op
    # updates so an unexpected value leaves the UI unchanged instead of
    # crashing.
    return tuple(gr.update() for _ in range(8))

# ---------- CSS Styling ----------
# Stylesheet injected via gr.Blocks(css=custom_css):
# - .gradio-container: base font and light page background
# - .gradio-header: purple gradient banner used by the Markdown header
# - #submit-btn / #clear-history: gradient buttons matched by elem_id
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
    background-color: #f0f4f8;
}
.gradio-header {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
    color: white;
    padding: 20px;
    border-radius: 8px;
    text-align: center;
}
#submit-btn {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
    color: white;
    border-radius: 8px;
}
#clear-history {
    background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%);
    color: white;
    border-radius: 8px;
}
"""

# ---------- UI Interface ----------
def create_interface():
    """Build and return the Gradio Blocks app.

    Lays out the header, API-key box, mode selector, per-mode inputs, model
    dropdown, and chat log, then wires the mode toggle to
    ``update_input_type`` and the submit/clear buttons to ``chatbot`` /
    ``clear_history``.
    """
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
            <div class="gradio-header">
                <h1>Multimodal Chatbot (Image + PDF Quiz + Voice)</h1>
                <h3>Ask via image, PDF, or voice</h3>
            </div>
        """)

        with gr.Accordion("Instructions", open=False):
            gr.Markdown("""
                - **Image Chat**: Upload an image and ask about it
                - **PDF Quiz**: Upload a PDF and generate MCQs
                - **Audio Chat**: Upload or record audio to chat
                - Always provide your OpenAI API key
            """)

        # State variables
        # Holds the text extracted from the last uploaded PDF across turns.
        pdf_content = gr.State("")

        with gr.Row():
            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...")

        with gr.Row():
            input_type = gr.Radio(["Image", "PDF(QUIZ)", "Audio"], label="Input Type", value="Image")

        # Model-specific hints that appear based on input type
        model_hint = gr.Markdown("", visible=False)

        # Input components row - all in one organized row
        # Visibility of each widget is controlled by update_input_type.
        with gr.Row():
            input_text = gr.Textbox(label="Question (for images)", visible=True)
            image_input = gr.Image(label="Upload Image", type="pil", visible=True)
            pdf_input = gr.File(label="Upload PDF", visible=False)
            audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
            quiz_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Questions", visible=False)

        # Hidden state components for mode control
        pdf_quiz_mode = gr.Checkbox(visible=False, value=False)
        audio_mode = gr.Checkbox(visible=False, value=False)

        with gr.Row():
            # NOTE(review): this dropdown mixes chat models with transcription
            # models (whisper-1, *-transcribe); selecting a transcription model
            # as the chat model will likely fail at the API — confirm intended.
            model_choice = gr.Dropdown(["o1", "o3-mini","whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe",], label="Model", value="o1")
            submit_btn = gr.Button("Submit", elem_id="submit-btn")
            clear_btn = gr.Button("Clear History", elem_id="clear-history")

        chat_history = gr.Chatbot()

        # Event handlers
        # Toggling the mode shows/hides widgets and flips the hidden flags.
        input_type.change(
            update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode, model_hint]
        )

        # Submit routes everything through the unified chatbot handler;
        # output order must match chatbot's return tuple.
        submit_btn.click(
            chatbot,
            inputs=[input_text, image_input, pdf_input, audio_input, openai_api_key, model_choice,
                   pdf_content, quiz_slider, pdf_quiz_mode, audio_mode, chat_history],
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )

        clear_btn.click(
            clear_history,
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )

    return demo

# ---------- Launch ----------
if __name__ == "__main__":
    # Build the UI and start the local Gradio server (blocking call).
    demo = create_interface()
    demo.launch()