"""Multimodal Gradio chatbot: text, image, voice, PDF Q&A and PDF quiz generation.

All model calls go through the legacy ``openai`` module-level API
(``openai.ChatCompletion`` / ``openai.Audio``) that this file was written
against. Helper functions report failures by returning a string that starts
with ``"Error"`` instead of raising, so the UI can display them in the chat.
"""

import base64
import io
import os

import fitz  # PyMuPDF for PDF handling
import gradio as gr
import openai
from PIL import Image

# Model used for image questions (text/PDF/quiz use the model chosen in the UI).
IMAGE_MODEL = "gpt-4.1"
# Only this many leading characters of a PDF are sent when generating a quiz.
QUIZ_CONTENT_CHAR_LIMIT = 8000
# Upper bound on generated tokens for chat answers.
MAX_RESPONSE_TOKENS = 2000

# Error strings/prefixes returned by the helpers; `chatbot` matches on them so
# that a failure message is never mistaken for real user or document content.
NO_API_KEY_ERROR = "Error: No API key provided."
PDF_ERROR_PREFIX = "Error extracting text from PDF: "
TRANSCRIPTION_ERROR_PREFIX = "Error transcribing audio: "


def _resolve_path(file_ref):
    """Return a filesystem path for *file_ref*.

    Accepts a plain path (``str`` / ``os.PathLike``) or an object exposing the
    path as ``.name``.
    NOTE(review): which of the two Gradio passes depends on its version and on
    the component's ``type`` setting -- confirm against the installed release.
    """
    if isinstance(file_ref, (str, os.PathLike)):
        return file_ref
    return getattr(file_ref, "name", file_ref)


def _append_exchange(history, user_message, assistant_message):
    """Append one user/assistant message pair to *history* in place."""
    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": assistant_message})


# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of *pdf_file*.

    Args:
        pdf_file: Path to the PDF, or an object whose ``.name`` is the path.

    Returns:
        The extracted text, or a string starting with ``PDF_ERROR_PREFIX`` if
        opening or reading the document failed.
    """
    try:
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(_resolve_path(pdf_file)) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"{PDF_ERROR_PREFIX}{e}"


# Generate MCQ quiz from PDF
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    """Ask the chosen model to write a multiple-choice quiz about *pdf_content*.

    Args:
        pdf_content: Document text; only the first ``QUIZ_CONTENT_CHAR_LIMIT``
            characters are included in the prompt.
        num_questions: Number of questions to request.
        openai_api_key: API key; an empty value short-circuits with an error.
        model_choice: Model name passed to the chat completion call.

    Returns:
        The model's reply text, or an ``"Error..."`` string on failure.
    """
    if not openai_api_key:
        return NO_API_KEY_ERROR

    openai.api_key = openai_api_key
    limited_content = pdf_content[:QUIZ_CONTENT_CHAR_LIMIT]
    prompt = (
        f"Based on the following document content, generate {num_questions} "
        "multiple-choice quiz questions.\n"
        "For each question:\n"
        "1. Write a clear question\n"
        "2. Give 4 options (A, B, C, D)\n"
        "3. Indicate the correct answer\n"
        "4. Briefly explain why the answer is correct\n"
        "\n"
        "Document:\n"
        f"{limited_content}\n"
    )

    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {e}"


# Convert image to base64
def get_base64_string_from_image(pil_image):
    """Encode *pil_image* as PNG and return the bytes as a base64 ASCII string."""
    buffered = io.BytesIO()
    pil_image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


# Transcribe audio
def transcribe_audio(audio, openai_api_key):
    """Transcribe the audio file at path *audio* with ``whisper-1``.

    Returns:
        The transcription text, ``NO_API_KEY_ERROR`` when no key was given, or
        a string starting with ``TRANSCRIPTION_ERROR_PREFIX`` on failure.
    """
    if not openai_api_key:
        return NO_API_KEY_ERROR

    openai.api_key = openai_api_key
    try:
        with open(audio, "rb") as f:
            file_obj = io.BytesIO(f.read())
        # Keep the real extension so the upload's name matches its content;
        # fall back to the previous fixed name when the path has none.
        base_name = os.path.basename(audio)
        file_obj.name = base_name if os.path.splitext(base_name)[1] else "audio.wav"
        transcription = openai.Audio.transcribe(file=file_obj, model="whisper-1")
        return transcription.text
    except Exception as e:
        return f"{TRANSCRIPTION_ERROR_PREFIX}{e}"


# Generate response for text/pdf (using o1 or o3-mini)
def generate_text_response(input_text, pdf_content, openai_api_key, reasoning_effort, model_choice):
    """Answer *input_text*, optionally grounded in *pdf_content*.

    Args:
        input_text: The user's question (may be empty when only a PDF is given).
        pdf_content: Extracted document text, or an empty value.
        openai_api_key: API key; an empty value short-circuits with an error.
        reasoning_effort: ``"low"`` / ``"medium"`` / ``"high"``; forwarded to
            the API when set.
        model_choice: Model name passed to the chat completion call.

    Returns:
        The model's reply text, or an ``"Error..."`` string on failure.
    """
    if not openai_api_key:
        return NO_API_KEY_ERROR

    openai.api_key = openai_api_key

    if pdf_content and input_text:
        prompt = (
            "Based on the document below, answer the question:\n\n"
            f"{input_text}\n\nDocument:\n{pdf_content}"
        )
    elif pdf_content:
        # A PDF with no question used to send an empty message to the model.
        prompt = f"Summarize the document below:\n\nDocument:\n{pdf_content}"
    else:
        prompt = input_text

    if not prompt:
        return "Error: No input provided."

    request = {
        "model": model_choice,
        "messages": [{"role": "user", "content": prompt}],
        "max_completion_tokens": MAX_RESPONSE_TOKENS,
    }
    if reasoning_effort:
        # Previously the UI setting was accepted but never sent to the API.
        request["reasoning_effort"] = reasoning_effort

    try:
        response = openai.ChatCompletion.create(**request)
        return response.choices[0].message.content
    except Exception as e:
        return f"Error calling OpenAI API: {e}"


# Generate response for image (using IMAGE_MODEL)
def generate_image_response(image, input_text, openai_api_key):
    """Send *image* (a PIL image) plus *input_text* to ``IMAGE_MODEL``.

    Returns:
        The model's reply text, or an ``"Error..."`` string on failure.
    """
    if not openai_api_key:
        return NO_API_KEY_ERROR

    openai.api_key = openai_api_key
    try:
        image_b64 = get_base64_string_from_image(image)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": input_text},
                ],
            },
        ]
        response = openai.ChatCompletion.create(
            model=IMAGE_MODEL,
            messages=messages,
            max_tokens=MAX_RESPONSE_TOKENS,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error calling OpenAI API for image: {e}"


# Chatbot logic
def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, history):
    """Handle one submit: route the inputs to the right helper and log the turn.

    Args:
        input_text: Typed question/text.
        image: PIL image or ``None``.
        audio: Path to an audio file or ``None``; when given, its transcription
            replaces *input_text*.
        pdf_file: Newly uploaded PDF or ``None``.
        openai_api_key: API key forwarded to the helpers.
        reasoning_effort: Forwarded to ``generate_text_response``.
        model_choice: Model for text/PDF/quiz requests.
        pdf_content: PDF text remembered from an earlier upload.
        num_quiz_questions: Number of quiz questions to generate.
        pdf_quiz_mode: When true, generate a quiz instead of answering.
        history: List of ``{"role", "content"}`` messages, or ``None``.

    Returns:
        A 6-tuple ``("", None, None, None, pdf_text, history)``: the first four
        values reset the text/image/audio/PDF inputs, followed by the PDF text
        to remember and the updated history.
    """
    if history is None:
        history = []

    new_pdf_content = pdf_content
    if pdf_file:
        extracted = extract_text_from_pdf(pdf_file)
        if extracted.startswith(PDF_ERROR_PREFIX):
            # Report the failure instead of treating the message as document text.
            _append_exchange(history, "📄 [PDF Uploaded]", extracted)
            return "", None, None, None, pdf_content, history
        new_pdf_content = extracted

    if audio:
        transcript = transcribe_audio(audio, openai_api_key)
        if transcript == NO_API_KEY_ERROR or transcript.startswith(TRANSCRIPTION_ERROR_PREFIX):
            # Report the failure instead of sending it to the model as a question.
            _append_exchange(history, "🎤 [Audio Uploaded]", transcript)
            return "", None, None, None, new_pdf_content, history
        input_text = transcript

    if pdf_quiz_mode:
        if new_pdf_content:
            quiz = generate_mcq_quiz(
                new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice
            )
            _append_exchange(
                history, f"📘 Generated {num_quiz_questions} quiz questions", quiz
            )
        else:
            _append_exchange(history, "No PDF detected", "Please upload a PDF file first.")
    elif image is not None:
        # Handle image input separately with IMAGE_MODEL
        if not input_text:
            input_text = "Please describe this image."
        response = generate_image_response(image, input_text, openai_api_key)
        _append_exchange(history, f"🖼️ [Image Uploaded] {input_text}", response)
    elif input_text or new_pdf_content:
        # Handle text/PDF with the model chosen in the UI
        response = generate_text_response(
            input_text, new_pdf_content, openai_api_key, reasoning_effort, model_choice
        )
        if input_text:
            user_message = input_text
        elif pdf_file:
            user_message = "📄 [PDF Uploaded]"
        else:
            # PDF text came from state; this case used to drop the response.
            user_message = "📄 [Using previously uploaded PDF]"
        _append_exchange(history, user_message, response)
    else:
        _append_exchange(history, "No input", "Please provide input.")

    return "", None, None, None, new_pdf_content, history


# Reset all fields
def clear_history():
    """Return reset values for text, image, audio, PDF input, PDF text and history."""
    return "", None, None, None, "", []


# Extract text when PDF uploaded
def process_pdf(pdf_file):
    """Return the text of *pdf_file* for storage in state, or ``""``.

    An empty string is returned when no file is given or extraction fails, so
    an error message is never stored as if it were document content.
    """
    if pdf_file is None:
        return ""
    extracted = extract_text_from_pdf(pdf_file)
    return "" if extracted.startswith(PDF_ERROR_PREFIX) else extracted


# Per input mode: visibility of (text box, image, audio, PDF, quiz slider) and
# the value of the hidden quiz-mode checkbox.
_INPUT_LAYOUTS = {
    "Text": (True, False, False, False, False, False),
    "Image": (True, True, False, False, False, False),
    "Voice": (False, False, True, False, False, False),
    "PDF": (True, False, False, True, False, False),
    "PDF(QUIZ)": (False, False, False, True, True, True),
}


# Switch between input modes
def update_input_type(choice):
    """Return six ``gr.update`` objects configuring the inputs for *choice*.

    The first five toggle visibility of the text box, image, audio, PDF and
    quiz-slider components; the sixth sets the quiz-mode checkbox value.
    An unrecognised *choice* falls back to the "Text" layout.
    """
    *visibility, quiz_enabled = _INPUT_LAYOUTS.get(choice, _INPUT_LAYOUTS["Text"])
    updates = [gr.update(visible=shown) for shown in visibility]
    updates.append(gr.update(value=quiz_enabled))
    return tuple(updates)


# Build Gradio interface
def create_interface():
    """Build and return the ``gr.Blocks`` app with all components and events wired."""
    with gr.Blocks() as demo:
        gr.Markdown("## 🧠 Multimodal Chatbot — Text | Image | Voice | PDF | Quiz")
        gr.Markdown(
            f"*Image chat uses {IMAGE_MODEL.upper()} | Text/PDF/Quiz use O1/O3-mini models*"
        )

        pdf_content = gr.State("")
        openai_api_key = gr.Textbox(
            label="🔑 OpenAI API Key", type="password", placeholder="sk-..."
        )

        input_type = gr.Radio(
            ["Text", "Image", "Voice", "PDF", "PDF(QUIZ)"],
            label="Choose Input Type",
            value="Text",
        )

        input_text = gr.Textbox(label="Enter your question or text", lines=2, visible=True)
        image_input = gr.Image(label="Upload Image", type="pil", visible=False)
        audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], visible=False)
        quiz_questions_slider = gr.Slider(
            1, 20, value=5, step=1, label="Number of Quiz Questions", visible=False
        )
        quiz_mode = gr.Checkbox(label="Quiz Mode", visible=False, value=False)

        with gr.Row():
            reasoning_effort = gr.Dropdown(
                ["low", "medium", "high"],
                value="medium",
                label="Reasoning Effort (for Text/PDF)",
            )
            model_choice = gr.Dropdown(
                ["o1", "o3-mini"], value="o1", label="Model (for Text/PDF/Quiz)"
            )

        submit_btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear Chat")
        # NOTE(review): history is built as {"role", "content"} dicts; some
        # Gradio releases need `type="messages"` on gr.Chatbot for that format
        # -- confirm against the installed version.
        chat_history = gr.Chatbot(label="Chat History")

        # Input type handling
        input_type.change(
            fn=update_input_type,
            inputs=[input_type],
            outputs=[
                input_text,
                image_input,
                audio_input,
                pdf_input,
                quiz_questions_slider,
                quiz_mode,
            ],
        )

        # PDF upload processing. `upload` fires only on a user upload; `change`
        # also fired when submit reset the file input to None, which wiped the
        # remembered PDF text after every submit.
        pdf_input.upload(fn=process_pdf, inputs=[pdf_input], outputs=[pdf_content])

        # Submit
        submit_btn.click(
            fn=chatbot,
            inputs=[
                input_text,
                image_input,
                audio_input,
                pdf_input,
                openai_api_key,
                reasoning_effort,
                model_choice,
                pdf_content,
                quiz_questions_slider,
                quiz_mode,
                chat_history,
            ],
            outputs=[
                input_text,
                image_input,
                audio_input,
                pdf_input,
                pdf_content,
                chat_history,
            ],
        )

        # Clear
        clear_btn.click(
            fn=clear_history,
            inputs=[],
            outputs=[
                input_text,
                image_input,
                audio_input,
                pdf_input,
                pdf_content,
                chat_history,
            ],
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()