Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import openai | |
| import base64 | |
| from PIL import Image | |
| import io | |
| import fitz | |
# ---------- PDF Text Extraction ----------
def extract_text_from_pdf(pdf_file):
    """Extract the concatenated text of every page in a PDF.

    Args:
        pdf_file: Path to a PDF file, or an object exposing a ``.name``
            attribute that points at one (gradio's File component may
            deliver either, depending on version — TODO confirm).

    Returns:
        The full document text as one string, or an error-message string
        on failure (callers display it directly in the chat, so this
        function never raises).
    """
    # Unwrap a gradio tempfile object to its on-disk path if needed.
    path = getattr(pdf_file, "name", pdf_file)
    try:
        # Context manager closes the document even if get_text() raises
        # partway through (the original leaked the handle on error).
        with fitz.open(path) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
# ---------- PDF Quiz Generation ----------
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    """Generate multiple-choice quiz questions from extracted PDF text.

    Args:
        pdf_content: Plain text previously extracted from a PDF.
        num_questions: How many questions to request from the model.
        openai_api_key: User-supplied OpenAI key; empty means "refuse".
        model_choice: Chat model name to use for generation.

    Returns:
        The model's quiz text, or an error-message string on failure
        (this function never raises; errors are shown in the chat).
    """
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    # Slicing alone is safe on short inputs — the original length check
    # was redundant. Cap keeps the prompt within model context limits.
    limited_content = pdf_content[:8000]
    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Create a clear question based on key concepts in the document
2. Provide 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain the correct answer
Document content:
{limited_content}
"""
    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {str(e)}"
# ---------- Image Processing ----------
def generate_image_response(input_text, image, openai_api_key, model_choice):
    """Answer a question about a PIL image using a vision-capable model.

    Serializes the image to a base64 PNG data URL and sends it together
    with the user's question. Returns the model's reply, or an
    error-message string on failure.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    # Encode the PIL image as a base64 data URL for the API payload.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    data_url = f"data:image/png;base64,{encoded}"
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": input_text},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }
    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[user_message],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error processing image: {str(e)}"
# ---------- Voice Processing ----------
def process_voice_input(audio_path, openai_api_key, model_choice):
    """Transcribe an audio file with Whisper, then chat about the text.

    Args:
        audio_path: Filesystem path to the recorded/uploaded audio.
        openai_api_key: User-supplied OpenAI key; empty means "refuse".
        model_choice: Chat model used to answer the transcript.

    Returns:
        A 2-tuple ``(assistant_reply, transcribed_text)``. On failure the
        reply is an error message and the transcript is "".
    """
    if not openai_api_key:
        # Bug fix: callers unpack two values, so the no-key path must
        # return a 2-tuple like every other path, not a bare string.
        return "Error: No API key provided.", ""
    try:
        openai.api_key = openai_api_key
        # `with` guarantees the file handle is closed even when
        # transcription raises (the original leaked it on error).
        with open(audio_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        prompt = transcript["text"]
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content, prompt
    except Exception as e:
        return f"Error processing voice: {str(e)}", ""
# ---------- Unified Chatbot Handler ----------
def chatbot(input_text, image, pdf_file, audio_file, openai_api_key, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, audio_mode, history):
    """Dispatch one submission to the quiz, audio, image, or text path.

    Appends one (user, bot) tuple to *history* per submission and returns
    cleared input values, the (possibly refreshed) PDF text, and the
    updated history, in the order the gradio outputs expect.
    """
    if history is None:
        history = []

    # Refresh the cached PDF text whenever a new file arrives.
    new_pdf_content = extract_text_from_pdf(pdf_file) if pdf_file is not None else pdf_content

    if pdf_quiz_mode:
        if new_pdf_content:
            quiz_response = generate_mcq_quiz(
                new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice
            )
            history.append(
                (f"π€: [PDF Quiz - {num_quiz_questions} questions]", f"π€: {quiz_response}")
            )
        else:
            history.append(("π€: [PDF Quiz]", "π€: Please upload a PDF file first."))
    elif audio_mode:
        if audio_file is None:
            history.append(("π€: [Audio]", "π€: Please upload or record an audio file."))
        else:
            response, transcribed_text = process_voice_input(audio_file, openai_api_key, model_choice)
            history.append((f"π€ (Voice): {transcribed_text}", f"π€: {response}"))
    elif image is not None:
        response = generate_image_response(input_text, image, openai_api_key, model_choice)
        history.append((f"π€: {input_text or '[Image]'}", f"π€: {response}"))
    elif input_text:
        # Text-only fallback when no image was supplied.
        try:
            openai.api_key = openai_api_key
            completion = openai.ChatCompletion.create(
                model=model_choice,
                messages=[{"role": "user", "content": input_text}],
            )
            history.append((f"π€: {input_text}", f"π€: {completion.choices[0].message.content}"))
        except Exception as e:
            history.append((f"π€: {input_text}", f"π€: Error: {str(e)}"))

    return "", None, None, None, new_pdf_content, history
# ---------- Clear Chat ----------
def clear_history():
    """Reset every input widget, the cached PDF text, and the chat log."""
    cleared_inputs = ("", None, None, None)
    return (*cleared_inputs, "", [])
# ---------- Input Type Toggle ----------
def update_input_type(choice):
    """Toggle widget visibility and mode flags for the chosen input type.

    Args:
        choice: One of "Image", "PDF(QUIZ)", "Audio". Unrecognized values
            fall back to the Image layout — the original had no else
            branch and implicitly returned None, which breaks the gradio
            change handler that expects eight updates.

    Returns:
        Eight gr.update objects, in order: input_text, image_input,
        pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode,
        model_hint.
    """
    image_hint = """
π‘ **Image Mode Tips:**
- Both **o1** and **o3-mini** support image analysis
- o1 provides more detailed analysis but costs more
- o3-mini is faster and more cost-effective for simple image questions
"""
    pdf_hint = """
π **PDF Quiz Mode Tips:**
- Both models can generate quizzes from PDF content
- o1 creates more comprehensive and detailed questions
- o3-mini generates quizzes faster with good quality
- Large PDFs are automatically limited to first 8000 characters
"""
    audio_hint = """
π€ **Audio Mode Tips:**
- **Important:** Audio transcription uses OpenAI's `whisper-1` model (separate cost)
- **gpt-4 transcribe**: More sophisticated responses but higher cost per token
- **gpt-4-mini-transcribe**: Cost-effective for most audio conversations
- Supports common audio formats (MP3, WAV, M4A, etc.)
- Maximum audio file size: 25MB
"""
    # Per-mode layout table: (text, image, pdf, audio, slider visibility,
    # pdf_quiz_mode value, audio_mode value, hint markdown).
    layouts = {
        "Image": (True, True, False, False, False, False, False, image_hint),
        "PDF(QUIZ)": (False, False, True, False, True, True, False, pdf_hint),
        "Audio": (False, False, False, True, False, False, True, audio_hint),
    }
    show_text, show_image, show_pdf, show_audio, show_slider, quiz_on, audio_on, hint_text = (
        layouts.get(choice, layouts["Image"])
    )
    return (
        gr.update(visible=show_text),              # input_text
        gr.update(visible=show_image),             # image_input
        gr.update(visible=show_pdf),               # pdf_input
        gr.update(visible=show_audio),             # audio_input
        gr.update(visible=show_slider),            # quiz_slider
        gr.update(value=quiz_on),                  # pdf_quiz_mode
        gr.update(value=audio_on),                 # audio_mode
        gr.update(value=hint_text, visible=True),  # model_hint
    )
# ---------- CSS Styling ----------
# Shared purple gradient brands the header and the submit button; a red
# gradient marks the destructive "clear history" action. Injected into
# the app via gr.Blocks(css=custom_css) below.
custom_css = """
.gradio-container {
font-family: 'Arial', sans-serif;
background-color: #f0f4f8;
}
.gradio-header {
background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
#submit-btn {
background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
color: white;
border-radius: 8px;
}
#clear-history {
background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%);
color: white;
border-radius: 8px;
}
"""
# ---------- UI Interface ----------
def create_interface():
    """Assemble the gradio Blocks app: widgets, state, and event wiring.

    Returns:
        The built gr.Blocks demo, ready for ``.launch()``.
    """
    with gr.Blocks(css=custom_css) as demo:
        # Header banner (styled by .gradio-header in custom_css).
        gr.Markdown("""
<div class="gradio-header">
<h1>Multimodal Chatbot (Image + PDF Quiz + Voice)</h1>
<h3>Ask via image, PDF, or voice</h3>
</div>
""")
        with gr.Accordion("Instructions", open=False):
            gr.Markdown("""
- **Image Chat**: Upload an image and ask about it
- **PDF Quiz**: Upload a PDF and generate MCQs
- **Audio Chat**: Upload or record audio to chat
- Always provide your OpenAI API key
""")
        # State variables
        # Caches the last extracted PDF text across submissions so the
        # quiz mode can reuse it without re-uploading.
        pdf_content = gr.State("")
        with gr.Row():
            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...")
        with gr.Row():
            input_type = gr.Radio(["Image", "PDF(QUIZ)", "Audio"], label="Input Type", value="Image")
        # Model-specific hints that appear based on input type
        model_hint = gr.Markdown("", visible=False)
        # Input components row - all in one organized row
        # Only the widgets matching the selected input type are visible;
        # update_input_type() toggles them on radio change.
        with gr.Row():
            input_text = gr.Textbox(label="Question (for images)", visible=True)
            image_input = gr.Image(label="Upload Image", type="pil", visible=True)
            pdf_input = gr.File(label="Upload PDF", visible=False)
            audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
            quiz_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Questions", visible=False)
        # Hidden state components for mode control
        # Invisible checkboxes carry the current mode into chatbot().
        pdf_quiz_mode = gr.Checkbox(visible=False, value=False)
        audio_mode = gr.Checkbox(visible=False, value=False)
        with gr.Row():
            model_choice = gr.Dropdown(["o1", "o3-mini","whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe",], label="Model", value="o1")
            submit_btn = gr.Button("Submit", elem_id="submit-btn")
            clear_btn = gr.Button("Clear History", elem_id="clear-history")
        chat_history = gr.Chatbot()
        # Event handlers
        # Radio change re-layouts the input widgets and mode flags.
        input_type.change(
            update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode, model_hint]
        )
        # Submit routes every mode through the unified chatbot() handler;
        # outputs clear the inputs and refresh state + chat log.
        submit_btn.click(
            chatbot,
            inputs=[input_text, image_input, pdf_input, audio_input, openai_api_key, model_choice,
                    pdf_content, quiz_slider, pdf_quiz_mode, audio_mode, chat_history],
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )
        clear_btn.click(
            clear_history,
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )
    return demo
# ---------- Launch ----------
if __name__ == "__main__":
    # Build and serve the app only when run as a script, not on import.
    create_interface().launch()