import gradio as gr
import openai
import base64
from PIL import Image
import io
import fitz  # PyMuPDF (installed via the "pymupdf" package)
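# NOTE (assumption about the environment): this file uses the pre-1.0 openai
# SDK surface (openai.ChatCompletion, openai.Audio), which was removed in
# openai>=1.0. Pinning the dependency, e.g. `pip install "openai<1.0"`,
# keeps these calls working as written.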
# ---------- PDF Text Extraction ----------
def extract_text_from_pdf(pdf_file):
    """Extract plain text from every page of the uploaded PDF."""
    try:
        # gr.File may deliver a filepath string or a file-like object with
        # .name, depending on the Gradio version; handle both.
        path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
        text = ""
        pdf_document = fitz.open(path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
        return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
# ---------- PDF Quiz Generation ----------
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    """Ask the chat model to write multiple-choice questions from the PDF text."""
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    # Truncate long documents so the prompt stays well inside the context window
    limited_content = pdf_content[:8000]
    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Create a clear question based on key concepts in the document
2. Provide 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain the correct answer
Document content:
{limited_content}
"""
    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {str(e)}"
# ---------- Image Processing ----------
def generate_image_response(input_text, image, openai_api_key, model_choice):
    """Send the user's question plus the uploaded image to a vision-capable chat model."""
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    # Encode the PIL image as a base64 PNG so it can be inlined as a data URL
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    base64_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": input_text},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
                    ]
                }
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error processing image: {str(e)}"
# ---------- Voice Processing ----------
def process_voice_input(audio_path, openai_api_key, model_choice):
    """Transcribe the audio with Whisper, then answer the transcript with the chat model."""
    if not openai_api_key:
        # Callers unpack (response, transcript), so always return a 2-tuple
        return "Error: No API key provided.", ""
    try:
        openai.api_key = openai_api_key
        # Transcription is pinned to whisper-1; model_choice only drives the reply
        with open(audio_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        prompt = transcript["text"]
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content, prompt
    except Exception as e:
        return f"Error processing voice: {str(e)}", ""
# ---------- Unified Chatbot Handler ----------
def chatbot(input_text, image, pdf_file, audio_file, openai_api_key, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, audio_mode, history):
    """Route each submission to the quiz, audio, image, or plain-text path."""
    if history is None:
        history = []
    new_pdf_content = pdf_content
    # Handle PDF file upload and extract text
    if pdf_file is not None:
        new_pdf_content = extract_text_from_pdf(pdf_file)
    # Handle PDF Quiz Mode
    if pdf_quiz_mode:
        if new_pdf_content:
            quiz_response = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
            history.append((f"👤: [PDF Quiz - {num_quiz_questions} questions]", f"🤖: {quiz_response}"))
        else:
            history.append(("👤: [PDF Quiz]", "🤖: Please upload a PDF file first."))
    # Handle Audio Mode
    elif audio_mode:
        if audio_file is not None:
            response, transcribed_text = process_voice_input(audio_file, openai_api_key, model_choice)
            history.append((f"👤 (Voice): {transcribed_text}", f"🤖: {response}"))
        else:
            history.append(("👤: [Audio]", "🤖: Please upload or record an audio file."))
    # Handle Image Mode (default)
    else:
        if image is not None:
            response = generate_image_response(input_text, image, openai_api_key, model_choice)
            history.append((f"👤: {input_text or '[Image]'}", f"🤖: {response}"))
        elif input_text:
            # Text-only fallback when no image is provided
            try:
                openai.api_key = openai_api_key
                response = openai.ChatCompletion.create(
                    model=model_choice,
                    messages=[{"role": "user", "content": input_text}]
                )
                history.append((f"👤: {input_text}", f"🤖: {response.choices[0].message.content}"))
            except Exception as e:
                history.append((f"👤: {input_text}", f"🤖: Error: {str(e)}"))
    return "", None, None, None, new_pdf_content, history
# ---------- Clear Chat ----------
def clear_history():
    return "", None, None, None, "", []
# ---------- Input Type Toggle ----------
def update_input_type(choice):
    if choice == "Image":
        hint_text = """
💡 **Image Mode Tips:**
- Both **o1** and **o3-mini** support image analysis
- o1 provides more detailed analysis but costs more
- o3-mini is faster and more cost-effective for simple image questions
"""
        return (
            gr.update(visible=True),   # input_text
            gr.update(visible=True),   # image_input
            gr.update(visible=False),  # pdf_input
            gr.update(visible=False),  # audio_input
            gr.update(visible=False),  # quiz_slider
            gr.update(value=False),    # pdf_quiz_mode
            gr.update(value=False),    # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "PDF(QUIZ)":
        hint_text = """
📚 **PDF Quiz Mode Tips:**
- Both models can generate quizzes from PDF content
- o1 creates more comprehensive and detailed questions
- o3-mini generates quizzes faster with good quality
- Large PDFs are automatically limited to the first 8000 characters
"""
        return (
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # image_input
            gr.update(visible=True),   # pdf_input
            gr.update(visible=False),  # audio_input
            gr.update(visible=True),   # quiz_slider
            gr.update(value=True),     # pdf_quiz_mode
            gr.update(value=False),    # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "Audio":
        hint_text = """
🎤 **Audio Mode Tips:**
- **Important:** transcription always uses OpenAI's `whisper-1` model (billed separately)
- The selected chat model then answers the transcribed text
- **gpt-4o-transcribe**: more sophisticated responses but higher cost per token
- **gpt-4o-mini-transcribe**: cost-effective for most audio conversations
- Supports common audio formats (MP3, WAV, M4A, etc.)
- Maximum audio file size: 25MB
"""
        return (
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # image_input
            gr.update(visible=False),  # pdf_input
            gr.update(visible=True),   # audio_input
            gr.update(visible=False),  # quiz_slider
            gr.update(value=False),    # pdf_quiz_mode
            gr.update(value=True),     # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
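# Each branch above returns an 8-tuple whose order must match the `outputs`
# list wired to input_type.change() in create_interface() below.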
# ---------- CSS Styling ----------
custom_css = """
.gradio-container {
font-family: 'Arial', sans-serif;
background-color: #f0f4f8;
}
.gradio-header {
background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
#submit-btn {
background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
color: white;
border-radius: 8px;
}
#clear-history {
background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%);
color: white;
border-radius: 8px;
}
"""
# ---------- UI Interface ----------
def create_interface():
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
<div class="gradio-header">
    <h1>Multimodal Chatbot (Image + PDF Quiz + Voice)</h1>
    <h3>Ask via image, PDF, or voice</h3>
</div>
""")
        with gr.Accordion("Instructions", open=False):
            gr.Markdown("""
- **Image Chat**: Upload an image and ask about it
- **PDF Quiz**: Upload a PDF and generate MCQs
- **Audio Chat**: Upload or record audio to chat
- Always provide your OpenAI API key
""")
        # State variables
        pdf_content = gr.State("")
        with gr.Row():
            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...")
        with gr.Row():
            input_type = gr.Radio(["Image", "PDF(QUIZ)", "Audio"], label="Input Type", value="Image")
        # Model-specific hints that appear based on input type
        model_hint = gr.Markdown("", visible=False)
        # Input components - all in one organized row
        with gr.Row():
            input_text = gr.Textbox(label="Question (for images)", visible=True)
            image_input = gr.Image(label="Upload Image", type="pil", visible=True)
            pdf_input = gr.File(label="Upload PDF", visible=False)
            audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
            quiz_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Questions", visible=False)
        # Hidden state components for mode control
        pdf_quiz_mode = gr.Checkbox(visible=False, value=False)
        audio_mode = gr.Checkbox(visible=False, value=False)
        with gr.Row():
            # model_choice is only ever sent to ChatCompletion; transcription
            # itself is pinned to whisper-1 inside process_voice_input
            model_choice = gr.Dropdown(
                ["o1", "o3-mini", "whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"],
                label="Model", value="o1"
            )
            submit_btn = gr.Button("Submit", elem_id="submit-btn")
            clear_btn = gr.Button("Clear History", elem_id="clear-history")
        chat_history = gr.Chatbot()
        # Event handlers
        input_type.change(
            update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode, model_hint]
        )
        submit_btn.click(
            chatbot,
            inputs=[input_text, image_input, pdf_input, audio_input, openai_api_key, model_choice,
                    pdf_content, quiz_slider, pdf_quiz_mode, audio_mode, chat_history],
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )
        clear_btn.click(
            clear_history,
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )
    return demo
# ---------- Launch ----------
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
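# To run locally (assumed environment; see the SDK note near the imports):
#   pip install gradio "openai<1.0" pymupdf
#   python app.py
# Gradio then serves the UI on a local URL such as http://127.0.0.1:7860.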