# Multimodal chatbot Gradio app: image chat, PDF quiz generation, and voice chat.
import gradio as gr
import openai
import base64
from PIL import Image
import io
import fitz
# ---------- PDF Text Extraction ----------
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Path (or file-like object accepted by ``fitz.open``)
            of the uploaded PDF.

    Returns:
        The concatenated text of all pages, or an error-message string
        if the document could not be opened or read.
    """
    try:
        # Context manager guarantees the document is closed even when a
        # page fails mid-iteration (the original leaked the handle then).
        with fitz.open(pdf_file) as pdf_document:
            # join() avoids quadratic string concatenation on large PDFs.
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
# ---------- PDF Quiz Generation ----------
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    """Generate multiple-choice quiz questions from extracted PDF text.

    Args:
        pdf_content: Text previously extracted from the PDF.
        num_questions: Number of questions to request from the model.
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model name to use.

    Returns:
        The model's quiz text, or an error-message string on failure.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    # Slicing already copes with short strings; the original length check
    # was redundant. Cap at 8000 chars to keep the prompt within limits.
    limited_content = pdf_content[:8000]
    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Create a clear question based on key concepts in the document
2. Provide 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain the correct answer
Document content:
{limited_content}
"""
    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {str(e)}"
# ---------- Image Processing ----------
def generate_image_response(input_text, image, openai_api_key, model_choice):
    """Ask the chat model a question about an uploaded PIL image.

    Args:
        input_text: The user's question about the image.
        image: PIL image object (has a ``.save`` method).
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model name to use.

    Returns:
        The model's answer, or an error-message string on failure.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    try:
        # Encoding moved inside the try: a bad image object now yields an
        # error string, consistent with the API-failure path, instead of
        # an uncaught exception bubbling into the UI.
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        base64_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": input_text},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
                    ]
                }
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error processing image: {str(e)}"
# ---------- Voice Processing ----------
def process_voice_input(audio_path, openai_api_key, model_choice):
    """Transcribe an audio file with whisper-1, then chat about the transcript.

    Args:
        audio_path: Filesystem path to the uploaded/recorded audio.
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model name for the follow-up response.

    Returns:
        ``(response_text, transcript)`` on success, or
        ``(error_message, "")`` on failure — always a 2-tuple so callers
        can unpack safely.
    """
    if not openai_api_key:
        # BUG FIX: the original returned a bare string here, which broke
        # the caller's 2-tuple unpacking; match the other error path's shape.
        return "Error: No API key provided.", ""
    try:
        openai.api_key = openai_api_key
        # Context manager closes the file even if transcription raises
        # (the original leaked the handle on error).
        with open(audio_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        prompt = transcript["text"]
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content, prompt
    except Exception as e:
        return f"Error processing voice: {str(e)}", ""
# ---------- Unified Chatbot Handler ----------
def chatbot(input_text, image, pdf_file, audio_file, openai_api_key, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, audio_mode, history):
    """Route one submission to the right handler and append to chat history.

    Exactly one mode runs per call: PDF quiz, audio, or image/text chat.

    Args:
        input_text: Free-text question (used with images or text-only chat).
        image: PIL image or None.
        pdf_file: Uploaded PDF path/file or None.
        audio_file: Uploaded/recorded audio filepath or None.
        openai_api_key: User-supplied OpenAI API key.
        model_choice: Chat model name passed through to the handlers.
        pdf_content: Previously extracted PDF text (from gr.State).
        num_quiz_questions: Slider value; coerced to int for the quiz call.
        pdf_quiz_mode: Hidden checkbox set by update_input_type.
        audio_mode: Hidden checkbox set by update_input_type.
        history: Chat history as (user, bot) tuples, or None on first call.

    Returns:
        6-tuple matching the Gradio outputs: cleared text/image/pdf/audio
        inputs, the (possibly refreshed) PDF text, and the updated history.
    """
    if history is None:
        history = []
    new_pdf_content = pdf_content
    # A newly uploaded PDF refreshes the stored text regardless of mode.
    if pdf_file is not None:
        new_pdf_content = extract_text_from_pdf(pdf_file)
    # Handle PDF Quiz Mode
    if pdf_quiz_mode:
        if new_pdf_content:
            quiz_response = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
            history.append((f"π€: [PDF Quiz - {num_quiz_questions} questions]", f"π€: {quiz_response}"))
        else:
            history.append(("π€: [PDF Quiz]", "π€: Please upload a PDF file first."))
    # Handle Audio Mode
    elif audio_mode:
        if audio_file is not None:
            response, transcribed_text = process_voice_input(audio_file, openai_api_key, model_choice)
            history.append((f"π€ (Voice): {transcribed_text}", f"π€: {response}"))
        else:
            history.append(("π€: [Audio]", "π€: Please upload or record an audio file."))
    # Handle Image Mode (or plain text chat)
    else:
        if image is not None:
            response = generate_image_response(input_text, image, openai_api_key, model_choice)
            history.append((f"π€: {input_text or '[Image]'}", f"π€: {response}"))
        elif input_text:
            # CONSISTENCY FIX: the other handlers short-circuit on a missing
            # key; the text-only path previously sent an unauthenticated
            # request and surfaced the raw API error instead.
            if not openai_api_key:
                history.append((f"π€: {input_text}", "π€: Error: No API key provided."))
            else:
                try:
                    openai.api_key = openai_api_key
                    response = openai.ChatCompletion.create(
                        model=model_choice,
                        messages=[{"role": "user", "content": input_text}]
                    )
                    history.append((f"π€: {input_text}", f"π€: {response.choices[0].message.content}"))
                except Exception as e:
                    history.append((f"π€: {input_text}", f"π€: Error: {str(e)}"))
    return "", None, None, None, new_pdf_content, history
# ---------- Clear Chat ----------
def clear_history():
    """Reset every input widget, the stored PDF text, and the chat log."""
    cleared_text = ""
    cleared_pdf_state = ""
    cleared_log = []
    # Order mirrors the Gradio outputs list: text, image, pdf, audio,
    # pdf_content state, chat history.
    return cleared_text, None, None, None, cleared_pdf_state, cleared_log
# ---------- Input Type Toggle ----------
def update_input_type(choice):
    """Show/hide input widgets and set the mode flags for a radio choice.

    Args:
        choice: One of "Image", "PDF(QUIZ)", "Audio" (the radio options).

    Returns:
        8-tuple of gr.update objects for: input_text, image_input,
        pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode,
        model_hint.

    Raises:
        ValueError: If ``choice`` is not a known input type. (The original
        fell through to an implicit None, which Gradio cannot splat into
        the 8 declared outputs.)
    """
    if choice == "Image":
        hint_text = """
π‘ **Image Mode Tips:**
- Both **o1** and **o3-mini** support image analysis
- o1 provides more detailed analysis but costs more
- o3-mini is faster and more cost-effective for simple image questions
"""
        return (
            gr.update(visible=True),               # input_text
            gr.update(visible=True),               # image_input
            gr.update(visible=False),              # pdf_input
            gr.update(visible=False),              # audio_input
            gr.update(visible=False),              # quiz_slider
            gr.update(value=False),                # pdf_quiz_mode
            gr.update(value=False),                # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "PDF(QUIZ)":
        hint_text = """
π **PDF Quiz Mode Tips:**
- Both models can generate quizzes from PDF content
- o1 creates more comprehensive and detailed questions
- o3-mini generates quizzes faster with good quality
- Large PDFs are automatically limited to first 8000 characters
"""
        return (
            gr.update(visible=False),              # input_text
            gr.update(visible=False),              # image_input
            gr.update(visible=True),               # pdf_input
            gr.update(visible=False),              # audio_input
            gr.update(visible=True),               # quiz_slider
            gr.update(value=True),                 # pdf_quiz_mode
            gr.update(value=False),                # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "Audio":
        hint_text = """
π€ **Audio Mode Tips:**
- **Important:** Audio transcription uses OpenAI's `whisper-1` model (separate cost)
- **gpt-4 transcribe**: More sophisticated responses but higher cost per token
- **gpt-4-mini-transcribe**: Cost-effective for most audio conversations
- Supports common audio formats (MP3, WAV, M4A, etc.)
- Maximum audio file size: 25MB
"""
        return (
            gr.update(visible=False),              # input_text
            gr.update(visible=False),              # image_input
            gr.update(visible=False),              # pdf_input
            gr.update(visible=True),               # audio_input
            gr.update(visible=False),              # quiz_slider
            gr.update(value=False),                # pdf_quiz_mode
            gr.update(value=True),                 # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    raise ValueError(f"Unknown input type: {choice!r}")
# ---------- CSS Styling ----------
# Injected into gr.Blocks(css=custom_css): purple gradient for the page
# header and submit button, red gradient for the clear button. The
# #submit-btn / #clear-history selectors match the elem_id values given
# to the gr.Button widgets in create_interface.
custom_css = """
.gradio-container {
font-family: 'Arial', sans-serif;
background-color: #f0f4f8;
}
.gradio-header {
background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
#submit-btn {
background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
color: white;
border-radius: 8px;
}
#clear-history {
background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%);
color: white;
border-radius: 8px;
}
"""
# ---------- UI Interface ----------
def create_interface():
    """Build the Gradio Blocks UI, wire all event handlers, return the app."""
    with gr.Blocks(css=custom_css) as demo:
        # Page header styled by .gradio-header in custom_css.
        gr.Markdown("""
<div class="gradio-header">
<h1>Multimodal Chatbot (Image + PDF Quiz + Voice)</h1>
<h3>Ask via image, PDF, or voice</h3>
</div>
""")
        with gr.Accordion("Instructions", open=False):
            gr.Markdown("""
- **Image Chat**: Upload an image and ask about it
- **PDF Quiz**: Upload a PDF and generate MCQs
- **Audio Chat**: Upload or record audio to chat
- Always provide your OpenAI API key
""")
        # State variables
        # Holds the extracted PDF text between submissions.
        pdf_content = gr.State("")
        with gr.Row():
            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...")
        with gr.Row():
            input_type = gr.Radio(["Image", "PDF(QUIZ)", "Audio"], label="Input Type", value="Image")
        # Model-specific hints that appear based on input type
        model_hint = gr.Markdown("", visible=False)
        # Input components row - all in one organized row
        with gr.Row():
            input_text = gr.Textbox(label="Question (for images)", visible=True)
            image_input = gr.Image(label="Upload Image", type="pil", visible=True)
            pdf_input = gr.File(label="Upload PDF", visible=False)
            audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
            quiz_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Questions", visible=False)
        # Hidden state components for mode control; toggled by update_input_type.
        pdf_quiz_mode = gr.Checkbox(visible=False, value=False)
        audio_mode = gr.Checkbox(visible=False, value=False)
        with gr.Row():
            # NOTE(review): the *-transcribe and whisper-1 entries are not
            # chat models, yet chatbot() passes model_choice straight to
            # ChatCompletion.create — selecting them likely errors; confirm
            # the intended model list.
            model_choice = gr.Dropdown(["o1", "o3-mini","whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe",], label="Model", value="o1")
            submit_btn = gr.Button("Submit", elem_id="submit-btn")
            clear_btn = gr.Button("Clear History", elem_id="clear-history")
        chat_history = gr.Chatbot()
        # Event handlers
        # Radio change swaps widget visibility and sets the hidden mode flags.
        input_type.change(
            update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode, model_hint]
        )
        # Submit routes everything through the unified chatbot() handler;
        # output order must match chatbot()'s 6-tuple return.
        submit_btn.click(
            chatbot,
            inputs=[input_text, image_input, pdf_input, audio_input, openai_api_key, model_choice,
                    pdf_content, quiz_slider, pdf_quiz_mode, audio_mode, chat_history],
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )
        clear_btn.click(
            clear_history,
            outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history]
        )
    return demo
# ---------- Launch ----------
if __name__ == "__main__":
    # Build the UI and start the Gradio server.
    app = create_interface()
    app.launch()