Spaces:

shukdev3
/

Multi-modal-o1-Chatbot

Sleeping

App Files Files Community

Multi-modal-o1-Chatbot / app.py

shukdev3

Update app.py

c0826ad verified 6 months ago

Raw

History Blame Contribute Delete

9.56 kB

	import gradio as gr
	import openai
	import base64
	from PIL import Image
	import io
	import fitz # PyMuPDF for PDF handling


	# Extract text from PDF
	def extract_text_from_pdf(pdf_file):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        pdf_file: Value handed straight to ``fitz.open`` (the callers in
	            this file pass the uploaded PDF from the Gradio file input).

	    Returns:
	        The text of all pages joined in page order. This function never
	        raises: on any failure it returns a message that starts with
	        ``"Error extracting text from PDF: "`` instead.
	    """
	    try:
	        pdf_document = fitz.open(pdf_file)
	        try:
	            # Iterating the document yields its pages in order; join once
	            # instead of growing a string page by page.
	            return "".join(page.get_text() for page in pdf_document)
	        finally:
	            # Release the handle even when a page fails to extract.
	            pdf_document.close()
	    except Exception as e:
	        return f"Error extracting text from PDF: {str(e)}"


	# Generate MCQ quiz from PDF
	def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice, max_chars=8000):
	    """Ask the chat model for a multiple-choice quiz about a document.

	    Args:
	        pdf_content: Document text the questions are based on.
	        num_questions: Number of questions requested in the prompt.
	        openai_api_key: API key assigned to ``openai.api_key``.
	        model_choice: Model name passed to the chat completion call.
	        max_chars: Only the first ``max_chars`` characters of
	            ``pdf_content`` are placed in the prompt (default 8000).

	    Returns:
	        The content of the first completion choice, or a string starting
	        with ``"Error"`` when the key or document is missing or the API
	        call raises.
	    """
	    if not openai_api_key:
	        return "Error: No API key provided."
	    # Without this guard a None document crashed on the slice below and an
	    # empty one still triggered an API call with nothing to quiz on.
	    if not pdf_content:
	        return "Error: No document content provided."
	    openai.api_key = openai_api_key

	    limited_content = pdf_content[:max_chars]
	    prompt = (
	        f"Based on the following document content, generate {num_questions} multiple-choice quiz questions.\n"
	        "For each question:\n"
	        "1. Write a clear question\n"
	        "2. Give 4 options (A, B, C, D)\n"
	        "3. Indicate the correct answer\n"
	        "4. Briefly explain why the answer is correct\n"
	        "Document:\n"
	        f"{limited_content}\n"
	    )
	    try:
	        response = openai.ChatCompletion.create(
	            model=model_choice,
	            messages=[{"role": "user", "content": prompt}],
	        )
	        return response.choices[0].message.content
	    except Exception as e:
	        return f"Error generating quiz: {str(e)}"


	# Convert image to base64
	def get_base64_string_from_image(pil_image):
	    """Serialize an image to PNG and return those bytes as base64 text.

	    Args:
	        pil_image: Object exposing ``save(fp, format=...)``; the callers
	            in this file pass the image from the Gradio image input.

	    Returns:
	        The base64 encoding of the PNG bytes, decoded as UTF-8.
	    """
	    with io.BytesIO() as png_buffer:
	        pil_image.save(png_buffer, format="PNG")
	        png_bytes = png_buffer.getvalue()
	    encoded = base64.b64encode(png_bytes)
	    return encoded.decode("utf-8")


	# Transcribe audio
	def transcribe_audio(audio, openai_api_key):
	    """Transcribe an audio file with the ``whisper-1`` model.

	    Args:
	        audio: Path of the audio file to read.
	        openai_api_key: API key assigned to ``openai.api_key``.

	    Returns:
	        The ``text`` attribute of the transcription result, or a string
	        starting with ``"Error"`` when no key is given or reading or
	        transcribing fails.
	    """
	    if not openai_api_key:
	        return "Error: No API key provided."
	    openai.api_key = openai_api_key
	    try:
	        with open(audio, "rb") as source:
	            upload = io.BytesIO(source.read())
	        # The in-memory copy is always labelled as a WAV file for upload.
	        upload.name = "audio.wav"
	        result = openai.Audio.transcribe(file=upload, model="whisper-1")
	        return result.text
	    except Exception as e:
	        return f"Error transcribing audio: {e!s}"


	# Generate response for text/pdf (using o1 or o3-mini)
	def generate_text_response(input_text, pdf_content, openai_api_key, reasoning_effort, model_choice):
	    """Answer a text question, optionally grounded in document text.

	    Args:
	        input_text: The user's question; may be empty when only a
	            document is supplied.
	        pdf_content: Extracted document text, or empty if there is none.
	        openai_api_key: API key assigned to ``openai.api_key``.
	        reasoning_effort: Reasoning-effort setting chosen in the UI;
	            forwarded to the API when non-empty.
	        model_choice: Model name passed to the chat completion call.

	    Returns:
	        The content of the first completion choice, or a string starting
	        with ``"Error"`` when the key or all input is missing or the API
	        call raises.
	    """
	    if not openai_api_key:
	        return "Error: No API key provided."
	    if not input_text and not pdf_content:
	        return "Error: No input provided."
	    openai.api_key = openai_api_key

	    if pdf_content and input_text:
	        prompt = f"Based on the document below, answer the question:\n\n{input_text}\n\nDocument:\n{pdf_content}"
	    elif pdf_content:
	        # Document without a question: previously an empty message was
	        # sent and the document never reached the model.
	        prompt = f"Summarize the document below:\n\nDocument:\n{pdf_content}"
	    else:
	        prompt = input_text

	    request = {
	        "model": model_choice,
	        "messages": [{"role": "user", "content": prompt}],
	        "max_completion_tokens": 2000,
	    }
	    # The UI setting used to be accepted and then silently dropped.
	    if reasoning_effort:
	        request["reasoning_effort"] = reasoning_effort

	    try:
	        response = openai.ChatCompletion.create(**request)
	        return response.choices[0].message.content
	    except Exception as e:
	        return f"Error calling OpenAI API: {str(e)}"


	# Generate response for image (using gpt-4.1)
	def generate_image_response(image, input_text, openai_api_key):
	    """Send an image plus a text prompt to the ``gpt-4.1`` chat model.

	    Args:
	        image: Image passed to ``get_base64_string_from_image``.
	        input_text: Text sent alongside the image.
	        openai_api_key: API key assigned to ``openai.api_key``.

	    Returns:
	        The content of the first completion choice, or a string starting
	        with ``"Error"`` when no key is given or encoding or the API call
	        raises.
	    """
	    if not openai_api_key:
	        return "Error: No API key provided."
	    openai.api_key = openai_api_key

	    try:
	        encoded_png = get_base64_string_from_image(image)

	        # The image is embedded inline as a data URL, followed by the text.
	        image_part = {
	            "type": "image_url",
	            "image_url": {"url": f"data:image/png;base64,{encoded_png}"},
	        }
	        text_part = {"type": "text", "text": input_text}
	        conversation = [{"role": "user", "content": [image_part, text_part]}]

	        completion = openai.ChatCompletion.create(
	            model="gpt-4.1",
	            messages=conversation,
	            max_tokens=2000,
	        )
	        return completion.choices[0].message.content
	    except Exception as e:
	        return f"Error calling OpenAI API for image: {e!s}"


	# Chatbot logic
	def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, history):
	    """Handle one submission and append the exchange to the chat history.

	    Args:
	        input_text: Typed question; replaced by the transcription when
	            ``audio`` is given.
	        image: Uploaded image, or None.
	        audio: Audio file path, or a falsy value when absent.
	        pdf_file: Newly uploaded PDF, or a falsy value when absent.
	        openai_api_key: Key forwarded to the OpenAI helper functions.
	        reasoning_effort: Setting forwarded to ``generate_text_response``.
	        model_choice: Model name used for text, PDF and quiz requests.
	        pdf_content: Document text saved from an earlier upload.
	        num_quiz_questions: Number of quiz questions; converted to int.
	        pdf_quiz_mode: When truthy, generate a quiz instead of an answer.
	        history: List of ``{"role", "content"}`` dicts, or None. A list
	            passed in is extended in place.

	    Returns:
	        A 6-tuple ``("", None, None, None, pdf_text, history)``: cleared
	        values for the text, image, audio and PDF inputs, the document
	        text to keep in state, and the updated history.
	    """
	    if history is None:
	        history = []

	    if audio:
	        input_text = transcribe_audio(audio, openai_api_key)

	    # A fresh upload replaces the saved text; otherwise keep what we had.
	    new_pdf_content = pdf_content
	    if pdf_file:
	        new_pdf_content = extract_text_from_pdf(pdf_file)

	    if pdf_quiz_mode:
	        if new_pdf_content:
	            user_message = f"📘 Generated {num_quiz_questions} quiz questions"
	            reply = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
	        else:
	            user_message = "No PDF detected"
	            reply = "Please upload a PDF file first."
	    elif image is not None:
	        # Images are answered by generate_image_response (gpt-4.1).
	        if not input_text:
	            input_text = "Please describe this image."
	        reply = generate_image_response(image, input_text, openai_api_key)
	        user_message = f"🖼️ [Image Uploaded] {input_text}"
	    elif input_text or new_pdf_content:
	        # Text and PDF questions use the selected model (o1 or o3-mini).
	        reply = generate_text_response(input_text, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
	        # Always record the exchange. Previously a reply based only on the
	        # saved document (no new upload, no question) was silently dropped.
	        user_message = input_text if input_text else "📄 [PDF Uploaded]"
	    else:
	        user_message = "No input"
	        reply = "Please provide input."

	    history.append({"role": "user", "content": user_message})
	    history.append({"role": "assistant", "content": reply})

	    return "", None, None, None, new_pdf_content, history


	# Reset all fields
	def clear_history():
	    """Return blank values for every output wired to the clear button.

	    Returns:
	        A 6-tuple: empty text, three ``None`` values, empty document
	        text, and a new empty history list.
	    """
	    empty_text, no_image, no_audio, no_pdf = "", None, None, None
	    cleared_pdf_text = ""
	    fresh_history = []
	    return empty_text, no_image, no_audio, no_pdf, cleared_pdf_text, fresh_history


	# Extract text when PDF uploaded
	def process_pdf(pdf_file):
	    """Return the text of an uploaded PDF, or "" when no file is set.

	    Args:
	        pdf_file: The uploaded PDF, or None when the input was cleared.

	    Returns:
	        Whatever ``extract_text_from_pdf`` returns for the file, or an
	        empty string for None.
	    """
	    return "" if pdf_file is None else extract_text_from_pdf(pdf_file)


	# Switch between input modes
	def update_input_type(choice):
	    """Build the component updates for the selected input mode.

	    Args:
	        choice: One of "Text", "Image", "Voice", "PDF" or "PDF(QUIZ)".

	    Returns:
	        A 6-tuple of ``gr.update`` objects: five visibility updates
	        followed by one value update, in the order the change handler
	        lists its outputs. Returns None for any other choice.
	    """
	    # Each row: five visibility flags, then the value for the last output.
	    layouts = (
	        ("Text", (True, False, False, False, False, False)),
	        ("Image", (True, True, False, False, False, False)),
	        ("Voice", (False, False, True, False, False, False)),
	        ("PDF", (True, False, False, True, False, False)),
	        ("PDF(QUIZ)", (False, False, False, True, True, True)),
	    )
	    layout = next((flags for name, flags in layouts if choice == name), None)
	    if layout is None:
	        return None

	    *visibility, quiz_enabled = layout
	    updates = [gr.update(visible=shown) for shown in visibility]
	    updates.append(gr.update(value=quiz_enabled))
	    return tuple(updates)


	# Build Gradio interface
	def create_interface():
	    """Build the Gradio Blocks UI and wire its event handlers.

	    Creates the API-key box, the input-type selector, one input widget
	    per mode (only the text box is visible at first), the model options,
	    the submit and clear buttons, and the chat history display.

	    Returns:
	        The ``gr.Blocks`` instance; the caller is expected to launch it.
	    """
	    with gr.Blocks() as demo:
	        # "\\|" keeps the backslash-pipe text exactly as before while
	        # avoiding the invalid "\|" escape sequence in a normal literal.
	        gr.Markdown("## 🧠 Multimodal Chatbot — Text \\| Image \\| Voice \\| PDF \\| Quiz")
	        # Label matches the model that generate_image_response calls.
	        gr.Markdown("Image chat uses GPT-4.1 \\| Text/PDF/Quiz use O1/O3-mini models")

	        # Extracted PDF text, kept between submissions.
	        pdf_content = gr.State("")

	        openai_api_key = gr.Textbox(label="🔑 OpenAI API Key", type="password", placeholder="sk-...")

	        input_type = gr.Radio(
	            ["Text", "Image", "Voice", "PDF", "PDF(QUIZ)"],
	            label="Choose Input Type",
	            value="Text",
	        )

	        input_text = gr.Textbox(label="Enter your question or text", lines=2, visible=True)
	        image_input = gr.Image(label="Upload Image", type="pil", visible=False)
	        audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False)
	        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], visible=False)
	        quiz_questions_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Quiz Questions", visible=False)
	        # Hidden checkbox; update_input_type sets its value per mode.
	        quiz_mode = gr.Checkbox(label="Quiz Mode", visible=False, value=False)

	        with gr.Row():
	            reasoning_effort = gr.Dropdown(["low", "medium", "high"], value="medium", label="Reasoning Effort (for Text/PDF)")
	            model_choice = gr.Dropdown(["o1", "o3-mini"], value="o1", label="Model (for Text/PDF/Quiz)")

	        submit_btn = gr.Button("Submit")
	        clear_btn = gr.Button("Clear Chat")

	        # NOTE(review): chatbot() appends {"role", "content"} dicts; on
	        # Gradio versions where Chatbot defaults to the tuple format this
	        # needs type="messages" - confirm against the pinned version.
	        chat_history = gr.Chatbot(label="Chat History")

	        # Show or hide the input widgets when the mode changes.
	        input_type.change(
	            fn=update_input_type,
	            inputs=[input_type],
	            outputs=[input_text, image_input, audio_input, pdf_input, quiz_questions_slider, quiz_mode],
	        )

	        # Extract text as soon as a PDF is uploaded.
	        pdf_input.change(fn=process_pdf, inputs=[pdf_input], outputs=[pdf_content])

	        # Submit: run the chatbot, then clear the inputs it returns.
	        submit_btn.click(
	            fn=chatbot,
	            inputs=[input_text, image_input, audio_input, pdf_input, openai_api_key, reasoning_effort, model_choice, pdf_content, quiz_questions_slider, quiz_mode, chat_history],
	            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history],
	        )

	        # Clear: reset every input, the saved PDF text and the history.
	        clear_btn.click(
	            fn=clear_history,
	            inputs=[],
	            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history],
	        )

	    return demo


	if __name__ == "__main__":
	    # Script entry point: build the interface, then start serving it.
	    demo = create_interface()
	    demo.launch()