Spaces:

Ansnaeem
/

MSDSF25M005

Sleeping

App Files Files Community

MSDSF25M005 / app.py

Ansnaeem

Upload folder using huggingface_hub

2819eb9 verified about 1 month ago

raw

history blame contribute delete

28.1 kB

	"""
	Version 5 — Multimodal AI Assistant (Image Generation)

	Extends Version 4 with:
	- On-demand image generation: after LLM response, decide if visual illustration helps
	- Text-to-image via HF Inference API (FLUX.1-schnell) or DALL-E 3
	- General Chat tab (free-form Q&A with image gen)
	- Model: llama-3.3-70b-versatile | STT: whisper-large-v3 | TTS: gTTS
	"""

	# Compatibility shim for Hugging Face Spaces: HfFolder was removed in
	# huggingface_hub 0.26+, so install a do-nothing replacement when it is absent.
	import huggingface_hub

	if not hasattr(huggingface_hub, "HfFolder"):

	    class _HfFolderStub:
	        """Inert stand-in exposing the two static methods of the old HfFolder."""

	        @staticmethod
	        def save_token(token):
	            # Intentionally a no-op: nothing is persisted.
	            pass

	        @staticmethod
	        def get_token():
	            # No stored token is ever reported.
	            return None

	    setattr(huggingface_hub, "HfFolder", _HfFolderStub)

	import json
	import os
	import re
	import tempfile
	import urllib3
	from urllib.parse import urlparse

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from dotenv import load_dotenv
	from groq import Groq
	from youtube_transcript_api import YouTubeTranscriptApi

	from storage import load_storage, save_storage, history_dicts_to_tuples

	# Merge variables from a local .env file (if any) into the environment first.
	load_dotenv()

	# Credentials and Bright Data zone names, all read from environment variables.
	GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
	BRIGHTDATA_API_KEY = os.environ.get("BRIGHTDATA_API_KEY")
	BRIGHTDATA_UNLOCKER_ZONE = os.environ.get("BRIGHTDATA_UNLOCKER_ZONE", "goodreads_unlocker")
	# A missing or blank YouTube zone falls back to the general unlocker zone.
	YOUTUBE_UNLOCKER_ZONE = (
	    os.environ.get("YOUTUBE_UNLOCKER_ZONE", "").strip() or BRIGHTDATA_UNLOCKER_ZONE
	)
	# Either variable name may carry the Hugging Face token; HF_TOKEN wins.
	HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

	# The Groq key is the only mandatory credential: fail at import time without it.
	if not GROQ_API_KEY:
	    raise ValueError("GROQ_API_KEY is not set.")

	client = Groq(api_key=GROQ_API_KEY)
	# Bright Data requests in this module use verify=False; silence the resulting warnings.
	urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

	# Per-tab text that the chat functions answer from, plus the persisted storage dict.
	contexts = {"scraper": "", "youtube": ""}
	storage = load_storage()

	# Model configuration
	LLM_MODEL = "llama-3.3-70b-versatile"  # chat code falls back to llama-3.1-8b-instant on failure
	IMAGE_MODELS_HF = [
	    "black-forest-labs/FLUX.1-schnell",
	    "stabilityai/stable-diffusion-xl-base-1.0",
	]

	# Whisper is a heavy model, so the pipeline is built on first use and then reused.
	_transcriber = None


	def _get_transcriber():
	    """Return the shared speech-recognition pipeline, creating it on the first call."""
	    global _transcriber
	    if _transcriber is not None:
	        return _transcriber

	    # Deferred import: transformers is only needed once audio is actually transcribed.
	    from transformers import pipeline

	    _transcriber = pipeline(
	        "automatic-speech-recognition",
	        model="openai/whisper-large-v3",
	        device=-1,  # NOTE(review): presumably selects CPU — confirm against transformers docs
	    )
	    return _transcriber


	# ---------------------------------------------------------------------------
	# Image Generation
	# ---------------------------------------------------------------------------

	_IMAGE_KEYWORDS = (
	"visually", "visual", "show me", "diagram", "image", "picture", "illustrate",
	"illustration", "draw", "sketch", "look like", "looks like", "what does",
	"architecture", "structure", "solar system", "transformer", "neural network",
	"show", "display", "see", "demonstrate",
	)

	def _should_generate_image(user_query: str, bot_response: str) -> bool:
	"""Decide if a visual illustration would help. Uses keyword check + optional LLM fallback."""
	q = (user_query or "").lower()
	for kw in _IMAGE_KEYWORDS:
	if kw in q or re.search(kw.replace(".", "."), q):
	return True
	# Optional: short LLM call for edge cases
	try:
	resp = client.chat.completions.create(
	model="llama-3.1-8b-instant",
	messages=[
	{"role": "system", "content": "Answer YES or NO only. Would a visual illustration help explain the user's question?"},
	{"role": "user", "content": f"User asked: {user_query[:200]}"},
	],
	max_tokens=10,
	temperature=0,
	)
	ans = (resp.choices[0].message.content or "").strip().upper()
	return "YES" in ans
	except Exception:
	return False


	def _generate_image_prompt(user_query: str) -> str:
	"""Create a condensed, descriptive prompt for the image model from the user query."""
	try:
	resp = client.chat.completions.create(
	model="llama-3.1-8b-instant",
	messages=[
	{"role": "system", "content": "Generate a short, descriptive image prompt (max 100 chars) for a text-to-image model. Describe the main visual subject. No quotes."},
	{"role": "user", "content": user_query[:300]},
	],
	max_tokens=80,
	temperature=0.5,
	)
	prompt = (resp.choices[0].message.content or user_query[:100]).strip()
	return prompt[:200] or user_query[:100]
	except Exception:
	return user_query[:150]


	def _generate_image(prompt: str) -> str \| None:
	"""Generate image via HF Inference API (FLUX) or DALL-E 3. Returns path or None."""
	if not prompt:
	return None
	# Try DALL-E 3 first if OpenAI key is set
	if OPENAI_API_KEY:
	try:
	from openai import OpenAI
	oai = OpenAI(api_key=OPENAI_API_KEY)
	resp = oai.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024", n=1)
	url = resp.data[0].url
	r = requests.get(url, timeout=30)
	r.raise_for_status()
	fd, path = tempfile.mkstemp(suffix=".png")
	os.close(fd)
	with open(path, "wb") as f:
	f.write(r.content)
	return path
	except Exception as e:
	import logging
	logging.warning(f"DALL-E 3 image gen failed: {e}")
	# HF Inference Providers (router.huggingface.co - old api-inference is deprecated)
	if HF_TOKEN:
	try:
	from huggingface_hub import InferenceClient
	hf_client = InferenceClient(provider="auto", api_key=HF_TOKEN)
	image = None
	for model_id in IMAGE_MODELS_HF:
	try:
	image = hf_client.text_to_image(prompt, model=model_id)
	if image is not None:
	break
	except Exception:
	continue
	if image is None:
	raise ValueError("All HF image models failed")
	fd, path = tempfile.mkstemp(suffix=".png")
	os.close(fd)
	if hasattr(image, "save"):
	image.save(path)
	else:
	with open(path, "wb") as f:
	f.write(image if isinstance(image, bytes) else image)
	return path
	except Exception as e:
	import logging
	logging.warning(f"HF image gen failed: {e}")
	return None


	# ---------------------------------------------------------------------------
	# Tab 1: Bot-Protected Website Scraper
	# ---------------------------------------------------------------------------

	def _goodreads_list_text(soup) -> str:
	    """Render Goodreads list rows in *soup* as numbered lines; return "" when none are found."""
	    lines = []
	    rows = soup.find_all("tr", itemtype="http://schema.org/Book")
	    for rank, row in enumerate(rows, start=1):
	        title_elem = row.find("a", class_="bookTitle")
	        author_elem = row.find("a", class_="authorName")
	        rating_elem = row.find("span", class_="minirating")
	        title = title_elem.text.strip() if title_elem else "Unknown Title"
	        author = author_elem.text.strip() if author_elem else "Unknown Author"
	        rating = rating_elem.text.strip() if rating_elem else "Unknown Rating"
	        lines.append(f"{rank}. {title} by {author} - {rating}")
	    if not lines:
	        return ""
	    header = "Here is the scraped data from Goodreads Best Books Ever list:\n"
	    return "\n".join([header, *lines])


	def scrape_website(url: str):
	    """Fetch *url* through the Bright Data request API and store its text for chat.

	    A bare Goodreads home-page URL is redirected to the "Best Books Ever" list,
	    whose rows are rendered as a ranked list. Every other page is reduced to
	    its visible text. The first 15000 characters are stored in
	    ``contexts["scraper"]``; on any failure that entry is cleared.

	    Args:
	        url: Address to scrape. Surrounding whitespace is ignored.

	    Returns:
	        ``(status_message, preview)`` where the preview is at most 500
	        characters of the scraped text and is empty on failure.
	    """
	    url = (url or "").strip()
	    if not url:
	        return "Please enter a URL.", ""

	    # Without a key the request would go out as "Bearer None"; fail clearly instead.
	    if not BRIGHTDATA_API_KEY:
	        contexts["scraper"] = ""
	        return "BRIGHTDATA_API_KEY is not set. Add it to the environment to scrape websites.", ""

	    parsed = urlparse(url)
	    target_url = url
	    if "goodreads.com" in (parsed.netloc or "") and (parsed.path in ("", "/")):
	        target_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever"

	    api_url = "https://api.brightdata.com/request"
	    headers = {
	        "Authorization": f"Bearer {BRIGHTDATA_API_KEY}",
	        "Content-Type": "application/json",
	    }
	    payload = {
	        "zone": BRIGHTDATA_UNLOCKER_ZONE,
	        "url": target_url,
	        "format": "raw",
	        "method": "GET",
	    }

	    try:
	        # NOTE(review): TLS verification is disabled for this request (verify=False).
	        resp = requests.post(api_url, json=payload, headers=headers, timeout=120, verify=False)
	        if resp.status_code in (400, 401):
	            contexts["scraper"] = ""
	            try:
	                err_body = resp.json()
	            except Exception:
	                err_body = resp.text[:500] if resp.text else ""
	            return (
	                f"Bright Data error ({resp.status_code}): {resp.reason}. Details: {err_body}. "
	                "Check BRIGHTDATA_API_KEY and BRIGHTDATA_UNLOCKER_ZONE.",
	                "",
	            )
	        resp.raise_for_status()
	        soup = BeautifulSoup(resp.text, "html.parser")

	        text_content = ""
	        if "goodreads.com/list/show/1.Best_Books_Ever" in target_url:
	            text_content = _goodreads_list_text(soup)
	        if not text_content:
	            # Generic page, or a Goodreads page without recognisable book rows.
	            text_content = soup.get_text(separator=" ", strip=True)

	        contexts["scraper"] = text_content[:15000]
	        preview = text_content[:500] + "..." if len(text_content) > 500 else text_content
	        return "Website scraped successfully. You can now chat about it.", preview
	    except Exception as e:
	        contexts["scraper"] = ""
	        return f"Error scraping website: {e}", ""


	# ---------------------------------------------------------------------------
	# Tab 2: YouTube Transcript Q&A
	# ---------------------------------------------------------------------------

	_YOUTUBE_ID_REGEX = re.compile(
	r"(https?://)?(www\.)?"
	r"(youtube\|youtu\|youtube-nocookie)\.(com\|be)/"
	r"(watch\?v=\|embed/\|v/\|.+\?v=)?([^&=%\?]{11})"
	)

	def _extract_video_id(url_or_id: str) -> str:
	match = _YOUTUBE_ID_REGEX.search(url_or_id)
	return match.group(6) if match else url_or_id.strip()


	def _brightdata_get(url: str, zone: str, timeout: int):
	    """Ask the Bright Data request API to GET *url* through *zone*; return the raw response."""
	    headers = {
	        "Authorization": f"Bearer {BRIGHTDATA_API_KEY}",
	        "Content-Type": "application/json",
	    }
	    payload = {"zone": zone, "url": url, "format": "raw", "method": "GET"}
	    # NOTE(review): TLS verification is disabled for this request (verify=False).
	    return requests.post(
	        "https://api.brightdata.com/request",
	        json=payload,
	        headers=headers,
	        timeout=timeout,
	        verify=False,
	    )


	def _parse_player_response(html: str) -> dict:
	    """Extract the ``ytInitialPlayerResponse`` JSON object embedded in a watch page."""
	    match = re.search(r"ytInitialPlayerResponse\s*=\s*(\{)", html)
	    if not match:
	        raise ValueError("Could not find caption data on the video page.")
	    try:
	        # raw_decode parses exactly one JSON value and ignores whatever follows.
	        # Unlike counting braces by hand, it is not confused by "{" or "}"
	        # characters that appear inside string values.
	        player, _end = json.JSONDecoder().raw_decode(html[match.start(1):])
	    except ValueError as err:
	        raise ValueError("Could not parse caption data from the video page.") from err
	    return player


	def _fetch_transcript_via_brightdata(video_id: str) -> str:
	    """Fetch a video's caption text by loading YouTube through Bright Data.

	    The watch page is downloaded, the first caption track URL is read from the
	    embedded player response, and that track is downloaded in ``json3`` format.

	    Args:
	        video_id: YouTube video ID.

	    Returns:
	        All non-empty caption segments joined with single spaces.

	    Raises:
	        ValueError: If the API key is missing, the zone rejects the URL, or
	            the page carries no usable caption data.
	        requests.HTTPError: If either Bright Data request returns an error status.
	    """
	    if not BRIGHTDATA_API_KEY:
	        raise ValueError("BRIGHTDATA_API_KEY is required for YouTube transcript on this environment.")

	    zone = YOUTUBE_UNLOCKER_ZONE
	    watch_url = f"https://www.youtube.com/watch?v={video_id}"
	    resp = _brightdata_get(watch_url, zone, timeout=120)
	    if resp.status_code == 400:
	        raise ValueError(f"Bright Data zone '{zone}' rejected the YouTube URL.")
	    resp.raise_for_status()

	    player = _parse_player_response(resp.text)
	    captions = player.get("captions") or {}
	    renderer = captions.get("playerCaptionsTracklistRenderer") or {}
	    tracks = renderer.get("captionTracks") or []
	    if not tracks:
	        raise ValueError("No transcript available for this video.")
	    base_url = tracks[0].get("baseUrl", "")
	    if not base_url:
	        raise ValueError("No caption track URL found.")

	    caption_url = base_url + ("&" if "?" in base_url else "?") + "fmt=json3"
	    resp2 = _brightdata_get(caption_url, zone, timeout=60)
	    resp2.raise_for_status()
	    caption_data = resp2.json()

	    pieces = []
	    for event in caption_data.get("events") or []:
	        for seg in event.get("segs") or []:
	            text = (seg.get("utf8") or "").strip()
	            if text:
	                pieces.append(text)
	    return " ".join(pieces)


	def fetch_youtube_transcript(video_input: str):
	    """Fetch a video's transcript and store it as the YouTube chat context.

	    The direct ``YouTubeTranscriptApi`` route is tried first. If it fails with
	    what looks like a network error and a Bright Data key is configured, the
	    Bright Data route is used instead. On success the first 15000 characters
	    land in ``contexts["youtube"]``; on failure that entry is cleared.

	    Returns:
	        ``(status_message, preview)`` where the preview is at most 500
	        characters of transcript and is empty on failure.
	    """

	    def preview_of(text):
	        # Long transcripts are cut to 500 characters with an ellipsis.
	        if len(text) > 500:
	            return text[:500] + "..."
	        return text

	    if not video_input:
	        return "Please enter a YouTube Video URL or ID.", ""

	    video_id = _extract_video_id(video_input)
	    try:
	        listing = YouTubeTranscriptApi().list(video_id)
	        chosen = None
	        try:
	            chosen = listing.find_transcript(["en", "ur"])
	        except Exception:
	            try:
	                chosen = listing.find_generated_transcript(["en", "ur"])
	            except Exception:
	                # Last resort: whichever transcript the listing yields first.
	                for candidate in listing:
	                    chosen = candidate
	                    break
	        if chosen is None:
	            raise Exception("No transcript available for this video.")

	        # Entries may be plain dicts or objects carrying a .text attribute.
	        texts = [
	            entry.get("text", "") if isinstance(entry, dict) else getattr(entry, "text", "")
	            for entry in chosen.fetch()
	        ]
	        transcript_text = " ".join(texts)
	        contexts["youtube"] = transcript_text[:15000]
	        return (
	            "Transcript fetched successfully. You can now chat about the video.",
	            preview_of(transcript_text),
	        )
	    except Exception as e:
	        lowered = str(e).lower()
	        network_markers = ("resolve", "hostname", "no address", "max retries")
	        is_network_error = any(marker in lowered for marker in network_markers)

	        if is_network_error and BRIGHTDATA_API_KEY:
	            try:
	                transcript_text = _fetch_transcript_via_brightdata(video_id)
	                contexts["youtube"] = transcript_text[:15000]
	                return (
	                    "Transcript fetched via Bright Data. You can now chat about the video.",
	                    preview_of(transcript_text),
	                )
	            except Exception as fallback_err:
	                contexts["youtube"] = ""
	                return f"Direct fetch failed. Bright Data fallback failed: {fallback_err}", ""

	        contexts["youtube"] = ""
	        if is_network_error:
	            return (
	                "YouTube transcript fetching failed (network restricted). Add BRIGHTDATA_API_KEY and YOUTUBE_UNLOCKER_ZONE.",
	                "",
	            )
	        return f"Error: No transcript for video ID ({video_id}). Details: {e}", ""


	# ---------------------------------------------------------------------------
	# Multi-turn chat with image generation
	# ---------------------------------------------------------------------------

	def _build_system_prompt(mode: str) -> str:
	    """Compose the context-restricted system prompt for the given chat mode.

	    The prompt embeds the stored context for *mode*, or a placeholder that
	    tells the model to refuse when nothing has been scraped or fetched yet.
	    Saved user preferences, if any, are appended at the end.
	    """
	    stored_context = contexts.get(mode, "")
	    preferences = storage.get("user_preferences", "").strip()

	    if stored_context:
	        context_text = stored_context.strip()
	    else:
	        context_text = "(None — the user has NOT scraped or fetched transcript yet. You must refuse to answer and tell them to scrape/fetch first.)"

	    sections = [
	        "You are a helpful assistant. You must use ONLY the provided context to answer. ",
	        "NEVER use external knowledge. If the context says 'None' or the user has not scraped yet, refuse to answer and tell them to scrape or fetch transcript first. ",
	        "If the answer is not in the context, say so. You have conversation history for follow-up questions.\n\n",
	        f"Context:\n{context_text}",
	    ]
	    if preferences:
	        sections.append(f"\n\nUser preferences (follow these):\n{preferences}")
	    return "".join(sections)


	def _chat_with_image(mode: str, message: str, history_key: str, system_prompt_fn):
	    """Run one chat turn, optionally generate an illustration, and persist the history.

	    Args:
	        mode: ``"general"`` for free-form chat; any other value names the
	            entry in ``contexts`` that must be non-empty before answering.
	        message: The user's message. Blank messages leave the history unchanged.
	        history_key: Key in ``storage`` holding this tab's message history.
	        system_prompt_fn: Zero-argument callable returning the system prompt.

	    Returns:
	        ``(clear_msg, history_tuples, image_path)``: an empty string to clear
	        the input box, the history converted for the chatbot widget, and the
	        generated image path or ``None``.
	    """
	    user_text = (message or "").strip()
	    if not user_text:
	        return "", history_dicts_to_tuples(storage.get(history_key, [])), None

	    history_dicts = list(storage.get(history_key, []))
	    history_dicts.append({"role": "user", "content": user_text})

	    # Context-bound modes cannot answer until something has been scraped or fetched.
	    if mode != "general" and not (contexts.get(mode, "") or "").strip():
	        history_dicts.append({"role": "assistant", "content": "Please scrape a website or fetch a transcript first, then ask questions."})
	        storage[history_key] = history_dicts
	        save_storage(storage)
	        return "", history_dicts_to_tuples(history_dicts), None

	    messages = [{"role": "system", "content": system_prompt_fn()}]
	    messages.extend(
	        {"role": m["role"], "content": m.get("content", "")}
	        for m in history_dicts
	        if m.get("role") in ("user", "assistant")
	    )

	    # Primary model first, then the smaller fallback. If both fail, the
	    # primary model's error is reported.
	    reply = None
	    first_error = None
	    for model_name in (LLM_MODEL, "llama-3.1-8b-instant"):
	        try:
	            resp = client.chat.completions.create(
	                model=model_name,
	                messages=messages,
	                max_tokens=1024,
	                temperature=0.3,
	            )
	            # content may be None; normalise so the += below cannot fail.
	            reply = resp.choices[0].message.content or ""
	            break
	        except Exception as err:
	            if first_error is None:
	                first_error = err
	    if reply is None:
	        reply = f"Error communicating with Groq: {first_error}"

	    image_path = None
	    if _should_generate_image(user_text, reply):
	        img_prompt = _generate_image_prompt(user_text)
	        image_path = _generate_image(img_prompt)
	        if image_path:
	            reply += "\n\nGenerated illustration below."
	        elif HF_TOKEN:
	            reply += "\n\n(Image generation failed — check HF Inference Providers at hf.co/settings/inference-providers)"

	    history_dicts.append({"role": "assistant", "content": reply})
	    storage[history_key] = history_dicts
	    save_storage(storage)
	    return "", history_dicts_to_tuples(history_dicts), image_path


	def chat_turn_scraper(message: str, _history_ignored):
	    """Handle one chat turn on the scraper tab; the widget's history argument is unused."""
	    cleared, history, image_path = _chat_with_image(
	        "scraper",
	        message,
	        "scraper_history",
	        lambda: _build_system_prompt("scraper"),
	    )
	    return cleared, history, image_path


	def chat_turn_youtube(message: str, _history_ignored):
	    """Handle one chat turn on the YouTube tab; the widget's history argument is unused."""
	    cleared, history, image_path = _chat_with_image(
	        "youtube",
	        message,
	        "youtube_history",
	        lambda: _build_system_prompt("youtube"),
	    )
	    return cleared, history, image_path


	def chat_turn_general(message: str, _history_ignored):
	    """Handle one free-form chat turn with image generation and no context restriction."""
	    if not (message and message.strip()):
	        return "", history_dicts_to_tuples(storage.get("general_history", [])), None

	    preferences = storage.get("user_preferences", "").strip()
	    prompt_parts = ["You are a helpful assistant. Answer concisely. You may use general knowledge."]
	    if preferences:
	        prompt_parts.append(f"\n\nUser preferences (follow these):\n{preferences}")
	    system_prompt = "".join(prompt_parts)

	    cleared, history, image_path = _chat_with_image(
	        "general",
	        message,
	        "general_history",
	        lambda: system_prompt,
	    )
	    return cleared, history, image_path


	def save_preferences(prefs: str):
	    """Store the preference text (empty string when falsy), persist it, and return a status line."""
	    # Only an item of the module-level dict is mutated, so no rebinding is needed.
	    storage["user_preferences"] = prefs if prefs else ""
	    save_storage(storage)
	    return "Preferences saved."


	# ---------------------------------------------------------------------------
	# Tab: Voice Assistant (Speech-to-Text → LLM → Text-to-Speech + optional Image)
	# ---------------------------------------------------------------------------

	def _resolve_audio_path(audio_input):
	    """Pull a file path out of the value handed over by the audio widget.

	    Handles a tuple (first element), a dict (``"name"`` then ``"path"``), or a
	    plain path value.
	    """
	    if isinstance(audio_input, tuple):
	        return audio_input[0]
	    if isinstance(audio_input, dict):
	        return audio_input.get("name") or audio_input.get("path", "")
	    return audio_input


	def voice_chatbot(audio_input):
	    """Transcribe speech, answer it with the LLM, speak the answer, and optionally illustrate it.

	    Args:
	        audio_input: Recording from the audio widget (path, tuple, or dict).

	    Returns:
	        ``(bot_text, audio_path, user_text, image_path)``. ``audio_path`` is
	        ``None`` when speech synthesis fails and ``image_path`` is ``None``
	        when no image was generated. Early failures return a message in
	        ``bot_text`` with the remaining fields empty.
	    """
	    if audio_input is None:
	        return "No audio received.", None, "", None

	    audio_path = _resolve_audio_path(audio_input)
	    if (
	        not audio_path
	        or not isinstance(audio_path, (str, os.PathLike))
	        or not os.path.isfile(audio_path)
	    ):
	        return "Invalid audio file.", None, "", None

	    try:
	        transcription = _get_transcriber()(audio_path)
	        user_text = (transcription.get("text") or "").strip() or "(no speech detected)"
	    except Exception as e:
	        return f"Transcription error: {e}", None, "", None

	    prefs = storage.get("user_preferences", "").strip()
	    system_prompt = "You are a helpful assistant. Keep answers concise."
	    if prefs:
	        system_prompt += f"\n\nUser preferences (follow these):\n{prefs}"
	    messages = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_text},
	    ]

	    # Primary model first, then the smaller fallback. If both fail, the
	    # fallback's error is reported.
	    bot_text = None
	    last_error = None
	    for model_name in (LLM_MODEL, "llama-3.1-8b-instant"):
	        try:
	            resp = client.chat.completions.create(
	                model=model_name,
	                messages=messages,
	                max_tokens=512,
	                temperature=0.7,
	            )
	            # content may be None; normalise so later concatenation cannot fail.
	            bot_text = resp.choices[0].message.content or ""
	            break
	        except Exception as err:
	            last_error = err
	    if bot_text is None:
	        bot_text = f"Error communicating with Groq: {last_error}"

	    image_path = None
	    if _should_generate_image(user_text, bot_text):
	        img_prompt = _generate_image_prompt(user_text)
	        image_path = _generate_image(img_prompt)
	        if image_path:
	            bot_text += "\n\nGenerated illustration below."
	        elif HF_TOKEN:
	            bot_text += "\n\n(Image generation failed — check HF Inference Providers)"

	    output_path = None
	    try:
	        from gtts import gTTS

	        fd, tmp_path = tempfile.mkstemp(suffix=".mp3")
	        os.close(fd)
	        try:
	            gTTS(text=bot_text, lang="en").save(tmp_path)
	        except Exception:
	            # Do not leave an empty temp file behind when synthesis fails.
	            try:
	                os.remove(tmp_path)
	            except OSError:
	                pass
	            raise
	        output_path = tmp_path
	    except Exception as e:
	        bot_text += f"\n\n(TTS error: {e})"

	    return bot_text, output_path, user_text, image_path


	# ---------------------------------------------------------------------------
	# Gradio UI
	# ---------------------------------------------------------------------------

	# Page layout: a preferences column on the left and a tabbed work area on the
	# right. Every chat tab wires its message box to a chat_turn_* handler that
	# returns (cleared input, updated history, optional image path).
	with gr.Blocks(title="Scraper Bot v5 — Multimodal AI (Image Gen)") as demo:
	    gr.Markdown("# Version 5 — Multimodal AI Assistant (Image Generation)")
	    gr.Markdown(
	        "Model: llama-3.3-70b-versatile | STT: whisper-large-v3 | TTS: gTTS (🔓 Open-source) \n"
	        "Extends v4 with on-demand image generation: ask for visual explanations and get illustrations via FLUX.1-schnell (HF) or DALL-E 3."
	    )

	    with gr.Row():
	        # Left column: free-text user preferences that are added to the system prompts.
	        with gr.Column(scale=1):
	            gr.Markdown("### User preferences")
	            prefs_input = gr.Textbox(
	                label="Preferences",
	                value=storage.get("user_preferences", ""),
	                placeholder="e.g., Always respond formally.",
	                lines=3,
	            )
	            save_prefs_btn = gr.Button("Save preferences")
	            prefs_status = gr.Textbox(label="Status", interactive=False)
	            save_prefs_btn.click(save_preferences, inputs=[prefs_input], outputs=[prefs_status])

	        # Right column: one tab per assistant mode.
	        with gr.Column(scale=2):
	            with gr.Tabs():
	                with gr.TabItem("General Chat (Image Gen)"):
	                    gr.Markdown("Free-form Q&A with optional image generation. Try: Explain the solar system visually or Show me what a transformer architecture looks like.")
	                    gen_chatbot = gr.Chatbot(
	                        value=history_dicts_to_tuples(storage.get("general_history", [])),
	                        height=300,
	                        label="Chat",
	                    )
	                    gen_img_out = gr.Image(label="Generated illustration", type="filepath")
	                    gen_msg = gr.Textbox(label="Message", placeholder="Ask anything. Say 'visually' or 'show me' for images.")
	                    gen_msg.submit(
	                        lambda m, h: chat_turn_general(m, h),
	                        inputs=[gen_msg, gen_chatbot],
	                        outputs=[gen_msg, gen_chatbot, gen_img_out],
	                    )

	                with gr.TabItem("Bot-Protected Website Scraper"):
	                    gr.Markdown("Scrape a URL, then chat. Image gen when you ask for visual explanations.")
	                    with gr.Row():
	                        url_input = gr.Textbox(label="URL", placeholder="https://www.goodreads.com/", scale=3)
	                        scrape_btn = gr.Button("Scrape URL", scale=1)
	                    scrape_status = gr.Textbox(label="Status", interactive=False)
	                    scrape_preview = gr.Textbox(label="Content preview", interactive=False, lines=4)
	                    scraper_chatbot = gr.Chatbot(
	                        value=history_dicts_to_tuples(storage.get("scraper_history", [])),
	                        height=280,
	                        label="Chat",
	                    )
	                    scraper_img_out = gr.Image(label="Generated illustration", type="filepath")
	                    scraper_msg = gr.Textbox(label="Message", placeholder="e.g., What are the top 5 books?")
	                    scrape_btn.click(scrape_website, inputs=[url_input], outputs=[scrape_status, scrape_preview])
	                    scraper_msg.submit(
	                        lambda m, h: chat_turn_scraper(m, h),
	                        inputs=[scraper_msg, scraper_chatbot],
	                        outputs=[scraper_msg, scraper_chatbot, scraper_img_out],
	                    )

	                with gr.TabItem("YouTube Transcript Q&A"):
	                    gr.Markdown("Fetch transcript, then chat. Image gen when you ask for visual explanations.")
	                    with gr.Row():
	                        yt_input = gr.Textbox(label="YouTube URL or ID", placeholder="dQw4w9WgXcQ", scale=3)
	                        yt_btn = gr.Button("Get Transcript", scale=1)
	                    yt_status = gr.Textbox(label="Status", interactive=False)
	                    yt_preview = gr.Textbox(label="Transcript preview", interactive=False, lines=4)
	                    yt_chatbot = gr.Chatbot(
	                        value=history_dicts_to_tuples(storage.get("youtube_history", [])),
	                        height=280,
	                        label="Chat",
	                    )
	                    yt_img_out = gr.Image(label="Generated illustration", type="filepath")
	                    yt_msg = gr.Textbox(label="Message", placeholder="e.g., Summarize the video.")
	                    yt_btn.click(fetch_youtube_transcript, inputs=[yt_input], outputs=[yt_status, yt_preview])
	                    yt_msg.submit(
	                        lambda m, h: chat_turn_youtube(m, h),
	                        inputs=[yt_msg, yt_chatbot],
	                        outputs=[yt_msg, yt_chatbot, yt_img_out],
	                    )

	                with gr.TabItem("Voice Assistant"):
	                    gr.Markdown("Speak your query. Get text + audio + optional image.")
	                    mic_input = gr.Audio(
	                        label="Speak",
	                        sources=["microphone"],
	                        type="filepath",
	                    )
	                    user_text_box = gr.Textbox(label="Transcribed text", interactive=False)
	                    bot_text_output = gr.Textbox(label="Bot text response", interactive=False)
	                    bot_audio_output = gr.Audio(label="Bot audio response", type="filepath", interactive=False)
	                    voice_img_out = gr.Image(label="Generated illustration", type="filepath")

	                    # The handler runs whenever the recorded audio value changes.
	                    mic_input.change(
	                        fn=voice_chatbot,
	                        inputs=mic_input,
	                        outputs=[bot_text_output, bot_audio_output, user_text_box, voice_img_out],
	                    )

	if __name__ == "__main__":
	    # SPACE_ID set (presumably by Hugging Face Spaces): launch with defaults.
	    # Otherwise bind to localhost on a fixed port.
	    if os.environ.get("SPACE_ID"):
	        launch_kwargs = {}
	    else:
	        launch_kwargs = {"server_name": "127.0.0.1", "server_port": 7866}
	    demo.launch(**launch_kwargs)