"""Vision Chat: a Gradio app that captions an image and answers questions about it.

Both features call the Hugging Face Inference API through ``InferenceClient``.
Configuration is read from environment variables:

- ``HF_TOKEN``: optional API token; when empty the client is created without one.
- ``CAPTION_MODEL``: model id used for image captioning.
- ``VQA_MODEL``: model id used for visual question answering.
"""

import logging
import os

import gradio as gr
from huggingface_hub import InferenceClient

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Environment variables for configuration
HF_TOKEN = os.environ.get("HF_TOKEN", "")
CAPTION_MODEL = os.environ.get("CAPTION_MODEL", "Salesforce/blip-image-captioning-base")
VQA_MODEL = os.environ.get("VQA_MODEL", "dandelin/vilt-b32-finetuned-vqa")

# Log only whether a token is present, never the token value itself.
logger.info("HF_TOKEN configured: %s", bool(HF_TOKEN))
logger.info("CAPTION_MODEL: %s", CAPTION_MODEL)
logger.info("VQA_MODEL: %s", VQA_MODEL)

client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
logger.info("InferenceClient initialized")


def caption_image(image) -> str:
    """Generate a caption for the image.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded. The
            UI below supplies it from ``gr.Image(type="pil")``.

    Returns:
        The generated caption, a prompt to upload an image when ``image`` is
        ``None``, or an error message when the API call fails.
    """
    logger.info("caption_image() called | image=%s", image is not None)
    if image is None:
        logger.warning("No image provided")
        return "📷 Upload an image first!"

    try:
        logger.info("Calling image_to_text | model=%s", CAPTION_MODEL)
        # NOTE(review): the image is forwarded to the client as received from
        # Gradio — confirm the installed huggingface_hub accepts PIL images.
        result = client.image_to_text(image, model=CAPTION_MODEL)
        caption = result.generated_text
        logger.info("Caption: %s", caption[:100])
        return caption
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # handler, and keep the traceback in the server log.
        logger.exception("API error: %s", e)
        return f"❌ Error: {e}"


def answer_question(image, question: str) -> str:
    """Answer a question about the image.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        question: The user's question. ``None``, empty, and whitespace-only
            values are all treated as "no question".

    Returns:
        The top answer with its confidence, a prompt for the missing input, a
        notice when the model returns no answers, or an error message when
        the API call fails.
    """
    logger.info(
        "answer_question() called | image=%s | question=%s",
        image is not None,
        question[:30] if question else "None",
    )
    if image is None:
        logger.warning("No image provided")
        return "📷 Upload an image first!"
    # Check falsiness first so a None question cannot raise AttributeError
    # on .strip() outside the try block.
    if not question or not question.strip():
        logger.warning("No question provided")
        return "❓ Ask a question!"

    try:
        logger.info("Calling visual_question_answering | model=%s", VQA_MODEL)
        result = client.visual_question_answering(
            image=image,
            question=question,
            model=VQA_MODEL,
        )
        if not result:
            # An empty answer list would otherwise surface to the user as
            # "list index out of range".
            logger.warning("Model returned no answers")
            return "🤷 No answer returned by the model."
        top = result[0]
        confidence = f"{top.score:.1%}"
        logger.info("Answer: %s (%s)", top.answer, confidence)
        return f"🤖 {top.answer} (confidence: {confidence})"
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # handler, and keep the traceback in the server log.
        logger.exception("API error: %s", e)
        return f"❌ Error: {e}"


logger.info("Building Gradio interface...")

with gr.Blocks(title="Vision Chat") as demo:
    gr.Markdown("# 👁️ Vision Chat\nUpload an image, get a caption, and ask questions about it!")

    with gr.Row(equal_height=True):
        # Left column: image upload and the caption trigger.
        with gr.Column(scale=1):
            img = gr.Image(type="pil", label="📷 Your Image")
            caption_btn = gr.Button("✨ Generate Caption", variant="primary")

        # Right column: caption output and the question/answer controls.
        with gr.Column(scale=1):
            caption_out = gr.Textbox(label="Caption", lines=2, interactive=False)
            question = gr.Textbox(label="❓ Ask a question", placeholder="What color is the animal?")
            ask_btn = gr.Button("Ask", variant="secondary")
            answer_out = gr.Textbox(label="Answer", lines=2, interactive=False)

    caption_btn.click(caption_image, inputs=img, outputs=caption_out)
    # Clicking "Ask" and pressing Enter in the question box run the same handler.
    ask_btn.click(answer_question, inputs=[img, question], outputs=answer_out)
    question.submit(answer_question, inputs=[img, question], outputs=answer_out)

demo.queue()
logger.info("Starting Gradio server...")
demo.launch()