# NOTE: Hugging Face Spaces page residue captured with the source:
#   "Spaces: Sleeping Sleeping" (Space runtime status banner, not code).
import os
import logging
import gradio as gr
from huggingface_hub import InferenceClient

# Configure logging
# Module-wide logging: timestamped, pipe-separated lines at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Environment variables for configuration
# HF_TOKEN is optional; without it the Inference API is called anonymously
# (subject to stricter rate limits). Model IDs can be overridden per deploy.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
CAPTION_MODEL = os.environ.get("CAPTION_MODEL", "Salesforce/blip-image-captioning-base")
VQA_MODEL = os.environ.get("VQA_MODEL", "dandelin/vilt-b32-finetuned-vqa")

# Log only whether a token is present, never the token itself.
logger.info(f"HF_TOKEN configured: {bool(HF_TOKEN)}")
logger.info(f"CAPTION_MODEL: {CAPTION_MODEL}")
logger.info(f"VQA_MODEL: {VQA_MODEL}")

# Shared client for all inference calls below (captioning and VQA).
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
logger.info("InferenceClient initialized")
def caption_image(image):
    """Generate a caption for the image.

    Args:
        image: A PIL image from the Gradio ``gr.Image`` input, or ``None``
            when the user has not uploaded anything yet.

    Returns:
        The generated caption text, or a user-facing error/prompt string.
    """
    logger.info(f"caption_image() called | image={image is not None}")

    # Guard clause: nothing to caption yet.
    if image is None:
        logger.warning("No image provided")
        return "π· Upload an image first!"

    try:
        logger.info(f"Calling image_to_text | model={CAPTION_MODEL}")
        caption = client.image_to_text(image, model=CAPTION_MODEL).generated_text
        # Truncate long captions in the log only; the full text is returned.
        logger.info(f"Caption: {caption[:100]}")
        return caption
    except Exception as e:
        # Surface remote/API failures to the UI instead of crashing the app.
        logger.error(f"API error: {e}")
        return f"β Error: {e}"
def answer_question(image, question: str):
    """Answer a question about the image.

    Args:
        image: A PIL image from the Gradio ``gr.Image`` input, or ``None``.
        question: The user's question text; may be ``None`` or blank when
            the textbox is empty.

    Returns:
        A formatted answer string with a confidence percentage, or a
        user-facing error/prompt string.
    """
    logger.info(f"answer_question() called | image={image is not None} | question={question[:30] if question else 'None'}")
    if image is None:
        logger.warning("No image provided")
        return "π· Upload an image first!"
    # FIX: the logging line above already anticipates question=None, but the
    # original called question.strip() unconditionally, raising AttributeError
    # when Gradio delivers None. Guard for None before stripping.
    if not question or not question.strip():
        logger.warning("No question provided")
        return "β Ask a question!"
    try:
        logger.info(f"Calling visual_question_answering | model={VQA_MODEL}")
        result = client.visual_question_answering(image=image, question=question, model=VQA_MODEL)
        # FIX: an empty answer list previously raised IndexError (caught below
        # with a cryptic message); report it explicitly instead.
        if not result:
            logger.warning("VQA returned no answers")
            return "β Error: no answer returned"
        top = result[0]
        logger.info(f"Answer: {top.answer} ({top.score:.1%})")
        return f"π€ {top.answer} (confidence: {top.score:.1%})"
    except Exception as e:
        logger.error(f"API error: {e}")
        return f"β Error: {e}"
logger.info("Building Gradio interface...")

# Two-column layout: image + caption button on the left, caption output and
# the question/answer widgets on the right.
with gr.Blocks(title="Vision Chat") as demo:
    gr.Markdown("# ποΈ Vision Chat\nUpload an image, get a caption, and ask questions about it!")
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            img = gr.Image(type="pil", label="π· Your Image")
            caption_btn = gr.Button("β¨ Generate Caption", variant="primary")
        with gr.Column(scale=1):
            caption_out = gr.Textbox(label="Caption", lines=2, interactive=False)
            question = gr.Textbox(label="β Ask a question", placeholder="What color is the animal?")
            ask_btn = gr.Button("Ask", variant="secondary")
            answer_out = gr.Textbox(label="Answer", lines=2, interactive=False)
    # Wire events: button click for captioning; both button click and
    # textbox Enter-submit trigger the same VQA handler.
    caption_btn.click(caption_image, inputs=img, outputs=caption_out)
    ask_btn.click(answer_question, inputs=[img, question], outputs=answer_out)
    question.submit(answer_question, inputs=[img, question], outputs=answer_out)

# Enable request queuing (serializes inference calls), then start the server.
demo.queue()
logger.info("Starting Gradio server...")
demo.launch()