import gradio as gr
from transformers import (
    pipeline,
    CLIPProcessor,
    CLIPModel,
    VisionEncoderDecoderModel,
    TrOCRProcessor,
)
from PIL import Image
import torch

# --- Hugging Face Models (loaded once at import time) ---

# 1. Image-to-Text (captioning)
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# 2. CLIP for image-text & image-image search
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 3. Optical Character Recognition (OCR)
ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")


# --- Functions ---

def multi_agent_inference(image, search_text=""):
    """Run four independent vision strategies on one image.

    Args:
        image: PIL image from the Gradio upload widget (may be None when the
            user clicks Run without uploading — handled explicitly below).
        search_text: optional text to score against the image with CLIP
            (may be None when the textbox is cleared — handled below).

    Returns:
        list[tuple[str, str]]: one (strategy name, result) pair per strategy,
        suitable for display in a Gradio Dataframe. Failures of an individual
        strategy are reported inline rather than aborting the whole run.
    """
    # Guard: Gradio sends None when no image was uploaded; every strategy
    # below would otherwise crash with an opaque traceback.
    if image is None:
        return [("Error", "Please upload an image first.")]

    # Guard: a cleared Textbox can yield None, and None.strip() raises.
    search_text = (search_text or "").strip()

    outputs = []

    # Strategy 1: Image -> Text captioning
    try:
        cap = captioner(image)[0]["generated_text"]
        outputs.append(("Image Captioning", cap))
    except Exception as e:
        outputs.append(("Image Captioning", f"[failed] {e}"))

    # Strategy 2: Text search in Image (CLIP similarity)
    if search_text:
        try:
            inputs = clip_processor(
                text=[search_text], images=image, return_tensors="pt", padding=True
            )
            with torch.no_grad():
                logits_per_image = clip_model(**inputs).logits_per_image
            score = logits_per_image.item()
            outputs.append(("Text-in-Image Search", f"Similarity Score: {score:.4f}"))
        except Exception as e:
            outputs.append(("Text-in-Image Search", f"[failed] {e}"))
    else:
        outputs.append(("Text-in-Image Search", "No search text entered."))

    # Strategy 3: Image-in-Image search (self similarity).
    # NOTE(review): this scores the image against a fixed caption, so it is a
    # baseline/sanity score rather than a comparison with a second image.
    try:
        inputs = clip_processor(
            text=["This is the same image"],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        with torch.no_grad():
            score = clip_model(**inputs).logits_per_image.item()
        outputs.append(("Image-in-Image Search", f"Self-similarity Score: {score:.4f}"))
    except Exception as e:
        outputs.append(("Image-in-Image Search", f"[failed] {e}"))

    # Strategy 4: Optical Character Recognition (OCR)
    try:
        pixel_values = ocr_processor(images=image, return_tensors="pt").pixel_values
        generated_ids = ocr_model.generate(pixel_values)
        extracted_text = ocr_processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]
        outputs.append(("OCR (Extracted Text)", extracted_text))
    except Exception as e:
        outputs.append(("OCR (Extracted Text)", f"[failed] {e}"))

    return outputs


# --- Gradio UI ---

with gr.Blocks() as demo:
    gr.Markdown("## 🤗 Multi-Strategy Hugging Face AI MVP")
    gr.Markdown("Upload an image → get captions, search for text inside it, "
                "compare images, and **extract text**. All strategies use "
                "Hugging Face free OSS models.")
    with gr.Row():
        img_input = gr.Image(type="pil", label="Upload Image")
        text_input = gr.Textbox(label="Search Text (optional)",
                                placeholder="e.g. cat, car, chip, war...")
    btn = gr.Button("Run Multi-Agent Analysis")
    out = gr.Dataframe(headers=["Strategy", "Result"], label="🔍 AI Outputs")
    btn.click(fn=multi_agent_inference, inputs=[img_input, text_input], outputs=out)

demo.launch()