import gradio as gr
from transformers import pipeline, CLIPProcessor, CLIPModel, VisionEncoderDecoderModel, TrOCRProcessor
from PIL import Image
import torch

# --- Hugging Face Models ---

# 1. Image-to-text (captioning)
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# 2. CLIP for image-text and image-image search
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 3. TrOCR for optical character recognition (OCR)
ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
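# Note: the trocr-base-printed checkpoint is trained on single-line printed
# text. For multi-line documents one would typically run a text detector first
# and feed each line crop to the model separately (not done in this sketch).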

# --- Functions ---

def multi_agent_inference(image, search_text=""):
    outputs = []

    # Strategy 1: Image → Text captioning
    try:
        cap = captioner(image)[0]["generated_text"]
        outputs.append(("Image Captioning", cap))
    except Exception as e:
        outputs.append(("Image Captioning", f"[failed] {e}"))

    # Strategy 2: Text search in image (CLIP similarity)
    if search_text.strip():
        try:
            inputs = clip_processor(text=[search_text], images=image, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits_per_image = clip_model(**inputs).logits_per_image
            score = logits_per_image.item()
            outputs.append(("Text-in-Image Search", f"Similarity Score: {score:.4f}"))
        except Exception as e:
            outputs.append(("Text-in-Image Search", f"[failed] {e}"))
    else:
        outputs.append(("Text-in-Image Search", "No search text entered."))

    # Strategy 3: Image-in-image search (placeholder). A true image-image
    # comparison would embed two images with get_image_features and take their
    # cosine similarity; with only one input image available, this scores the
    # image against the fixed caption "This is the same image" instead.
    try:
        inputs = clip_processor(text=["This is the same image"], images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            score = clip_model(**inputs).logits_per_image.item()
        outputs.append(("Image-in-Image Search", f"Self-similarity Score: {score:.4f}"))
    except Exception as e:
        outputs.append(("Image-in-Image Search", f"[failed] {e}"))

    # Strategy 4: Optical character recognition (OCR)
    try:
        pixel_values = ocr_processor(images=image, return_tensors="pt").pixel_values
        generated_ids = ocr_model.generate(pixel_values)
        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        outputs.append(("OCR (Extracted Text)", extracted_text))
    except Exception as e:
        outputs.append(("OCR (Extracted Text)", f"[failed] {e}"))

    return outputs
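
# A minimal sketch of calling the pipeline outside Gradio (the image path is
# hypothetical):
#   img = Image.open("example.jpg").convert("RGB")
#   for strategy, result in multi_agent_inference(img, "cat"):
#       print(f"{strategy}: {result}")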

# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("## 🤗 Multi-Strategy Hugging Face AI MVP")
    gr.Markdown("Upload an image → get captions, search for text inside it, "
                "compare images, and **extract text**. All strategies use "
                "Hugging Face free OSS models.")
    with gr.Row():
        img_input = gr.Image(type="pil", label="Upload Image")
        text_input = gr.Textbox(label="Search Text (optional)",
                                placeholder="e.g. cat, car, chip, war...")
    btn = gr.Button("Run Multi-Agent Analysis")
    out = gr.Dataframe(headers=["Strategy", "Result"], label="🔍 AI Outputs")
    btn.click(fn=multi_agent_inference, inputs=[img_input, text_input], outputs=out)
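
# On Hugging Face Spaces a bare launch() is enough; from a local machine one
# could pass e.g. demo.launch(share=True) to get a public link.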
demo.launch()