# Hugging Face Space: multi-strategy image analysis demo (Gradio app).
import gradio as gr
from transformers import pipeline, CLIPProcessor, CLIPModel, VisionEncoderDecoderModel, TrOCRProcessor
from PIL import Image
import torch

# --- Hugging Face Models (free, open-source checkpoints; downloaded on first run) ---

# 1. Image-to-text captioning pipeline (ViT encoder + GPT-2 decoder).
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# 2. CLIP for image-text similarity scoring (used by two strategies below).
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 3. TrOCR for optical character recognition of printed text.
ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
| # --- Functions --- | |
def multi_agent_inference(image, search_text=""):
    """Run four independent analysis strategies on an image.

    Args:
        image: PIL image to analyze (Gradio passes a PIL image, or None
            when the user has not uploaded anything).
        search_text: optional text to score against the image with CLIP.

    Returns:
        A list of (strategy_name, result_string) tuples, one per strategy.
        A strategy that raises reports "[failed] <error>" in its row
        instead of aborting the whole run.
    """
    # Robustness: bail out early instead of letting all four strategies fail.
    if image is None:
        return [("Error", "No image provided.")]

    outputs = []

    # Strategy 1: Image -> Text captioning.
    try:
        cap = captioner(image)[0]["generated_text"]
        outputs.append(("Image Captioning", cap))
    except Exception as e:
        outputs.append(("Image Captioning", f"[failed] {e}"))

    # Strategy 2: Text search in image (CLIP image-text similarity).
    if search_text.strip():
        try:
            inputs = clip_processor(text=[search_text], images=image, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits_per_image = clip_model(**inputs).logits_per_image
            score = logits_per_image.item()
            outputs.append(("Text-in-Image Search", f"Similarity Score: {score:.4f}"))
        except Exception as e:
            outputs.append(("Text-in-Image Search", f"[failed] {e}"))
    else:
        outputs.append(("Text-in-Image Search", "No search text entered."))

    # Strategy 3: "Image-in-image" self similarity.
    # NOTE(review): this scores the image against the fixed caption
    # "This is the same image" — i.e. a text-image score, not a true
    # image-image comparison. Kept as-is to preserve behavior.
    try:
        inputs = clip_processor(text=["This is the same image"], images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            score = clip_model(**inputs).logits_per_image.item()
        outputs.append(("Image-in-Image Search", f"Self-similarity Score: {score:.4f}"))
    except Exception as e:
        outputs.append(("Image-in-Image Search", f"[failed] {e}"))

    # Strategy 4: OCR via TrOCR (printed text only, per the checkpoint).
    try:
        pixel_values = ocr_processor(images=image, return_tensors="pt").pixel_values
        generated_ids = ocr_model.generate(pixel_values)
        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        outputs.append(("OCR (Extracted Text)", extracted_text))
    except Exception as e:
        outputs.append(("OCR (Extracted Text)", f"[failed] {e}"))

    return outputs
# --- Gradio UI ---
# Builds the demo layout and wires the button to multi_agent_inference.
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Multi-Strategy Hugging Face AI MVP")
    gr.Markdown("Upload an image → get captions, search for text inside it, "
                "compare images, and **extract text**. All strategies use "
                "Hugging Face free OSS models.")

    with gr.Row():
        img_input = gr.Image(type="pil", label="Upload Image")
        text_input = gr.Textbox(label="Search Text (optional)",
                                placeholder="e.g. cat, car, chip, war...")

    btn = gr.Button("Run Multi-Agent Analysis")
    # One row per strategy: (strategy name, result string).
    out = gr.Dataframe(headers=["Strategy", "Result"], label="📊 AI Outputs")

    btn.click(fn=multi_agent_inference, inputs=[img_input, text_input], outputs=out)

demo.launch()