Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from transformers import pipeline, CLIPProcessor, CLIPModel
|
| 3 |
from PIL import Image
|
| 4 |
import torch
|
| 5 |
|
|
@@ -11,6 +11,10 @@ captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captionin
|
|
| 11 |
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 12 |
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# --- Functions ---
|
| 15 |
def multi_agent_inference(image, search_text=""):
|
| 16 |
outputs = []
|
|
@@ -44,21 +48,29 @@ def multi_agent_inference(image, search_text=""):
|
|
| 44 |
except Exception as e:
|
| 45 |
outputs.append(("Image-in-Image Search", f"[failed] {e}"))
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
return outputs
|
| 48 |
|
| 49 |
# --- Gradio UI ---
|
| 50 |
with gr.Blocks() as demo:
|
| 51 |
gr.Markdown("## 🤖 Multi-Strategy Hugging Face AI MVP")
|
| 52 |
-
gr.Markdown("Upload an image → get captions, search for text inside it,
|
| 53 |
-
"
|
| 54 |
-
|
| 55 |
with gr.Row():
|
| 56 |
img_input = gr.Image(type="pil", label="Upload Image")
|
| 57 |
-
text_input = gr.Textbox(label="Search Text (optional)",
|
|
|
|
| 58 |
btn = gr.Button("Run Multi-Agent Analysis")
|
| 59 |
-
|
| 60 |
out = gr.Dataframe(headers=["Strategy", "Result"], label="π AI Outputs")
|
| 61 |
-
|
| 62 |
btn.click(fn=multi_agent_inference, inputs=[img_input, text_input], outputs=out)
|
| 63 |
|
| 64 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from transformers import pipeline, CLIPProcessor, CLIPModel, VisionEncoderDecoderModel, TrOCRProcessor
|
| 3 |
from PIL import Image
|
| 4 |
import torch
|
| 5 |
|
|
|
|
| 11 |
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 12 |
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 13 |
|
| 14 |
+
# 3. New Strategy: Optical Character Recognition (OCR)
|
| 15 |
+
ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
|
| 16 |
+
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
|
| 17 |
+
|
| 18 |
# --- Functions ---
|
| 19 |
def multi_agent_inference(image, search_text=""):
|
| 20 |
outputs = []
|
|
|
|
| 48 |
except Exception as e:
|
| 49 |
outputs.append(("Image-in-Image Search", f"[failed] {e}"))
|
| 50 |
|
| 51 |
+
# Strategy 4: Optical Character Recognition (OCR)
|
| 52 |
+
try:
|
| 53 |
+
pixel_values = ocr_processor(images=image, return_tensors="pt").pixel_values
|
| 54 |
+
generated_ids = ocr_model.generate(pixel_values)
|
| 55 |
+
extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 56 |
+
outputs.append(("OCR (Extracted Text)", extracted_text))
|
| 57 |
+
except Exception as e:
|
| 58 |
+
outputs.append(("OCR (Extracted Text)", f"[failed] {e}"))
|
| 59 |
+
|
| 60 |
return outputs
|
| 61 |
|
| 62 |
# --- Gradio UI ---
|
| 63 |
with gr.Blocks() as demo:
|
| 64 |
gr.Markdown("## 🤖 Multi-Strategy Hugging Face AI MVP")
|
| 65 |
+
gr.Markdown("Upload an image → get captions, search for text inside it, "
|
| 66 |
+
"compare images, and **extract text**. All strategies use "
|
| 67 |
+
"Hugging Face free OSS models.")
|
| 68 |
with gr.Row():
|
| 69 |
img_input = gr.Image(type="pil", label="Upload Image")
|
| 70 |
+
text_input = gr.Textbox(label="Search Text (optional)",
|
| 71 |
+
placeholder="e.g. cat, car, chip, war...")
|
| 72 |
btn = gr.Button("Run Multi-Agent Analysis")
|
|
|
|
| 73 |
out = gr.Dataframe(headers=["Strategy", "Result"], label="π AI Outputs")
|
|
|
|
| 74 |
btn.click(fn=multi_agent_inference, inputs=[img_input, text_input], outputs=out)
|
| 75 |
|
| 76 |
+
demo.launch()
|