GermanySutherland committed on
Commit
57356c3
·
verified ·
1 Parent(s): 76a8cd7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -8
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import pipeline, CLIPProcessor, CLIPModel
3
  from PIL import Image
4
  import torch
5
 
@@ -11,6 +11,10 @@ captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captionin
11
  clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
12
  clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
13
 
 
 
 
 
14
  # --- Functions ---
15
  def multi_agent_inference(image, search_text=""):
16
  outputs = []
@@ -44,21 +48,29 @@ def multi_agent_inference(image, search_text=""):
44
  except Exception as e:
45
  outputs.append(("Image-in-Image Search", f"[failed] {e}"))
46
 
 
 
 
 
 
 
 
 
 
47
  return outputs
48
 
49
  # --- Gradio UI ---
50
  with gr.Blocks() as demo:
51
  gr.Markdown("## πŸ€— Multi-Strategy Hugging Face AI MVP")
52
- gr.Markdown("Upload an image β†’ get captions, search for text inside it, compare images. "
53
- "All strategies use Hugging Face free OSS models.")
54
-
55
  with gr.Row():
56
  img_input = gr.Image(type="pil", label="Upload Image")
57
- text_input = gr.Textbox(label="Search Text (optional)", placeholder="e.g. cat, car, chip, war...")
 
58
  btn = gr.Button("Run Multi-Agent Analysis")
59
-
60
  out = gr.Dataframe(headers=["Strategy", "Result"], label="πŸ” AI Outputs")
61
-
62
  btn.click(fn=multi_agent_inference, inputs=[img_input, text_input], outputs=out)
63
 
64
- demo.launch()
 
1
  import gradio as gr
2
+ from transformers import pipeline, CLIPProcessor, CLIPModel, VisionEncoderDecoderModel, TrOCRProcessor
3
  from PIL import Image
4
  import torch
5
 
 
11
  clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
12
  clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
13
 
14
+ # 3. New Strategy: Optical Character Recognition (OCR)
15
+ ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
16
+ ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
17
+
18
  # --- Functions ---
19
  def multi_agent_inference(image, search_text=""):
20
  outputs = []
 
48
  except Exception as e:
49
  outputs.append(("Image-in-Image Search", f"[failed] {e}"))
50
 
51
+ # Strategy 4: Optical Character Recognition (OCR)
52
+ try:
53
+ pixel_values = ocr_processor(images=image, return_tensors="pt").pixel_values
54
+ generated_ids = ocr_model.generate(pixel_values)
55
+ extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
56
+ outputs.append(("OCR (Extracted Text)", extracted_text))
57
+ except Exception as e:
58
+ outputs.append(("OCR (Extracted Text)", f"[failed] {e}"))
59
+
60
  return outputs
61
 
62
  # --- Gradio UI ---
63
  with gr.Blocks() as demo:
64
  gr.Markdown("## πŸ€— Multi-Strategy Hugging Face AI MVP")
65
+ gr.Markdown("Upload an image β†’ get captions, search for text inside it, "
66
+ "compare images, and **extract text**. All strategies use "
67
+ "Hugging Face free OSS models.")
68
  with gr.Row():
69
  img_input = gr.Image(type="pil", label="Upload Image")
70
+ text_input = gr.Textbox(label="Search Text (optional)",
71
+ placeholder="e.g. cat, car, chip, war...")
72
  btn = gr.Button("Run Multi-Agent Analysis")
 
73
  out = gr.Dataframe(headers=["Strategy", "Result"], label="πŸ” AI Outputs")
 
74
  btn.click(fn=multi_agent_inference, inputs=[img_input, text_input], outputs=out)
75
 
76
+ demo.launch()