AkshayDS26 committed on
Commit
eb6dc77
·
1 Parent(s): 4e43fda

Add application file

Browse files
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor
6
+
7
+
8
# Hugging Face Hub repository that holds the fine-tuned checkpoint.
MODEL_ID = "AkshaySiraswar/Florence-2-FT-DocVQA"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the processor and the model once at import time so that every
# request reuses the same objects.
# trust_remote_code=True executes Python code shipped in the model repository,
# so MODEL_ID must point at a repository you trust.
# NOTE(review): force_download=True re-fetches the processor files on every
# start-up instead of using the local cache -- confirm this is intentional.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    force_download=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
).to(DEVICE)
12
def generate_response(image, question):
    """Answer a question about an image using the module-level model.

    Args:
        image: PIL image delivered by the Gradio image input, or None when
            nothing has been uploaded.
        question: Free-text question about the image.

    Returns:
        The first decoded sequence produced by the model, or a human-readable
        message when an input is missing or processing fails.
    """
    # Empty UI components arrive as None / "". Answer with a clear prompt
    # instead of leaking an AttributeError message to the user.
    if image is None:
        return "Please upload an image before asking a question."
    if not question or not question.strip():
        return "Please enter a question about the image."

    try:
        if image.mode != "RGB":
            image = image.convert("RGB")

        inputs = processor(text=question, images=image, return_tensors="pt")

        # The model is moved to its device once at load time; send the inputs
        # to wherever it lives rather than re-moving the model per request.
        device = model.device
        inputs = {key: value.to(device) for key, value in inputs.items()}

        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_length=1024,
            num_beams=3,
            early_stopping=True,
        )

        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # UI boundary: report the failure as text in the response box rather
        # than raising into the Gradio handler.
        return f"Error processing image: {e}"
35
+
36
# Example (image path, question) rows shown in the UI; every row reuses the
# same demo image (update the path as needed).
examples = [
    ["demo.jpg", prompt]
    for prompt in (
        "what is the address in the page?",
        "what is the phone number?",
        "what is the email address?",
    )
]

# Gradio interface: an image plus a question in, a single text answer out.
image_input = gr.Image(type="pil")
question_input = gr.Textbox(label="Question")
answer_output = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=generate_response,
    inputs=[image_input, question_input],
    outputs=answer_output,
    examples=examples,
    title="Image to Text Extractor",
    description=(
        "Upload an image and provide a question. "
        "This tool will extract the relevant information from the image "
        "based on your question."
    ),
)

# Launch the interface
iface.launch()