Sanket17 commited on
Commit
3304bbb
·
verified ·
1 Parent(s): 5c1773f

Update app.py

Browse files
Files changed (1):
  1. app.py (+58 lines, -22 lines)
app.py CHANGED
@@ -1,30 +1,66 @@
from fastapi import FastAPI, UploadFile, File
from PIL import Image
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
import torch

# Initialize FastAPI
app = FastAPI()

# Load the model and processor once at import time so all requests share them.
MODEL_NAME = "microsoft/OmniParser-blip2-caption"
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVisualQuestionAnswering.from_pretrained(MODEL_NAME)
model.eval()  # inference only; disables dropout etc.


@app.get("/")
async def home():
    """Health-check / landing endpoint."""
    return {"message": "Welcome to OmniParser API!"}


@app.post("/predict/")
async def predict(file: UploadFile = File(...)):
    """Caption an uploaded image.

    Reads the uploaded file, runs the captioning model, and returns the
    decoded caption text as JSON.
    """
    # Read and preprocess the image; convert guards against palette/alpha modes.
    image = Image.open(file.file).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")

    # Bug fix: a caption is an autoregressive sequence — taking argmax over the
    # logits of a single forward pass does not decode one (and a bare forward
    # without decoder inputs typically errors on seq2seq caption models).
    # Use generate() and decode the produced token ids instead.
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return {"caption": caption}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from flask import Flask, request, jsonify
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import io
import torch
import base64
from ultralytics import YOLO
from utils import check_ocr_box, get_som_labeled_img  # Import utility functions

# Initialize Flask app
app = Flask(__name__)

# Bug fix: pick the device once — unconditionally calling .to('cuda')
# crashes with a RuntimeError on CPU-only hosts.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load YOLO detection model (region/icon detector weights)
yolo_model = YOLO('best.pt').to(device)

# Load Florence captioning model. fp16 halves GPU memory; many fp16 ops are
# unsupported or very slow on CPU, so fall back to fp32 there.
torch_dtype = torch.float16 if device == 'cuda' else torch.float32
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "weights/icon_caption_florence",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
).to(device)
# Bundle both pieces; downstream helpers expect this dict shape.
caption_model_processor = {'processor': processor, 'model': model}
@app.route('/predict', methods=['POST'])
def predict():
    """Decode a base64 image from the JSON body, run detection + captioning,
    and return the annotated image (base64 PNG), parsed content, and
    coordinates as JSON.

    Expected body: {"image": <base64 str>, "box_threshold": float,
    "iou_threshold": float}. Thresholds are optional and default to the
    values the upstream demo uses.
    """
    # Bug fix: the original indexed data['image'] / data['box_threshold']
    # unconditionally, turning any malformed request into an HTTP 500.
    data = request.get_json(silent=True)
    if not data or 'image' not in data:
        return jsonify({'error': "JSON body with an 'image' field is required"}), 400

    try:
        image_data = base64.b64decode(data['image'])
        image = Image.open(io.BytesIO(image_data))
    except Exception:
        # Covers both bad base64 and non-image payloads.
        return jsonify({'error': 'invalid base64 image payload'}), 400

    # Defaults keep existing clients (which always sent both keys) working.
    # NOTE(review): 0.05 / 0.1 match the common OmniParser demo defaults — confirm.
    box_threshold = data.get('box_threshold', 0.05)
    iou_threshold = data.get('iou_threshold', 0.1)

    # Process the image and get predictions
    result_image, parsed_content, coordinates = process(
        image, box_threshold, iou_threshold)

    # Encode the result image back to base64 for the JSON response
    buffered = io.BytesIO()
    result_image.save(buffered, format="PNG")
    result_image_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

    return jsonify({
        'result_image': result_image_str,
        'parsed_content': parsed_content,
        'coordinates': coordinates
    })
def process(image_input, box_threshold, iou_threshold):
    """Run OCR + YOLO set-of-marks labeling over *image_input*.

    Returns a tuple of (annotated PIL image, parsed content as a newline-joined
    string, label coordinates as a string).
    """
    import os  # local import: only needed for the directory guard below

    # Persist the input image because the downstream helpers take a file path.
    image_save_path = 'imgs/saved_image_demo.png'
    # Bug fix: PIL's save() does not create parent directories — on a fresh
    # checkout without an imgs/ folder this raised FileNotFoundError.
    os.makedirs(os.path.dirname(image_save_path), exist_ok=True)
    image_input.save(image_save_path)
    image = Image.open(image_save_path)

    # Scale annotation styling relative to a 3200px reference width so labels
    # stay readable on both small and large screenshots.
    box_overlay_ratio = image.size[0] / 3200
    draw_bbox_config = {
        'text_scale': 0.8 * box_overlay_ratio,
        'text_thickness': max(int(2 * box_overlay_ratio), 1),
        'text_padding': max(int(3 * box_overlay_ratio), 1),
        'thickness': max(int(3 * box_overlay_ratio), 1),
    }

    # OCR pass: word-level boxes in xyxy format.
    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
        image_save_path, display_img=False, output_bb_format='xyxy',
        goal_filtering=None,
        easyocr_args={'paragraph': False, 'text_threshold': 0.9},
        use_paddleocr=True)
    text, ocr_bbox = ocr_bbox_rslt

    # Detection + captioning pass; returns the annotated image as base64.
    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
        image_save_path, yolo_model, BOX_TRESHOLD=box_threshold,
        output_coord_in_ratio=True, ocr_bbox=ocr_bbox,
        draw_bbox_config=draw_bbox_config,
        caption_model_processor=caption_model_processor,
        ocr_text=text, iou_threshold=iou_threshold)

    result_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
    parsed_content_list = '\n'.join(parsed_content_list)

    return result_image, str(parsed_content_list), str(label_coordinates)
if __name__ == '__main__':
    # Security fix: debug=True enables the Werkzeug interactive debugger,
    # which permits arbitrary code execution — never combine it with
    # host='0.0.0.0' (all interfaces). Debug must be opted into locally.
    app.run(debug=False, host='0.0.0.0', port=5000)