Sanket17 commited on
Commit
26966db
·
verified ·
1 Parent(s): afdc45a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -54
app.py CHANGED
@@ -1,62 +1,50 @@
1
- from fastapi import FastAPI, UploadFile, File
2
  from fastapi.responses import JSONResponse
3
- from transformers import AutoProcessor, AutoModelForCausalLM
4
  from PIL import Image
5
- import io
6
  import torch
7
- import base64
8
- from ultralytics import YOLO
9
- from utils import check_ocr_box, get_som_labeled_img # Import utility functions
10
 
 
11
app = FastAPI()

# Detection model: YOLO weights moved onto the GPU.
yolo_model = YOLO('best.pt').to('cuda')

# Captioning model: Florence processor + OmniParser caption weights (fp16, GPU).
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/OmniParser/icon_caption_florence", torch_dtype=torch.float16, trust_remote_code=True).to('cuda')

# Bundle processor and model so helpers can take a single argument.
caption_model_processor = {'processor': processor, 'model': model}
21
@app.post("/predict")
async def predict(image: UploadFile = File(...), box_threshold: float = 0.05, iou_threshold: float = 0.1):
    """Run detection + captioning on an uploaded image.

    Parameters:
    - image: uploaded image file.
    - box_threshold: detection confidence threshold passed to `process`.
    - iou_threshold: IoU threshold for box filtering passed to `process`.

    Returns a JSONResponse with the base64-encoded annotated image,
    the parsed content string, and the box coordinates string.
    """
    image_data = await image.read()
    # Fix: do not shadow the `image` UploadFile parameter with the PIL image.
    pil_image = Image.open(io.BytesIO(image_data))

    # Process the image and get predictions.
    result_image, parsed_content, coordinates = process(pil_image, box_threshold, iou_threshold)

    # Encode the annotated result image back to base64 for the JSON payload.
    buffered = io.BytesIO()
    result_image.save(buffered, format="PNG")
    result_image_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

    return JSONResponse(content={
        'result_image': result_image_str,
        'parsed_content': parsed_content,
        'coordinates': coordinates
    })
39
 
40
def process(image_input, box_threshold, iou_threshold):
    """Save the image to disk, run OCR + YOLO labeling, and return results.

    Parameters:
    - image_input: PIL image to analyze.
    - box_threshold: detection confidence threshold for `get_som_labeled_img`.
    - iou_threshold: IoU threshold for box de-duplication.

    Returns a tuple of (annotated PIL image, parsed content string,
    label coordinates string).
    """
    import os

    image_save_path = 'imgs/saved_image_demo.png'
    # Fix: PIL's save() raises if the target directory is missing.
    os.makedirs(os.path.dirname(image_save_path), exist_ok=True)
    image_input.save(image_save_path)
    image = Image.open(image_save_path)

    # Scale drawing parameters relative to the image width.
    box_overlay_ratio = image.size[0] / 3200
    draw_bbox_config = {
        'text_scale': 0.8 * box_overlay_ratio,
        'text_thickness': max(int(2 * box_overlay_ratio), 1),
        'text_padding': max(int(3 * box_overlay_ratio), 1),
        'thickness': max(int(3 * box_overlay_ratio), 1),
    }

    # OCR pass: collects text boxes (xyxy) for the labeling step.
    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img=False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9}, use_paddleocr=True)
    text, ocr_bbox = ocr_bbox_rslt

    # NOTE: BOX_TRESHOLD spelling is the callee's keyword — do not "fix" it here.
    dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox, draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text, iou_threshold=iou_threshold)

    # The labeled image comes back base64-encoded; decode it to a PIL image.
    result_image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
    parsed_content_list = '\n'.join(parsed_content_list)

    return result_image, str(parsed_content_list), str(label_coordinates)
 
1
from fastapi import FastAPI, UploadFile, Form
from fastapi.responses import JSONResponse
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
from PIL import Image
import torch
import uvicorn
import os

# FastAPI application instance.
app = FastAPI()

# Hugging Face access token read from the environment (secret section).
# NOTE(review): env var is spelled "HP_token" — confirm it isn't meant to be "HF_token".
hf_token = os.getenv("HP_token")

# Load the VQA processor and model with the token.
# NOTE(review): `use_auth_token` is deprecated in newer transformers releases
# in favor of `token` — confirm the pinned library version before changing.
processor = AutoProcessor.from_pretrained("Sanket17/hello", use_auth_token=hf_token)
model = AutoModelForVisualQuestionAnswering.from_pretrained("Sanket17/hello", use_auth_token=hf_token)
18
+
19
@app.post("/vqa/")
async def visual_question_answer(file: UploadFile, question: str = Form(...)):
    """
    Endpoint for visual question answering.

    Parameters:
    - file: uploaded image file
    - question: textual question about the image

    Returns JSON {"question": ..., "answer": ...} on success, or
    {"error": ...} with HTTP 500 on any failure.
    """
    try:
        # Load and normalize the uploaded image.
        image = Image.open(file.file).convert("RGB")

        # Preprocess image + question into model tensors.
        inputs = processor(images=image, text=question, return_tensors="pt")

        # Fix: inference only — disable autograd so no graph is built per request.
        with torch.no_grad():
            outputs = model(**inputs)

        # Highest-scoring index from the logits.
        answer = outputs.logits.argmax(dim=-1).item()

        # NOTE(review): decoding a logits argmax as a token id is only valid for
        # some checkpoints; classification-style VQA heads typically need
        # model.config.id2label[answer] instead — confirm for this model.
        answer_str = processor.decode([answer])

        return JSONResponse(content={"question": question, "answer": answer_str})

    except Exception as e:
        # Surface the failure to the client instead of crashing the worker.
        return JSONResponse(content={"error": str(e)}, status_code=500)
 
 
 
 
 
 
 
 
47
 
48
# Launch the ASGI server when this module is executed directly.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)