Spaces:
Configuration error
Configuration error
Upload 51 files
Browse files
src/components/MultiSourceCaptioningView.tsx
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import { useState, useRef, useEffect } from "react";
|
| 2 |
import { useVLMContext } from "../context/useVLMContext";
|
| 3 |
-
import {
|
| 4 |
|
| 5 |
// Fixed list of mode names; `as const` keeps each entry as a string-literal type.
const MODES = ["Webcam", "URL", "File"] as const;
|
| 6 |
// Union of the MODES entries: "Webcam" | "URL" | "File".
type Mode = typeof MODES[number];
|
|
@@ -8,53 +8,6 @@ type Mode = typeof MODES[number];
|
|
| 8 |
// Root-relative path of a sample video. NOTE(review): presumably the default source for one of the modes — confirm against usage.
const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
|
| 9 |
// Sample prompt text: asks for bird detections as a JSON array of objects with a
// string `label` and a `bbox_2d` of [x1, y1, x2, y2] in pixel values.
const EXAMPLE_PROMPT = "Detect all birds in the image. For each bird, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"bird\", \"bbox_2d\": [x1, y1, x2, y2]}]";
|
| 10 |
|
| 11 |
-
/**
 * Expands a "flat" detection array of the form `[label, bbox, bbox, ...]`
 * into one `{ label, bbox_2d }` object per bounding box.
 *
 * The input is recognised only when its first element is a string and its
 * second element is an array; any other shape yields an empty list.
 *
 * Entries after the label are kept only when they are arrays of exactly four
 * finite numbers, so malformed items (strings, short arrays, nested values)
 * are dropped rather than returned under a `number[]` type they do not satisfy.
 *
 * @param arr - Candidate flat array, typically parsed from model output.
 * @returns One entry per valid box, all sharing the leading label.
 */
function parseFlatBoxArray(arr: readonly unknown[]): { label: string, bbox_2d: number[] }[] {
  const [first, ...rest] = arr;
  if (typeof first !== "string" || !Array.isArray(rest[0])) {
    return [];
  }
  const label: string = first;

  // A usable box is exactly four finite numbers: [x1, y1, x2, y2].
  const isBox = (value: unknown): value is number[] =>
    Array.isArray(value) &&
    value.length === 4 &&
    value.every((v) => typeof v === "number" && Number.isFinite(v));

  return rest.filter(isBox).map((bbox_2d) => ({ label, bbox_2d }));
}
|
| 18 |
-
|
| 19 |
-
/**
 * Coerces a loosely structured detection payload into a uniform list of
 * `{ label, bbox_2d }` objects.
 *
 * Accepted shapes for `raw`, checked in this order:
 *  1. an object whose `image` property is an array — that array supplies the candidates;
 *  2. an array — its elements are the candidates;
 *  3. any other non-null object — treated as a single candidate.
 * Every other value (including `null`, `undefined`, and primitives) yields `[]`.
 *
 * A candidate is kept only when it is an object whose `bbox_2d` is either
 * four finite numbers, or two two-number pairs (`[[a, b], [c, d]]`), which
 * are flattened to `[a, b, c, d]`. All other candidates are skipped.
 *
 * Extra properties on a kept candidate are copied through. `label` is always
 * emitted as a string: strings are kept, numbers are stringified, and anything
 * else becomes `""`, so the result honours the declared return type.
 *
 * @param raw - Parsed model output of unknown shape.
 * @returns The validated boxes, in input order.
 */
function normalizeBoxes(raw: unknown): { label: string, bbox_2d: number[] }[] {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;

  const isNumberList = (value: unknown, length: number): value is number[] =>
    Array.isArray(value) &&
    value.length === length &&
    value.every((v) => typeof v === "number" && Number.isFinite(v));

  // Work out the candidate list; the `image` wrapper takes precedence.
  const image = isRecord(raw) ? raw.image : undefined;
  let candidates: unknown[] = [];
  if (Array.isArray(image)) {
    candidates = image;
  } else if (Array.isArray(raw)) {
    candidates = raw;
  } else if (isRecord(raw)) {
    candidates = [raw];
  }

  const boxes: { label: string, bbox_2d: number[] }[] = [];
  for (const candidate of candidates) {
    if (!isRecord(candidate)) continue;

    let bbox: unknown = candidate.bbox_2d;

    // Flatten the nested form [[a, b], [c, d]] to [a, b, c, d].
    if (Array.isArray(bbox) && bbox.length === 2) {
      const [firstPair, secondPair]: unknown[] = bbox;
      if (isNumberList(firstPair, 2) && isNumberList(secondPair, 2)) {
        bbox = [...firstPair, ...secondPair];
      }
    }

    // Anything that is not four finite numbers by now is unusable.
    if (!isNumberList(bbox, 4)) continue;

    const rawLabel = candidate.label;
    let label = "";
    if (typeof rawLabel === "string") {
      label = rawLabel;
    } else if (typeof rawLabel === "number") {
      label = String(rawLabel);
    }

    boxes.push({ ...candidate, label, bbox_2d: bbox });
  }
  return boxes;
}
|
| 57 |
-
|
| 58 |
/**
 * Reports whether a file's declared MIME type begins with "image/".
 *
 * @param file - File whose `type` string is inspected.
 * @returns `true` when `file.type` starts with "image/", otherwise `false`.
 */
function isImageFile(file: File) {
  const { type: mimeType } = file;
  return mimeType.startsWith("image/");
}
|
|
|
|
| 1 |
import { useState, useRef, useEffect } from "react";
|
| 2 |
import { useVLMContext } from "../context/useVLMContext";
|
| 3 |
+
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
| 4 |
|
| 5 |
// Fixed list of mode names; `as const` keeps each entry as a string-literal type.
const MODES = ["Webcam", "URL", "File"] as const;
|
| 6 |
// Union of the MODES entries: "Webcam" | "URL" | "File".
type Mode = typeof MODES[number];
|
|
|
|
| 8 |
// Root-relative path of a sample video. NOTE(review): presumably the default source for one of the modes — confirm against usage.
const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
|
| 9 |
// Sample prompt text: asks for bird detections as a JSON array of objects with a
// string `label` and a `bbox_2d` of [x1, y1, x2, y2] in pixel values.
const EXAMPLE_PROMPT = "Detect all birds in the image. For each bird, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"bird\", \"bbox_2d\": [x1, y1, x2, y2]}]";
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
/**
 * Reports whether a file's declared MIME type begins with "image/".
 *
 * @param file - File whose `type` string is inspected.
 * @returns `true` when `file.type` starts with "image/", otherwise `false`.
 */
function isImageFile(file: File) {
  const { type: mimeType } = file;
  return mimeType.startsWith("image/");
}
|