qqc1989 committed on
Commit 40616d4 · verified
1 Parent(s): 687916d

Upload 21 files
.gitattributes CHANGED
@@ -34,3 +34,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  *.axmodel filter=lfs diff=lfs merge=lfs -text
+ football.jpg filter=lfs diff=lfs merge=lfs -text
+ install/bin/axcl_aarch64/test_detect_by_text filter=lfs diff=lfs merge=lfs -text
+ install/bin/axcl_x86/test_detect_by_text filter=lfs diff=lfs merge=lfs -text
+ install/bin/host_650/test_detect_by_text filter=lfs diff=lfs merge=lfs -text
+ install/lib/axcl_aarch64/libyoloworld.so filter=lfs diff=lfs merge=lfs -text
+ install/lib/axcl_x86/libyoloworld.so filter=lfs diff=lfs merge=lfs -text
+ install/lib/host_650/libyoloworld.so filter=lfs diff=lfs merge=lfs -text
+ pyyoloworld/gardio_example.jpg filter=lfs diff=lfs merge=lfs -text
+ result.png filter=lfs diff=lfs merge=lfs -text
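The patterns above decide which paths Git routes through LFS. A rough way to sanity-check them (an illustrative sketch using Python's `fnmatch`; `tracked_by_lfs` and `lfs_patterns` are hypothetical names, and `fnmatch` only approximates gitattributes matching rules):

```python
from fnmatch import fnmatch

# A subset of the patterns from the .gitattributes additions above
lfs_patterns = ["*.axmodel", "football.jpg", "install/lib/axcl_aarch64/libyoloworld.so"]

def tracked_by_lfs(path: str) -> bool:
    # gitattributes matches patterns without '/' against the basename
    name = path.rsplit("/", 1)[-1]
    return any(
        fnmatch(path, p) if "/" in p else fnmatch(name, p)
        for p in lfs_patterns
    )

print(tracked_by_lfs("models/yolo_u16_ax650.axmodel"))  # True
print(tracked_by_lfs("README.md"))                      # False
```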
README.md CHANGED
@@ -1,3 +1,155 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ language:
+ - en
+ - zh
+ tags:
+ - YOLO World
+ ---
+
+ # YOLOWorld
+
+ This SDK enables efficient open-vocabulary object detection with YOLO-Worldv2 Large, optimized for Axera's NPU-based SoC platforms, including the AX650, AX630C, and AX8850 series, as well as Axera's dedicated AI accelerator.
+
+ ## Reference links
+
+ If you are interested in model conversion, you can export an axmodel yourself via:
+
+ - [The yoloworld.axera open-source GitHub repo](https://github.com/AXERA-TECH/yoloworld.axera)
+ - [How to convert the YOLO-World models](https://github.com/AXERA-TECH/ONNX-YOLO-World-Open-Vocabulary-Object-Detection)
+ - [Pulsar2 docs: how to convert ONNX to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/pulsar2/introduction.html)
+
+ ## Support Platform
+
+ - AX650
+   - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
+   - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
+ - AX630C
+   - [爱芯派2](https://axera-pi-2-docs-cn.readthedocs.io/zh-cn/latest/index.html)
+   - [Module-LLM](https://docs.m5stack.com/zh_CN/module/Module-LLM)
+   - [LLM630 Compute Kit](https://docs.m5stack.com/zh_CN/core/LLM630%20Compute%20Kit)
+
+ ## Performance
+
+ | Model | Input Shape | Latency (ms) | CMM Usage (MB) |
+ |-------|-------------|--------------|----------------|
+ | yolo_u16_ax650.axmodel | 1 x 640 x 640 x 3 | 9.522 | 21 |
+ | clip_b1_u16_ax650.axmodel | 1 x 77 | 2.997 | 137 |
+ | yolo_u16_ax630c.axmodel | 1 x 640 x 640 x 3 | 43.450 | 31 |
+ | clip_b1_u16_ax630c.axmodel | 1 x 77 | 10.703 | 134 |
+
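A back-of-the-envelope conversion from the latencies in the table above to single-stream throughput (NPU inference time only; pre/post-processing and host-device transfer are ignored, so these are upper bounds):

```python
# Latencies (ms) taken from the performance table above
latencies_ms = {
    "yolo_u16_ax650.axmodel": 9.522,
    "yolo_u16_ax630c.axmodel": 43.450,
}

# Single-stream FPS upper bound = 1000 / latency_ms
for name, ms in latencies_ms.items():
    print(f"{name}: ~{1000.0 / ms:.1f} FPS")  # ~105.0 and ~23.0 respectively
```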
+ ## How to use
+
+ Download all files from this repository to the device:
+
+ ```
+ (py312) axera@raspberrypi:~/samples/yoloworldv2 $ tree
+ .
+ ├── config.json
+ ├── football.jpg
+ ├── install
+ │   ├── bin
+ │   │   ├── axcl_aarch64
+ │   │   │   └── test_detect_by_text
+ │   │   ├── axcl_x86
+ │   │   │   └── test_detect_by_text
+ │   │   └── host_650
+ │   │       └── test_detect_by_text
+ │   └── lib
+ │       ├── axcl_aarch64
+ │       │   └── libyoloworld.so
+ │       ├── axcl_x86
+ │       │   └── libyoloworld.so
+ │       └── host_650
+ │           └── libyoloworld.so
+ ├── models
+ │   ├── clip_b1_u16_ax630c.axmodel
+ │   ├── clip_b1_u16_ax650.axmodel
+ │   ├── yolo_u16_ax630c.axmodel
+ │   └── yolo_u16_ax650.axmodel
+ ├── pyyoloworld
+ │   ├── example.py
+ │   ├── gardio_example.jpg
+ │   ├── gradio_example.py
+ │   ├── libyoloworld.so
+ │   ├── pyaxdev.py
+ │   ├── __pycache__
+ │   │   ├── pyaxdev.cpython-312.pyc
+ │   │   └── pyyoloworld.cpython-312.pyc
+ │   ├── pyyoloworld.py
+ │   └── requirements.txt
+ ├── README.md
+ └── vocab.txt
+
+ 13 directories, 23 files
+ ```
+
+ ### Python environment requirements
+
+ ```
+ pip install -r pyyoloworld/requirements.txt
+ ```
+
+ #### Inference on an AX650 host, such as M4N-Dock(爱芯派Pro)
+
+ TODO
+
+ #### Inference with an M.2 Accelerator card
+
+ [What is the M.2 Accelerator card?](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html) This demo runs on a Raspberry Pi 5.
+
+ ```
+ (py312) axera@raspberrypi:~/samples/yoloworldv2-new.hg $ export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libstdc++.so.6
+ (py312) axera@raspberrypi:~/samples/yoloworldv2-new.hg $ cp install/lib/axcl_aarch64/libyoloworld.so pyyoloworld/
+ (py312) axera@raspberrypi:~/samples/yoloworldv2-new.hg $ cd pyyoloworld/
+ (py312) axera@raspberrypi:~/samples/yoloworldv2-new.hg/pyyoloworld $ python gradio_example.py --yoloworld ../models/yolo_u16_ax650.axmodel --tenc ../models/clip_b1_u16_ax650.axmodel --vocab ../vocab.txt
+ Trying to load: /home/axera/samples/yoloworldv2-new.hg/pyyoloworld/aarch64/libyoloworld.so
+ ✅ Successfully loaded: /home/axera/samples/yoloworldv2-new.hg/pyyoloworld/libyoloworld.so
+ [I][ run][ 31]: AXCLWorker start with devid 0
+
+ input size: 2
+     name: images [unknown] [unknown]
+         1 x 640 x 640 x 3 size: 1228800
+
+     name: txt_feats [unknown] [unknown]
+         1 x 4 x 512 size: 8192
+
+ output size: 3
+     name: stride8
+         1 x 80 x 80 x 68 size: 1740800
+
+     name: stride16
+         1 x 40 x 40 x 68 size: 435200
+
+     name: stride32
+         1 x 20 x 20 x 68 size: 108800
+
+ [I][ yw_create][ 408]: num_classes: 4, num_features: 512, input w: 640, h: 640
+ is_output_nhwc: 1
+
+ input size: 1
+     name: text_token [unknown] [unknown]
+         1 x 77 size: 308
+
+ output size: 1
+     name: 2202
+         1 x 1 x 512 size: 2048
+
+ [I][ load_text_encoder][ 44]: text feature len 512
+ [I][ load_tokenizer][ 60]: text token len 77
+ * Running on local URL: http://0.0.0.0:7860
+ * To create a public link, set `share=True` in `launch()`.
+ ```
+
+ If your Raspberry Pi 5's IP address is, say, 192.168.1.100, open `http://192.168.1.100:7860` in your browser to use the web app.
+
+ Input: `man`, `shoes`, `ball`, `person` and the test image
+
+ ![](./football.jpg)
+
+ Result:
+
+ ![](result.png)
config.json ADDED
File without changes
football.jpg ADDED

Git LFS Details

  • SHA256: e7c4b752ef447bfec409888cea8709be15c01d0f6bf91bd16b7762deb90950dc
  • Pointer size: 131 Bytes
  • Size of remote file: 325 kB
install/bin/axcl_aarch64/test_detect_by_text ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0be75d1a0ac72f7b9081c6f7ac5c20dfc0e9d11d1996fcc6c8de3e26a6b281db
+ size 157416
install/bin/axcl_x86/test_detect_by_text ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c4a8aa10ee6141d1931f83258eb7f84d819eb81b4b7d608fd436a2577d0f5fd
+ size 112048
install/bin/host_650/test_detect_by_text ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71f4a149a398db124b3e75010b1b672a935d093f2eb5772d459a4f248eebd665
+ size 5925168
install/lib/axcl_aarch64/libyoloworld.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:983c341755649bffd7fd35675bf114a29938e09204aeec1d3ca5cd32f20ddaab
+ size 1179736
install/lib/axcl_x86/libyoloworld.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d07d6349940db32177b87bf6fddd476b838ef26ab3184f7869e710ec7a39f7c
+ size 1155448
install/lib/host_650/libyoloworld.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66e0b7b2f6ebf92538751dd00c192e1675752689a98fb6adf9cf4ecbae9daf41
+ size 4373192
models/clip_b1_u16_ax630c.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:980f80dd17847b7db685e66bc0ddfaa00e5bfff56b05cb3467e6da8058d6b9c7
+ size 140712067
models/clip_b1_u16_ax650.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22afd07e0cbc8ca35be930aa171b37feaa5653d0e01402e1c35bdec8dee5da32
+ size 143852095
models/yolo_u16_ax630c.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8519e96fb61801e1bdd186547a8a32d1e9e10e94d5365f4d6b72bee63d0927cb
+ size 14722509
models/yolo_u16_ax650.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9304f60c84fb06db8a0cc9742d7280e7e41cc4f4d7f516ab335863d9da873c3c
+ size 14161499
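Each binary and model above is checked in as a Git LFS pointer file with the three-line `version`/`oid`/`size` format shown. A small parser sketch (`parse_lfs_pointer` is an illustrative helper, not part of this SDK), using the yolo_u16_ax650 pointer from just above:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a git-lfs pointer file into its key/value fields."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:9304f60c84fb06db8a0cc9742d7280e7e41cc4f4d7f516ab335863d9da873c3c
size 14161499"""

info = parse_lfs_pointer(pointer)
print(info["size"])  # 14161499
```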
pyyoloworld/example.py ADDED
@@ -0,0 +1,62 @@
+ import argparse
+ import cv2
+ from pyaxdev import enum_devices, sys_init, sys_deinit, AxDeviceType
+ from pyyoloworld import YOLOWORLD
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--yoloworld', type=str, default='cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel')
+     parser.add_argument('--tenc', type=str, default='cnclip/cnclip_vit_l14_336px_text_u16.axmodel')
+     parser.add_argument('--vocab', type=str, default='cnclip/cn_vocab.txt')
+     parser.add_argument('--image', type=str)
+     args = parser.parse_args()
+
+     # Enumerate devices
+     devices_info = enum_devices()
+     print("Available devices:", devices_info)
+     if devices_info['host']['available']:
+         print("host device available")
+         sys_init(AxDeviceType.host_device, -1)
+     elif devices_info['devices']['count'] > 0:
+         print("axcl device available, use device-0")
+         sys_init(AxDeviceType.axcl_device, 0)
+     else:
+         raise Exception("No available device")
+
+     try:
+         # Create a YOLOWORLD instance
+         yw = YOLOWORLD({
+             'text_encoder_path': args.tenc,
+             'tokenizer_path': args.vocab,
+             'yoloworld_path': args.yoloworld,
+         })
+
+         yw.set_classes(["person", "dog", "car", "horse"])
+
+         img = cv2.imread(args.image)
+         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+         results = yw.detect(img)
+         print(results)
+         img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+         for result in results:
+             x = result['x']
+             y = result['y']
+             w = result['w']
+             h = result['h']
+             conf = result['score']
+             class_id = result['label']
+             cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
+             cv2.putText(img, f"{class_id}: {conf:.2f}", (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
+         cv2.imwrite('result.jpg', img)
+
+     finally:
+         # Deinitialize the system
+         if devices_info['host']['available']:
+             sys_deinit(AxDeviceType.host_device, -1)
+         elif devices_info['devices']['count'] > 0:
+             sys_deinit(AxDeviceType.axcl_device, 0)
pyyoloworld/gardio_example.jpg ADDED

Git LFS Details

  • SHA256: 71aa61d84008c3443a927e516f57826aa2595bdeabda7cf2de353c5a84553c0b
  • Pointer size: 131 Bytes
  • Size of remote file: 413 kB
pyyoloworld/gradio_example.py ADDED
@@ -0,0 +1,103 @@
+ import colorsys
+ import gradio as gr
+ import cv2
+ from pyaxdev import enum_devices, sys_init, sys_deinit, AxDeviceType
+ from pyyoloworld import YOLOWORLD
+ import numpy as np
+ from PIL import Image
+ import argparse
+ import random
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--yoloworld', type=str, default='cnclip/cnclip_vit_l14_336px_vision_u16u8.axmodel')
+ parser.add_argument('--tenc', type=str, default='cnclip/cnclip_vit_l14_336px_text_u16.axmodel')
+ parser.add_argument('--vocab', type=str, default='cnclip/cn_vocab.txt')
+ args = parser.parse_args()
+
+ # ========== Model and device initialization ==========
+ devices_info = enum_devices()
+ if devices_info['host']['available']:
+     sys_init(AxDeviceType.host_device, -1)
+     device_type = AxDeviceType.host_device
+     device_id = -1
+ elif devices_info['devices']['count'] > 0:
+     sys_init(AxDeviceType.axcl_device, 0)
+     device_type = AxDeviceType.axcl_device
+     device_id = 0
+ else:
+     raise Exception("No available device")
+
+ yw = YOLOWORLD({
+     'text_encoder_path': args.tenc,
+     'tokenizer_path': args.vocab,
+     'yoloworld_path': args.yoloworld,
+ })
+
+ def generate_vivid_colors(n):
+     colors = []
+     for i in range(n):
+         # Evenly spaced hues, with saturation and value kept high
+         h = i / n
+         s = 0.9 + random.random() * 0.1  # saturation 0.9~1.0
+         v = 0.9 + random.random() * 0.1  # value 0.9~1.0
+         r, g, b = colorsys.hsv_to_rgb(h, s, v)
+         colors.append((int(r * 255), int(g * 255), int(b * 255)))
+     return colors
+
+ colors = generate_vivid_colors(4)
+
+ # ========== Inference function ==========
+ def detect_image(image, class1, class2, class3, class4, threshold):
+     if image is None:
+         return None
+     class_list = [class1, class2, class3, class4]
+     if not any(class_list):
+         return image  # no classes set, skip detection
+
+     yw.set_classes(class_list)
+     yw.set_threshold(threshold)
+
+     # Convert to RGB
+     img = np.array(image.convert('RGB'))  # PIL -> np.ndarray
+     results = yw.detect(img)
+
+     # Visualization
+     for result in results:
+         x, y, w, h = result['x'], result['y'], result['w'], result['h']
+         conf = result['score']
+         label = result['label']
+         cv2.rectangle(img, (x, y), (x + w, y + h), colors[label], 3)
+         cv2.putText(img, f"{class_list[label]}: {conf:.2f}", (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 2, colors[label], 3)
+
+     return Image.fromarray(img)  # return a PIL image
+
+ # ========== Number of class textboxes ==========
+ NUM_CLASSES = 4  # adjustable number of input boxes
+
+ # ========== Build the Gradio UI ==========
+ with gr.Blocks() as demo:
+     gr.Markdown("# YOLOWORLD Image Detection Demo")
+
+     with gr.Row():
+         with gr.Column():
+             class1 = gr.Textbox(label="Class 0", value="person")
+             class2 = gr.Textbox(label="Class 1", value="dog")
+             class3 = gr.Textbox(label="Class 2", value="car")
+             class4 = gr.Textbox(label="Class 3", value="horse")
+
+             threshold_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="Threshold")
+             image_input = gr.Image(type="pil", label="Input image", height=415)
+         with gr.Column():
+             detect_button = gr.Button("Detect")
+             image_output = gr.Image(type="pil", label="Detection result", height=800)
+
+     # Bind the click event
+     detect_button.click(
+         fn=detect_image,
+         inputs=[image_input, class1, class2, class3, class4, threshold_slider],
+         outputs=image_output
+     )
+
+ # ========== Launch ==========
+ demo.launch(server_name="0.0.0.0")
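`generate_vivid_colors` above spreads hues evenly around the HSV wheel and jitters saturation/value randomly. A deterministic variant (`vivid_colors` is an illustrative rename with the random jitter replaced by full saturation and value) behaves like this:

```python
import colorsys

def vivid_colors(n: int):
    # Evenly spaced hues, full saturation and value, as RGB byte triples
    colors = []
    for i in range(n):
        r, g, b = colorsys.hsv_to_rgb(i / n, 1.0, 1.0)
        colors.append((int(r * 255), int(g * 255), int(b * 255)))
    return colors

print(vivid_colors(4))  # first color is hue 0, i.e. pure red (255, 0, 0)
```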
pyyoloworld/pyaxdev.py ADDED
@@ -0,0 +1,149 @@
+ import ctypes
+ import os
+ import platform
+
+ lib_name = 'libyoloworld.so'
+
+ def check_error(code: int):
+     if code != 0:
+         raise Exception(f"API error: {code}")
+
+ base_dir = os.path.dirname(__file__)
+ arch = platform.machine()
+
+ if arch == 'x86_64':
+     arch_dir = 'x86_64'
+ elif arch in ('aarch64', 'arm64'):
+     arch_dir = 'aarch64'
+ else:
+     raise RuntimeError(f"Unsupported architecture: {arch}")
+
+ lib_paths = [
+     os.path.join(base_dir, arch_dir, lib_name),
+     os.path.join(base_dir, lib_name)
+ ]
+
+ last_error = None
+ diagnostic_shown = set()
+
+ for lib_path in lib_paths:
+     try:
+         print(f"Trying to load: {lib_path}")
+         _lib = ctypes.CDLL(lib_path)
+         print(f"✅ Successfully loaded: {lib_path}")
+         break
+     except OSError as e:
+         last_error = e
+         err_str = str(e)
+         print(f"\n❌ Failed to load: {lib_path}")
+         print(f"   {err_str}")
+
+         # Show each diagnostic tip only once
+         if "GLIBCXX" in err_str and "not found" in err_str:
+             if "missing_glibcxx" not in diagnostic_shown:
+                 diagnostic_shown.add("missing_glibcxx")
+                 print("🔍 Detected missing GLIBCXX version in libstdc++.so.6")
+                 print("💡 This usually happens when your environment (like Conda) uses an older libstdc++")
+                 print("👉 Try running with the system libstdc++ preloaded:")
+                 print(f"   export LD_PRELOAD=/usr/lib/{arch_dir}-linux-gnu/libstdc++.so.6\n")
+         elif "No such file" in err_str:
+             if "file_not_found" not in diagnostic_shown:
+                 diagnostic_shown.add("file_not_found")
+                 print("🔍 File not found. Please verify that libyoloworld.so exists and the path is correct.\n")
+         elif "wrong ELF class" in err_str:
+             if "elf_mismatch" not in diagnostic_shown:
+                 diagnostic_shown.add("elf_mismatch")
+                 print("🔍 ELF class mismatch: likely an architecture conflict (e.g., loading an x86_64 .so on aarch64).")
+                 print(f"👉 Run `file {lib_path}` to verify the binary architecture.\n")
+         else:
+             if "generic_error" not in diagnostic_shown:
+                 diagnostic_shown.add("generic_error")
+                 print("📎 Tip: Use `ldd` to inspect missing dependencies:")
+                 print(f"   ldd {lib_path}\n")
+ else:
+     raise RuntimeError(f"\n❗ Failed to load libyoloworld.so.\nLast error:\n{last_error}")
+
+ # Enum type
+ class AxDeviceType(ctypes.c_int):
+     unknown_device = 0
+     host_device = 1
+     axcl_device = 2
+
+ # Structs mirroring the C API
+ class AxMemInfo(ctypes.Structure):
+     _fields_ = [
+         ('remain', ctypes.c_int),
+         ('total', ctypes.c_int)
+     ]
+
+ class AxHostInfo(ctypes.Structure):
+     _fields_ = [
+         ('available', ctypes.c_char),
+         ('version', ctypes.c_char * 32),
+         ('mem_info', AxMemInfo)
+     ]
+
+ class AxDeviceInfo(ctypes.Structure):
+     _fields_ = [
+         ('temp', ctypes.c_int),
+         ('cpu_usage', ctypes.c_int),
+         ('npu_usage', ctypes.c_int),
+         ('mem_info', AxMemInfo)
+     ]
+
+ class AxDevices(ctypes.Structure):
+     _fields_ = [
+         ('host', AxHostInfo),
+         ('host_version', ctypes.c_char * 32),
+         ('dev_version', ctypes.c_char * 32),
+         ('count', ctypes.c_ubyte),
+         ('devices_info', AxDeviceInfo * 16)
+     ]
+
+ _lib.ax_dev_enum_devices.argtypes = [ctypes.POINTER(AxDevices)]
+ _lib.ax_dev_enum_devices.restype = ctypes.c_int
+
+ _lib.ax_dev_sys_init.argtypes = [AxDeviceType, ctypes.c_char]
+ _lib.ax_dev_sys_init.restype = ctypes.c_int
+
+ _lib.ax_dev_sys_deinit.argtypes = [AxDeviceType, ctypes.c_char]
+ _lib.ax_dev_sys_deinit.restype = ctypes.c_int
+
+ def enum_devices():
+     devices = AxDevices()
+     check_error(_lib.ax_dev_enum_devices(ctypes.byref(devices)))
+
+     return {
+         'host': {
+             'available': bool(devices.host.available[0]),
+             'version': devices.host.version.decode('utf-8'),
+             'mem_info': {
+                 'remain': devices.host.mem_info.remain,
+                 'total': devices.host.mem_info.total
+             }
+         },
+         'devices': {
+             'host_version': devices.host_version.decode('utf-8'),
+             'dev_version': devices.dev_version.decode('utf-8'),
+             'count': devices.count,
+             'devices_info': [{
+                 'temp': dev.temp,
+                 'cpu_usage': dev.cpu_usage,
+                 'npu_usage': dev.npu_usage,
+                 'mem_info': {
+                     'remain': dev.mem_info.remain,
+                     'total': dev.mem_info.total
+                 }
+             } for dev in devices.devices_info[:devices.count]]
+         }
+     }
+
+ def sys_init(dev_type: AxDeviceType = AxDeviceType.axcl_device, devid: int = 0):
+     check_error(_lib.ax_dev_sys_init(dev_type, devid))
+
+ def sys_deinit(dev_type: AxDeviceType = AxDeviceType.axcl_device, devid: int = 0):
+     check_error(_lib.ax_dev_sys_deinit(dev_type, devid))
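The `AxDevices` layout above mirrors a C struct, and `enum_devices()` slices the fixed-size `devices_info` array down to the reported `count`. The same field access works on a locally constructed instance; a standalone sketch with simplified stand-in structures (`MemInfo`, `DeviceInfo`, `Devices` are illustrative names, and no device library is required):

```python
import ctypes

class MemInfo(ctypes.Structure):
    # Same shape as AxMemInfo above: two ints
    _fields_ = [('remain', ctypes.c_int), ('total', ctypes.c_int)]

class DeviceInfo(ctypes.Structure):
    _fields_ = [('temp', ctypes.c_int), ('mem_info', MemInfo)]

class Devices(ctypes.Structure):
    _fields_ = [('count', ctypes.c_ubyte), ('devices_info', DeviceInfo * 16)]

devs = Devices()
devs.count = 1
devs.devices_info[0].temp = 45
devs.devices_info[0].mem_info = MemInfo(remain=3000, total=4096)

# Slice the fixed-size array down to the reported count, as enum_devices() does
info = [{'temp': d.temp, 'remain': d.mem_info.remain} for d in devs.devices_info[:devs.count]]
print(info)  # [{'temp': 45, 'remain': 3000}]
```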
pyyoloworld/pyyoloworld.py ADDED
@@ -0,0 +1,135 @@
+ import ctypes
+ from typing import List
+ import numpy as np
+ from pyaxdev import _lib, AxDeviceType, check_error
+
+ YOLOWORLD_CLASSES_NUM = 4
+ YOLOWORLD_CLASSES_MAX_LEN = 64
+
+ class YWInit(ctypes.Structure):
+     _fields_ = [
+         ('dev_type', AxDeviceType),
+         ('devid', ctypes.c_char),
+         ('text_encoder_path', ctypes.c_char * 128),
+         ('yoloworld_path', ctypes.c_char * 128),
+         ('tokenizer_path', ctypes.c_char * 128),
+         ('threshold', ctypes.c_float)
+     ]
+
+ class YWClasses(ctypes.Structure):
+     _fields_ = [
+         ("classes", ctypes.c_char * YOLOWORLD_CLASSES_MAX_LEN * YOLOWORLD_CLASSES_NUM),
+     ]
+
+ class YWImage(ctypes.Structure):
+     _fields_ = [
+         ('data', ctypes.POINTER(ctypes.c_ubyte)),
+         ('width', ctypes.c_int),
+         ('height', ctypes.c_int),
+         ('channels', ctypes.c_int),
+         ('stride', ctypes.c_int)
+     ]
+
+ class YWObject(ctypes.Structure):
+     _fields_ = [
+         ('label', ctypes.c_int),
+         ('score', ctypes.c_float),
+         ('x', ctypes.c_int),
+         ('y', ctypes.c_int),
+         ('w', ctypes.c_int),
+         ('h', ctypes.c_int),
+     ]
+
+ class YWObjects(ctypes.Structure):
+     _fields_ = [
+         ('objects', YWObject * 32),
+         ('num', ctypes.c_int),
+     ]
+
+ _lib.yw_create.argtypes = [ctypes.POINTER(YWInit), ctypes.POINTER(ctypes.c_void_p)]
+ _lib.yw_create.restype = ctypes.c_int
+
+ _lib.yw_destroy.argtypes = [ctypes.c_void_p]
+ _lib.yw_destroy.restype = ctypes.c_int
+
+ _lib.yw_set_classes.argtypes = [ctypes.c_void_p, ctypes.POINTER(YWClasses)]
+ _lib.yw_set_classes.restype = ctypes.c_int
+
+ _lib.yw_set_threshold.argtypes = [ctypes.c_void_p, ctypes.c_float]
+ _lib.yw_set_threshold.restype = ctypes.c_int
+
+ _lib.yw_detect.argtypes = [ctypes.c_void_p, ctypes.POINTER(YWImage), ctypes.POINTER(YWObjects)]
+ _lib.yw_detect.restype = ctypes.c_int
+
+ class YOLOWORLD:
+     def __init__(self, init_info: dict):
+         self.handle = None
+         self.init_info = YWInit()
+
+         # Initialization parameters
+         self.init_info.dev_type = init_info.get('dev_type', AxDeviceType.axcl_device)
+         self.init_info.devid = init_info.get('devid', 0)
+         self.init_info.threshold = init_info.get('threshold', 0.1)
+
+         # Model and tokenizer paths
+         for path_name in ['text_encoder_path', 'yoloworld_path', 'tokenizer_path']:
+             if path_name in init_info:
+                 setattr(self.init_info, path_name, init_info[path_name].encode('utf-8'))
+
+         # Create the native YOLOWORLD instance
+         handle = ctypes.c_void_p()
+         check_error(_lib.yw_create(ctypes.byref(self.init_info), ctypes.byref(handle)))
+         self.handle = handle
+
+     def __del__(self):
+         if self.handle:
+             _lib.yw_destroy(self.handle)
+
+     def set_classes(self, class_list: List[str]):
+         yw_classes = YWClasses()
+         for i, name in enumerate(class_list):
+             if i >= YOLOWORLD_CLASSES_NUM:
+                 break
+             name_bytes = name.encode("utf-8")
+             if len(name_bytes) >= YOLOWORLD_CLASSES_MAX_LEN:
+                 raise ValueError(f"Class name '{name}' too long (max {YOLOWORLD_CLASSES_MAX_LEN - 1})")
+             # Zero the whole row (optional; fields default to 0)
+             for j in range(YOLOWORLD_CLASSES_MAX_LEN):
+                 yw_classes.classes[i][j] = 0
+             # Copy the string bytes
+             for j in range(len(name_bytes)):
+                 yw_classes.classes[i][j] = name_bytes[j]
+
+         check_error(_lib.yw_set_classes(self.handle, ctypes.byref(yw_classes)))
+
+     def set_threshold(self, threshold: float):
+         check_error(_lib.yw_set_threshold(self.handle, threshold))
+
+     def detect(self, image_data: np.ndarray) -> List[dict]:
+         image = YWImage()
+         image.data = ctypes.cast(image_data.ctypes.data, ctypes.POINTER(ctypes.c_ubyte))
+         image.width = image_data.shape[1]
+         image.height = image_data.shape[0]
+         image.channels = image_data.shape[2]
+         image.stride = image_data.shape[1] * image_data.shape[2]
+
+         objects = YWObjects()
+         check_error(_lib.yw_detect(self.handle, ctypes.byref(image), ctypes.byref(objects)))
+
+         ret = []
+         for i in range(objects.num):
+             ret.append({
+                 'label': objects.objects[i].label,
+                 'score': objects.objects[i].score,
+                 'x': objects.objects[i].x,
+                 'y': objects.objects[i].y,
+                 'w': objects.objects[i].w,
+                 'h': objects.objects[i].h,
+             })
+         return ret
+
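`set_classes` above packs each class name byte-by-byte into a fixed `c_char * 64 * 4` matrix before handing it to the native library. The packing itself can be exercised without the `.so`; a standalone sketch (`Classes` and `pack` are illustrative stand-ins for `YWClasses` and the copy loop):

```python
import ctypes

NUM, MAXLEN = 4, 64

class Classes(ctypes.Structure):
    # Same shape as YWClasses above: 4 rows of 64 chars
    _fields_ = [("classes", ctypes.c_char * MAXLEN * NUM)]

def pack(names):
    c = Classes()  # rows start zero-filled, so strings are NUL-terminated
    for i, name in enumerate(names[:NUM]):
        data = name.encode("utf-8")
        if len(data) >= MAXLEN:
            raise ValueError(f"class name too long: {name}")
        for j in range(len(data)):
            c.classes[i][j] = data[j:j + 1]  # assign one byte at a time
    return c

c = pack(["person", "dog"])
print(c.classes[0].value)  # b'person'
```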
pyyoloworld/requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+ opencv-python
+ tqdm
+ Pillow
result.png ADDED

Git LFS Details

  • SHA256: 536a1e0c395db4050a9943fccea990f68383a2ac81b906a5d63ecf62a808fb98
  • Pointer size: 131 Bytes
  • Size of remote file: 583 kB
vocab.txt ADDED
The diff for this file is too large to render. See raw diff