Spaces:
Build error
Build error
| import gradio as gr | |
| import torch | |
| from ultralyticsplus import YOLO | |
| import numpy as np | |
| from PIL import Image | |
| from base64 import b64encode | |
| from io import BytesIO | |
| from gtts import gTTS | |
| from mtranslate import translate | |
| from speech_recognition import AudioFile, Recognizer | |
| import time | |
| from sahi.prediction import ObjectPrediction, PredictionScore | |
| from sahi.utils.cv import ( | |
| get_bool_mask_from_coco_segmentation, | |
| read_image_as_pil, | |
| visualize_object_predictions, | |
| ) | |
# Load the detector once at import time so every inference call reuses it.
model = YOLO("ultralyticsplus/yolov8s")

# Class names exposed by the loaded model, indexed by class id.
CLASS = model.model.names
def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplaying HTML audio element.

    The text is synthesized with gTTS and the resulting audio bytes are
    embedded directly in the markup as a base64 data URI, so nothing is
    written to disk.

    Args:
        text (str): Text to be spoken.
        language (str): Language code forwarded to gTTS as ``lang``.
            Defaults to ``"ja"``.

    Returns:
        str: HTML snippet containing an ``<audio controls autoplay>`` element.
    """
    audio_buffer = BytesIO()
    gTTS(text=text, lang=language, slow=False).write_to_fp(audio_buffer)
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek(0) is needed before encoding.
    b64 = b64encode(audio_buffer.getvalue()).decode()
    # gTTS writes MP3 data, so the payload is labelled audio/mpeg rather than
    # audio/wav; a mismatched MIME type relies on browser content sniffing.
    html = f"""
    <audio controls autoplay>
    <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return html
def yolov8_inference(
    image,
    area_thres=0.35,
    defaul_bot_voice="おはいようございます"
):
    """Find the largest class-0 detection in a frame and greet it if close.

    Runs the module-level YOLOv8 ``model`` on ``image``, keeps only the
    detections whose class id is 0, and selects the one whose bounding box
    covers the largest fraction of the frame. That box is cropped and scaled
    back up to the frame size. When the covered fraction reaches
    ``area_thres``, an autoplaying audio snippet speaking
    ``defaul_bot_voice`` is produced via ``tts``.

    Args:
        image: Input frame, forwarded to ``model.predict`` and
            ``read_image_as_pil``. ``None`` is accepted and short-circuits.
        area_thres (float): Minimum ratio of bounding-box area to frame area
            at which the greeting is spoken. Defaults to 0.35.
        defaul_bot_voice (str): Text to speak (synthesized with
            ``language="ja"``).
            NOTE(review): the default looks like a misspelling of
            "おはようございます"; it is left unchanged because it is part of
            the public signature — confirm with the owner.

    Returns:
        tuple: ``(view, html)``. ``view`` is the crop of the largest class-0
        detection resized to the frame size, the unmodified frame when there
        is no such detection, or ``None`` when ``image`` is ``None``.
        ``html`` is the audio markup, or ``""`` when nothing is spoken.
    """
    # NOTE(review): a live webcam stream presumably can invoke this callback
    # without a frame; bail out instead of failing inside the model.
    if image is None:
        return None, ""

    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image

    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    area_image = image.width * image.height
    boxes = results.boxes

    # Track only the best candidate; the crop and the TTS request are done
    # once after the loop instead of once per qualifying detection.
    closest_box = None
    most_close = 0
    if boxes is not None:
        for xyxy, cls in zip(boxes.xyxy, boxes.cls):
            # Only class id 0 is of interest (presumably "person" in the
            # model's label set — verify against CLASS).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            # ">=" keeps the original tie-breaking: a later detection with
            # the same area replaces an earlier one.
            if area_rate >= most_close:
                closest_box = box
                most_close = area_rate

    # No class-0 detection: show the untouched frame. Previously this path
    # raised UnboundLocalError because the output image was never assigned.
    if closest_box is None:
        return image, ""

    out_img = image.crop(tuple(closest_box)).resize((image.width, image.height))
    html_bot_voice = ""
    if most_close >= area_thres:
        html_bot_voice = tts(defaul_bot_voice, language="ja")
    return out_img, html_bot_voice
# --- Gradio UI wiring -------------------------------------------------------
title = "Detomo Aisatsu Robot"

# Two outputs, matching the (image, html) tuple returned by yolov8_inference.
outputs = [
    gr.Image(type="filepath", label="Robot View"),
    gr.HTML(),
]

# live=True together with a streaming webcam input re-runs the callback on
# incoming frames without requiring a submit click.
demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)

demo_app.launch(debug=True)