Add --live flag and /live command for streaming mode
Browse files
- chat_minicpmo.py +15 -2
chat_minicpmo.py
CHANGED
|
@@ -380,7 +380,7 @@ def run_interactive(model, processor, args):
|
|
| 380 |
current_file = args.file
|
| 381 |
current_audio = args.audio
|
| 382 |
print("MiniCPM-o 4.5 MLX Chat")
|
| 383 |
-
print("Commands: /image <path> | /audio <path> | /clear | /quit")
|
| 384 |
if current_file:
|
| 385 |
print(f"Loaded image: {current_file}")
|
| 386 |
if current_audio:
|
|
@@ -414,6 +414,11 @@ def run_interactive(model, processor, args):
|
|
| 414 |
current_file = None
|
| 415 |
print(f"Audio loaded: {current_audio}\n")
|
| 416 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
print()
|
| 419 |
|
|
@@ -449,6 +454,8 @@ def main():
|
|
| 449 |
python chat_minicpmo.py photo.jpg -p "What's in this image?"
|
| 450 |
python chat_minicpmo.py --audio speech.wav -p "Transcribe this."
|
| 451 |
python chat_minicpmo.py --audio speech.wav # interactive with audio
|
|
|
|
|
|
|
| 452 |
python chat_minicpmo.py # interactive mode
|
| 453 |
""",
|
| 454 |
)
|
|
@@ -463,13 +470,19 @@ def main():
|
|
| 463 |
parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens")
|
| 464 |
parser.add_argument("--temp", type=float, default=0.0, help="Temperature")
|
| 465 |
parser.add_argument("--max-slices", type=int, default=9, help="Max image slices")
|
|
|
|
|
|
|
|
|
|
| 466 |
args = parser.parse_args()
|
| 467 |
|
| 468 |
print("Loading model...", flush=True)
|
| 469 |
model, processor = load(args.model, trust_remote_code=True)
|
| 470 |
print("Model ready.\n")
|
| 471 |
|
| 472 |
-
if args.prompt:
|
|
|
|
|
|
|
|
|
|
| 473 |
run_once(model, processor, args)
|
| 474 |
else:
|
| 475 |
run_interactive(model, processor, args)
|
|
|
|
| 380 |
current_file = args.file
|
| 381 |
current_audio = args.audio
|
| 382 |
print("MiniCPM-o 4.5 MLX Chat")
|
| 383 |
+
print("Commands: /image <path> | /audio <path> | /live | /clear | /quit")
|
| 384 |
if current_file:
|
| 385 |
print(f"Loaded image: {current_file}")
|
| 386 |
if current_audio:
|
|
|
|
| 414 |
current_file = None
|
| 415 |
print(f"Audio loaded: {current_audio}\n")
|
| 416 |
continue
|
| 417 |
+
if prompt.lower() == "/live":
|
| 418 |
+
from streaming import run_live_mode
|
| 419 |
+
run_live_mode(model, processor, args)
|
| 420 |
+
print()
|
| 421 |
+
continue
|
| 422 |
|
| 423 |
print()
|
| 424 |
|
|
|
|
| 454 |
python chat_minicpmo.py photo.jpg -p "What's in this image?"
|
| 455 |
python chat_minicpmo.py --audio speech.wav -p "Transcribe this."
|
| 456 |
python chat_minicpmo.py --audio speech.wav # interactive with audio
|
| 457 |
+
python chat_minicpmo.py --live # full duplex streaming
|
| 458 |
+
python chat_minicpmo.py --live --capture-region 0,0,1920,1080
|
| 459 |
python chat_minicpmo.py # interactive mode
|
| 460 |
""",
|
| 461 |
)
|
|
|
|
| 470 |
parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens")
|
| 471 |
parser.add_argument("--temp", type=float, default=0.0, help="Temperature")
|
| 472 |
parser.add_argument("--max-slices", type=int, default=9, help="Max image slices")
|
| 473 |
+
parser.add_argument("--live", action="store_true", help="Full duplex streaming mode")
|
| 474 |
+
parser.add_argument("--capture-region", default=None, help="Screen region x,y,w,h (default: primary monitor)")
|
| 475 |
+
parser.add_argument("--audio-device", default="BlackHole", help="Audio input device (default: BlackHole)")
|
| 476 |
args = parser.parse_args()
|
| 477 |
|
| 478 |
print("Loading model...", flush=True)
|
| 479 |
model, processor = load(args.model, trust_remote_code=True)
|
| 480 |
print("Model ready.\n")
|
| 481 |
|
| 482 |
+
if args.live:
|
| 483 |
+
from streaming import run_live_mode
|
| 484 |
+
run_live_mode(model, processor, args)
|
| 485 |
+
elif args.prompt:
|
| 486 |
run_once(model, processor, args)
|
| 487 |
else:
|
| 488 |
run_interactive(model, processor, args)
|