docs: Add Advanced Mode implementation plan

docs/advanced-mode-implementation-plan.md (ADDED)
# Advanced 3-Stage Meeting Summarization - Complete Implementation Plan

**Project:** Tiny Scribe - Advanced Mode
**Date:** 2026-02-04
**Status:** Ready for Implementation
**Estimated Effort:** 13-19 hours

---

## Table of Contents

1. [Executive Summary](#executive-summary)
2. [Design Decisions](#design-decisions)
3. [Model Registries](#model-registries)
4. [UI Implementation](#ui-implementation)
5. [Model Management Infrastructure](#model-management-infrastructure)
6. [Extraction Pipeline](#extraction-pipeline)
7. [Implementation Checklist](#implementation-checklist)
8. [Testing Strategy](#testing-strategy)
9. [Implementation Priority](#implementation-priority)
10. [Risk Assessment](#risk-assessment)

---

## Executive Summary

This plan details the implementation of a **3-model Advanced Summarization Pipeline** for Tiny Scribe, featuring:

- ✅ **3 independent model registries** (Extraction, Embedding, Synthesis)
- ✅ **User-configurable extraction context** (2K-8K tokens, default 4K)
- ✅ **Reasoning/thinking model support** with independent toggles per stage
- ✅ **Sequential model loading** for memory efficiency
- ✅ **Bilingual support** (English + Traditional Chinese zh-TW)
- ✅ **Fail-fast error handling** with graceful UI feedback
- ✅ **Complete independence** from Standard mode

### Architecture

```
Stage 1: EXTRACTION    → Parse transcript → Create windows → Extract JSON items
Stage 2: DEDUPLICATION → Compute embeddings → Remove semantic duplicates
Stage 3: SYNTHESIS     → Generate executive summary from deduplicated items
```

### Key Metrics

| Metric | Value |
|--------|-------|
| **New Code** | ~1,800 lines |
| **Modified Code** | ~60 lines |
| **Total Models** | 33 unique (13 + 4 + 16) |
| **Default Models** | `lfm2_extract_1.2b`, `granite-107m`, `qwen3_1.7b_q4` |
| **Memory Strategy** | Sequential load/unload (safe for HF Spaces Free Tier) |

---

## Design Decisions

### Q1: Extraction Model List Composition
**Decision:** Option A - 13 models (≤1.7B + 2 LFM2-Extract)

**Rationale:** Maximum flexibility for users; includes the specialized extraction models

### Q2: Independence from Standard Mode
**Decision:** Option A - Extraction fully independent, Synthesis references `AVAILABLE_MODELS`

**Rationale:** Avoids duplication while maintaining a clear separation of concerns

### Q3: Extraction n_ctx UI Control
**Decision:** Option A - Slider (2K-8K, step 1024, default 4K)

**Rationale:** Maximum flexibility for users to balance precision vs. speed

### Q4: Default Models
**Decision:**
- Extraction: `lfm2_extract_1.2b` (specialized, high quality)
- Embedding: `granite-107m` (fastest, good enough)
- Synthesis: `qwen3_1.7b_q4` (larger than extraction, better quality)

**Rationale:** Balanced defaults optimized for quality and speed

### Q5: Model Key Naming
**Decision:** Keep the same keys (no prefix like `adv_synth_`)

**Rationale:** Simpler, less duplication, clear role-based config resolution

### Q6: Model Overlap Between Stages
**Decision:** Allow overlap, with independent settings per role

**Rationale:** The same model can serve extraction and synthesis with different parameters

### Q7: Reasoning Checkbox UI Flow
**Decision:** Option B - Separate checkboxes for extraction and synthesis

**Rationale:** Independent control per stage, clearer user intent

### Q8: Thinking Block Display
**Decision:** Option A - Reuse the "MODEL THINKING PROCESS" field

**Rationale:** Consistent with Standard mode; no UI layout changes needed

### Q9: Window Token Counting with User n_ctx
**Decision:** Option A - Strict adherence to the user's slider value

**Rationale:** Respects the user's explicit choice; they may want larger or smaller windows

### Q10: Model Loading Error Handling
**Decision:** Option C - Graceful failure with a UI error message

**Rationale:** Most user-friendly; allows retry with a different model

---

## Model Registries

### 1. EXTRACTION_MODELS (13 models)

**Location:** `/home/luigi/tiny-scribe/app.py`

**Features:**
- ✅ Independent from `AVAILABLE_MODELS`
- ✅ User-adjustable `n_ctx` (2K-8K, default 4K)
- ✅ Extraction-optimized settings (temp 0.1-0.3)
- ✅ 2 hybrid models with reasoning toggle

**Complete Registry:**

```python
EXTRACTION_MODELS = {
    "falcon_h1_100m": {
        "name": "Falcon-H1 100M",
        "repo_id": "mradermacher/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "100M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "gemma3_270m": {
        "name": "Gemma-3 270M",
        "repo_id": "unsloth/gemma-3-270m-it-qat-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "270M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "ernie_300m": {
        "name": "ERNIE-4.5 0.3B (131K Context)",
        "repo_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_n_ctx": 4096,
        "params_size": "300M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "granite_350m": {
        "name": "Granite-4.0 350M",
        "repo_id": "unsloth/granite-4.0-h-350m-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "350M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.1,
            "top_p": 0.95,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "lfm2_350m": {
        "name": "LFM2 350M",
        "repo_id": "LiquidAI/LFM2-350M-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "350M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.0,
        },
    },
    "bitcpm4_500m": {
        "name": "BitCPM4 0.5B (128K Context)",
        "repo_id": "openbmb/BitCPM4-0.5B-GGUF",
        "filename": "*q4_0.gguf",
        "max_context": 131072,
        "default_n_ctx": 4096,
        "params_size": "500M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "hunyuan_500m": {
        "name": "Hunyuan 0.5B (256K Context)",
        "repo_id": "mradermacher/Hunyuan-0.5B-Instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 262144,
        "default_n_ctx": 4096,
        "params_size": "500M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_600m_q4": {
        "name": "Qwen3 0.6B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "600M",
        "supports_reasoning": True,  # ← HYBRID MODEL
        "supports_toggle": True,     # ← User can toggle reasoning
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.9,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    "granite_3_1_1b_q8": {
        "name": "Granite 3.1 1B-A400M Instruct (128K Context)",
        "repo_id": "bartowski/granite-3.1-1b-a400m-instruct-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 131072,
        "default_n_ctx": 4096,
        "params_size": "1B",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "falcon_h1_1.5b_q4": {
        "name": "Falcon-H1 1.5B Q4",
        "repo_id": "unsloth/Falcon-H1-1.5B-Deep-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "1.5B",
        "supports_reasoning": False,
        "supports_toggle": False,
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "qwen3_1.7b_q4": {
        "name": "Qwen3 1.7B Q4 (32K Context)",
        "repo_id": "unsloth/Qwen3-1.7B-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "1.7B",
        "supports_reasoning": True,  # ← HYBRID MODEL
        "supports_toggle": True,     # ← User can toggle reasoning
        "inference_settings": {
            "temperature": 0.3,
            "top_p": 0.9,
            "top_k": 20,
            "repeat_penalty": 1.0,
        },
    },
    # ===== SPECIALIZED EXTRACTION MODELS =====
    "lfm2_extract_350m": {
        "name": "🎯 LFM2-Extract 350M (Specialized)",
        "repo_id": "LiquidAI/LFM2-350M-Extract-GGUF",
        "filename": "*Q8_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "350M",
        "supports_reasoning": False,
        "supports_toggle": False,
        "description": "Optimized for extraction tasks",
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
    "lfm2_extract_1.2b": {
        "name": "🎯 LFM2-Extract 1.2B (High Quality)",
        "repo_id": "LiquidAI/LFM2-1.2B-Extract-GGUF",
        "filename": "*Q4_0.gguf",
        "max_context": 32768,
        "default_n_ctx": 4096,
        "params_size": "1.2B",
        "supports_reasoning": False,
        "supports_toggle": False,
        "description": "Higher quality extraction for complex meetings",
        "inference_settings": {
            "temperature": 0.2,
            "top_p": 0.9,
            "top_k": 30,
            "repeat_penalty": 1.0,
        },
    },
}
```

**Hybrid Models (Reasoning Support):**
- `qwen3_600m_q4` - 600M, user-toggleable reasoning
- `qwen3_1.7b_q4` - 1.7B, user-toggleable reasoning

---

### 2. SYNTHESIS_MODELS (16 models)

**Location:** `/home/luigi/tiny-scribe/app.py`

**Features:**
- ✅ References models #9-24 from `AVAILABLE_MODELS`
- ✅ Inherits generation settings from Standard mode
- ✅ 2 hybrid + 4 thinking-only models with reasoning support
- ✅ Default: `qwen3_1.7b_q4`

**Registry Definition:**

```python
# Synthesis models reference existing AVAILABLE_MODELS (Standard mode)
SYNTHESIS_MODELS = {
    k: AVAILABLE_MODELS[k] for k in [
        "granite_3_1_1b_q8",       # #9  - 1B, 128K ctx
        "falcon_h1_1.5b_q4",       # #10 - 1.5B, 32K ctx
        "qwen3_1.7b_q4",           # #11 - 1.7B, 32K ctx, reasoning toggle (DEFAULT)
        "granite_3_3_2b_q4",       # #12 - 2B, 128K ctx
        "youtu_llm_2b_q8",         # #13 - 2B, 128K ctx, reasoning toggle
        "lfm2_2_6b_transcript",    # #14 - 2.6B, 32K ctx, transcript-optimized
        "breeze_3b_q4",            # #15 - 3B, 32K ctx
        "granite_3_1_3b_q4",       # #16 - 3B, 128K ctx
        "qwen3_4b_thinking_q3",    # #17 - 4B, 256K ctx, thinking-only
        "granite4_tiny_q3",        # #18 - 7B, 128K ctx
        "ernie_21b_pt_q1",         # #19 - 21B, 128K ctx
        "ernie_21b_thinking_q1",   # #20 - 21B, 128K ctx, thinking-only
        "glm_4_7_flash_reap_30b",  # #21 - 30B, 128K ctx, thinking-only
        "glm_4_7_flash_30b_iq2",   # #22 - 30B, 128K ctx
        "qwen3_30b_thinking_q1",   # #23 - 30B, 256K ctx, thinking-only
        "qwen3_30b_instruct_q1",   # #24 - 30B, 256K ctx
    ]
}
```

**Reasoning Models:**
- Hybrid (toggleable): `qwen3_1.7b_q4`, `youtu_llm_2b_q8`
- Thinking-only: `qwen3_4b_thinking_q3`, `ernie_21b_thinking_q1`, `glm_4_7_flash_reap_30b`, `qwen3_30b_thinking_q1`

---

### 3. EMBEDDING_MODELS (4 models)

**Location:** `/home/luigi/tiny-scribe/meeting_summarizer/extraction.py`

**Features:**
- ✅ Dedicated embedding models (not in `AVAILABLE_MODELS`)
- ✅ Used exclusively for the deduplication phase
- ✅ Range: 384-dim to 1024-dim
- ✅ Default: `granite-107m`

**Registry:**

```python
EMBEDDING_MODELS = {
    "granite-107m": {
        "name": "Granite 107M Multilingual (384-dim)",
        "repo_id": "ibm-granite/granite-embedding-107m-multilingual",
        "filename": "*Q8_0.gguf",
        "embedding_dim": 384,
        "max_context": 2048,
        "description": "Fastest, multilingual, good for quick deduplication",
    },
    "granite-278m": {
        "name": "Granite 278M Multilingual (768-dim)",
        "repo_id": "ibm-granite/granite-embedding-278m-multilingual",
        "filename": "*Q8_0.gguf",
        "embedding_dim": 768,
        "max_context": 2048,
        "description": "Balanced speed/quality, multilingual",
    },
    "gemma-300m": {
        "name": "Embedding Gemma 300M (768-dim)",
        "repo_id": "unsloth/embeddinggemma-300m-GGUF",
        "filename": "*Q8_0.gguf",
        "embedding_dim": 768,
        "max_context": 2048,
        "description": "Google embedding model, strong semantics",
    },
    "qwen-600m": {
        "name": "Qwen3 Embedding 600M (1024-dim)",
        "repo_id": "Qwen/Qwen3-Embedding-0.6B-GGUF",
        "filename": "*Q8_0.gguf",
        "embedding_dim": 1024,
        "max_context": 2048,
        "description": "Highest quality, best for critical dedup",
    },
}
```
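
This registry exists solely to serve Stage 2, but the plan references `deduplicate_items()` without showing it. A minimal sketch of what that pass could look like, assuming llama-cpp-python's embedding mode (`Llama(..., embedding=True)`, with `embed()` assumed to return one flat vector per input string); the cross-category comparison matches the UI copy later in this plan, and the real `extraction.py` implementation may differ in batching and tracing:

```python
import numpy as np
from typing import Dict, List

def deduplicate_items(
    items: Dict[str, List[str]],
    embedder,                           # llama_cpp.Llama loaded with embedding=True
    similarity_threshold: float = 0.85  # value of the UI slider
) -> Dict[str, List[str]]:
    """Drop any item whose cosine similarity to an earlier kept item
    meets or exceeds the threshold; comparison spans all categories."""
    kept_vecs: List[np.ndarray] = []
    deduped: Dict[str, List[str]] = {}
    for category, texts in items.items():
        deduped[category] = []
        for text in texts:
            # embed() is assumed to return one flat vector for a single string
            vec = np.asarray(embedder.embed(text), dtype=np.float32)
            vec /= (np.linalg.norm(vec) + 1e-8)  # unit-normalize once
            # cosine similarity reduces to a dot product on unit vectors
            if any(float(vec @ kept) >= similarity_threshold for kept in kept_vecs):
                continue  # semantic duplicate of something already kept
            kept_vecs.append(vec)
            deduped[category].append(text)
    return deduped
```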

---

## UI Implementation

### Advanced Mode Controls (Option B: Separate Reasoning Checkboxes)

**Location:** `/home/luigi/tiny-scribe/app.py`, Gradio interface section

```python
# ===== ADVANCED MODE CONTROLS =====
with gr.Group(visible=False) as advanced_controls:
    gr.Markdown("### 🧠 Advanced 3-Model Pipeline Configuration")

    # Model Selection Row
    with gr.Row():
        extraction_model = gr.Dropdown(
            choices=list(EXTRACTION_MODELS.keys()),
            value="lfm2_extract_1.2b",  # ⭐ DEFAULT
            label="🔍 Stage 1: Extraction Model (≤1.7B)",
            info="Extracts structured items (action_items, decisions, key_points, questions) from windows"
        )

        embedding_model = gr.Dropdown(
            choices=list(EMBEDDING_MODELS.keys()),
            value="granite-107m",  # ⭐ DEFAULT
            label="🧬 Stage 2: Embedding Model",
            info="Computes semantic embeddings for deduplication across categories"
        )

        synthesis_model = gr.Dropdown(
            choices=list(SYNTHESIS_MODELS.keys()),
            value="qwen3_1.7b_q4",  # ⭐ DEFAULT
            label="✨ Stage 3: Synthesis Model (1B-30B)",
            info="Generates final executive summary from deduplicated items"
        )

    # Extraction Parameters Row
    with gr.Row():
        extraction_n_ctx = gr.Slider(
            minimum=2048,
            maximum=8192,
            step=1024,
            value=4096,  # ⭐ DEFAULT 4K
            label="🪟 Extraction Context Window (n_ctx)",
            info="Smaller = more windows (higher precision), Larger = fewer windows (faster processing)"
        )

        overlap_turns = gr.Slider(
            minimum=1,
            maximum=5,
            step=1,
            value=2,
            label="🔄 Window Overlap (speaker turns)",
            info="Number of speaker turns shared between adjacent windows (reduces information loss)"
        )

    # Deduplication Parameters Row
    with gr.Row():
        similarity_threshold = gr.Slider(
            minimum=0.70,
            maximum=0.95,
            step=0.01,
            value=0.85,
            label="🎯 Deduplication Similarity Threshold",
            info="Items with cosine similarity above this are considered duplicates (higher = stricter)"
        )

    # SEPARATE REASONING CONTROLS (Q7: Option B)
    with gr.Row():
        enable_extraction_reasoning = gr.Checkbox(
            value=False,
            visible=False,  # Conditional visibility based on extraction model
            label="🧠 Enable Reasoning for Extraction",
            info="Use thinking process before JSON output (Qwen3 hybrid models only)"
        )

        enable_synthesis_reasoning = gr.Checkbox(
            value=True,
            visible=True,  # Conditional visibility based on synthesis model
            label="🧠 Enable Reasoning for Synthesis",
            info="Use thinking process for final summary generation"
        )

    # Output Settings Row
    with gr.Row():
        adv_output_language = gr.Radio(
            choices=["en", "zh-TW"],
            value="en",
            label="🌐 Output Language",
            info="Extraction auto-detects language from transcript, synthesis uses this setting"
        )

        adv_max_tokens = gr.Slider(
            minimum=512,
            maximum=4096,
            step=128,
            value=2048,
            label="📏 Max Synthesis Tokens",
            info="Maximum tokens for final executive summary"
        )

    # Logging Control
    enable_detailed_logging = gr.Checkbox(
        value=True,
        label="📝 Enable Detailed Trace Logging",
        info="Save JSONL trace file (embedded in download JSON) for debugging pipeline"
    )

    # Model Info Accordion
    with gr.Accordion("📋 Model Details & Settings", open=False):
        with gr.Row():
            with gr.Column():
                extraction_model_info = gr.Markdown("**Extraction Model**\n\nSelect a model to see details")
            with gr.Column():
                embedding_model_info = gr.Markdown("**Embedding Model**\n\nSelect a model to see details")
            with gr.Column():
                synthesis_model_info = gr.Markdown("**Synthesis Model**\n\nSelect a model to see details")
```
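
The `extraction_n_ctx` and `overlap_turns` sliders above parameterize the window builder, which this plan references (Stage 1, Q9) but never shows. A minimal sketch under stated assumptions: the `Turn` and `Window` shapes and the `prompt_overhead` reserve are illustrative (the real `Window` also carries `model_key`, per the streaming code later in this plan), and `tokenizer.count()` matches the `NativeTokenizer` usage elsewhere:

```python
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class Turn:
    speaker: str
    text: str

@dataclass
class Window:
    content: str
    turn_span: Tuple[int, int]  # (first, last) turn index, inclusive

def build_windows(
    turns: List[Turn],
    tokenizer,                   # NativeTokenizer, as used elsewhere in this plan
    n_ctx: int = 4096,           # extraction_n_ctx slider value, used as-is (Q9)
    overlap_turns: int = 2,      # overlap_turns slider value
    prompt_overhead: int = 1024  # assumed reserve for system prompt + JSON output
) -> List[Window]:
    """Pack speaker turns into windows that fit the user's n_ctx budget."""
    budget = n_ctx - prompt_overhead
    windows: List[Window] = []
    i = 0
    while i < len(turns):
        used, j = 0, i
        while j < len(turns):
            cost = tokenizer.count(f"{turns[j].speaker}: {turns[j].text}\n")
            if used + cost > budget and j > i:  # always take at least one turn
                break
            used += cost
            j += 1
        content = "".join(f"{t.speaker}: {t.text}\n" for t in turns[i:j])
        windows.append(Window(content=content, turn_span=(i, j - 1)))
        if j >= len(turns):
            break
        i = max(i + 1, j - overlap_turns)  # share overlap_turns with the next window
    return windows
```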

---

### Conditional Reasoning Checkbox Visibility Logic

```python
def update_extraction_reasoning_visibility(model_key):
    """Show/hide extraction reasoning checkbox based on model capabilities."""
    config = EXTRACTION_MODELS.get(model_key, {})
    supports_toggle = config.get("supports_toggle", False)

    if supports_toggle:
        # Hybrid model (qwen3_600m_q4, qwen3_1.7b_q4)
        return gr.update(
            visible=True,
            value=False,
            interactive=True,
            label="🧠 Enable Reasoning for Extraction"
        )
    elif config.get("supports_reasoning", False) and not supports_toggle:
        # Thinking-only model (none currently in extraction, but future-proof)
        return gr.update(
            visible=True,
            value=True,
            interactive=False,
            label="🧠 Reasoning Mode for Extraction (Always On)"
        )
    else:
        # Non-reasoning model
        return gr.update(visible=False, value=False)


def update_synthesis_reasoning_visibility(model_key):
    """Show/hide synthesis reasoning checkbox based on model capabilities."""
    # Reuse existing logic from Standard mode
    return update_reasoning_visibility(model_key)  # Existing function


# Wire up event handlers
extraction_model.change(
    fn=update_extraction_reasoning_visibility,
    inputs=[extraction_model],
    outputs=[enable_extraction_reasoning]
)

synthesis_model.change(
    fn=update_synthesis_reasoning_visibility,
    inputs=[synthesis_model],
    outputs=[enable_synthesis_reasoning]
)
```

---

### Model Info Display Functions

```python
def get_extraction_model_info(model_key):
    """Generate markdown info for extraction model."""
    config = EXTRACTION_MODELS.get(model_key, {})
    settings = config.get("inference_settings", {})

    reasoning_support = ""
    if config.get("supports_toggle"):
        reasoning_support = "\n**Reasoning:** Hybrid (user-toggleable)"
    elif config.get("supports_reasoning"):
        reasoning_support = "\n**Reasoning:** Thinking-only (always on)"

    return f"""**{config.get('name', 'Unknown')}**

**Size:** {config.get('params_size', 'N/A')}
**Max Context:** {config.get('max_context', 0):,} tokens
**Default n_ctx:** {config.get('default_n_ctx', 4096):,} tokens (user-adjustable via slider)
**Repository:** `{config.get('repo_id', 'N/A')}`{reasoning_support}

**Extraction-Optimized Settings:**
- Temperature: {settings.get('temperature', 'N/A')} (deterministic for JSON)
- Top P: {settings.get('top_p', 'N/A')}
- Top K: {settings.get('top_k', 'N/A')}
- Repeat Penalty: {settings.get('repeat_penalty', 'N/A')}
"""


def get_embedding_model_info(model_key):
    """Generate markdown info for embedding model."""
    from meeting_summarizer.extraction import EMBEDDING_MODELS
    config = EMBEDDING_MODELS.get(model_key, {})

    return f"""**{config.get('name', 'Unknown')}**

**Embedding Dimension:** {config.get('embedding_dim', 'N/A')}
**Context:** {config.get('max_context', 0):,} tokens
**Repository:** `{config.get('repo_id', 'N/A')}`

**Description:** {config.get('description', 'N/A')}
"""


def get_synthesis_model_info(model_key):
    """Generate markdown info for synthesis model."""
    config = SYNTHESIS_MODELS.get(model_key, {})
    settings = config.get("inference_settings", {})

    reasoning_support = ""
    if config.get("supports_toggle"):
        reasoning_support = "\n**Reasoning:** Hybrid (user-toggleable)"
    elif config.get("supports_reasoning"):
        reasoning_support = "\n**Reasoning:** Thinking-only (always on)"

    return f"""**{config.get('name', 'Unknown')}**

**Max Context:** {config.get('max_context', 0):,} tokens
**Repository:** `{config.get('repo_id', 'N/A')}`{reasoning_support}

**Synthesis-Optimized Settings:**
- Temperature: {settings.get('temperature', 'N/A')} (from Standard mode)
- Top P: {settings.get('top_p', 'N/A')}
- Top K: {settings.get('top_k', 'N/A')}
- Repeat Penalty: {settings.get('repeat_penalty', 'N/A')}
"""


# Wire up info update handlers
extraction_model.change(
    fn=get_extraction_model_info,
    inputs=[extraction_model],
    outputs=[extraction_model_info]
)

embedding_model.change(
    fn=get_embedding_model_info,
    inputs=[embedding_model],
    outputs=[embedding_model_info]
)

synthesis_model.change(
    fn=get_synthesis_model_info,
    inputs=[synthesis_model],
    outputs=[synthesis_model_info]
)
```

---

## Model Management Infrastructure

### Role-Aware Configuration Resolver

```python
def get_model_config(model_key: str, model_role: str) -> Dict[str, Any]:
    """
    Get model configuration based on role.

    Ensures the same model (e.g., qwen3_1.7b_q4) uses DIFFERENT settings
    for extraction vs synthesis.

    Args:
        model_key: Model identifier (e.g., "qwen3_1.7b_q4")
        model_role: "extraction" or "synthesis"

    Returns:
        Model configuration dict with role-specific settings

    Raises:
        ValueError: If model_key not available for specified role
    """
    if model_role == "extraction":
        if model_key not in EXTRACTION_MODELS:
            available = ", ".join(list(EXTRACTION_MODELS.keys())[:3]) + "..."
            raise ValueError(
                f"Model '{model_key}' not available for extraction role. "
                f"Available: {available}"
            )
        return EXTRACTION_MODELS[model_key]

    elif model_role == "synthesis":
        if model_key not in SYNTHESIS_MODELS:
            available = ", ".join(list(SYNTHESIS_MODELS.keys())[:3]) + "..."
            raise ValueError(
                f"Model '{model_key}' not available for synthesis role. "
                f"Available: {available}"
            )
        return SYNTHESIS_MODELS[model_key]

    else:
        raise ValueError(
            f"Unknown model role: '{model_role}'. "
            f"Must be 'extraction' or 'synthesis'"
        )
```
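
A short usage example of the resolver, illustrating Q6 (same key, role-specific settings); the temperatures match the registries above:

```python
# Same model key resolves to different settings depending on role (Q6).
ext_cfg = get_model_config("qwen3_1.7b_q4", "extraction")
syn_cfg = get_model_config("qwen3_1.7b_q4", "synthesis")
assert ext_cfg["inference_settings"]["temperature"] == 0.3  # extraction-tuned
assert ext_cfg is not syn_cfg  # independent dicts, one per registry

try:
    get_model_config("granite-107m", "extraction")  # embedding key, wrong role
except ValueError as exc:
    print(exc)  # "Model 'granite-107m' not available for extraction role. ..."
```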

---

### Role-Aware Model Loader (Q9: Option A - Respect user's n_ctx choice)

```python
def load_model_for_role(
    model_key: str,
    model_role: str,
    n_threads: int = 2,
    user_n_ctx: Optional[int] = None  # For extraction, from slider
) -> Tuple[Llama, str]:
    """
    Load model with role-specific configuration.

    Args:
        model_key: Model identifier
        model_role: "extraction" or "synthesis"
        n_threads: CPU threads
        user_n_ctx: User-specified n_ctx (extraction only, from slider)

    Returns:
        (loaded_model, info_message)

    Raises:
        RuntimeError: If model loading fails (Q10: Option C - fail gracefully)
    """
    try:
        config = get_model_config(model_key, model_role)

        # Calculate n_ctx (Q9: Option A - strict adherence to the user's choice)
        if model_role == "extraction" and user_n_ctx is not None:
            n_ctx = min(user_n_ctx, config["max_context"], MAX_USABLE_CTX)
        else:
            # Synthesis or default extraction
            n_ctx = min(config.get("max_context", 8192), MAX_USABLE_CTX)

        # Detect GPU support
        requested_ngl = int(os.environ.get("N_GPU_LAYERS", 0))
        n_gpu_layers = requested_ngl

        if requested_ngl != 0:
            try:
                from llama_cpp import llama_supports_gpu_offload
                gpu_available = llama_supports_gpu_offload()
                if not gpu_available:
                    logger.warning("GPU requested but not available. Using CPU.")
                    n_gpu_layers = 0
            except Exception as e:
                logger.warning(f"Could not detect GPU: {e}. Using CPU.")
                n_gpu_layers = 0

        # Load model
        logger.info(f"Loading {config['name']} for {model_role} role (n_ctx={n_ctx:,})")

        llm = Llama.from_pretrained(
            repo_id=config["repo_id"],
            filename=config["filename"],
            n_ctx=n_ctx,
            n_batch=min(2048, n_ctx),
            n_threads=n_threads,
            n_threads_batch=n_threads,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
            seed=1337,
        )

        info_msg = (
            f"✅ Loaded: {config['name']} for {model_role} "
            f"(n_ctx={n_ctx:,}, threads={n_threads})"
        )
        logger.info(info_msg)

        return llm, info_msg

    except Exception as e:
        # Q10: Option C - Fail gracefully, let the user select a different model
        error_msg = (
            f"❌ Failed to load {model_key} for {model_role}: {str(e)}\n\n"
            f"Please select a different model and try again."
        )
        logger.error(error_msg, exc_info=True)
        raise RuntimeError(error_msg) from e


def unload_model(llm: Llama, model_name: str = "model") -> None:
    """Explicitly unload model and trigger garbage collection."""
    if llm:
        logger.info(f"Unloading {model_name}")
        del llm
        gc.collect()
        time.sleep(0.5)  # Allow OS to reclaim memory
```
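
Together, the loader and `unload_model()` implement the sequential memory strategy from the Key Metrics table: at most one model is resident at a time. A condensed sketch of how `summarize_advanced()` is expected to sequence the three stages; streaming and tracing are elided, and the helper names `extract_all_windows`, `load_embedding_model`, and `synthesize_summary` are illustrative placeholders, not functions defined by this plan:

```python
def summarize_advanced_core(transcript, cfg):
    """Non-streaming skeleton of the 3-stage pipeline: each stage loads its
    model, does its work, and unloads before the next stage starts."""
    # Stage 1: extraction (n_ctx comes straight from the user's slider, Q9)
    ext_llm, _ = load_model_for_role(cfg["extraction_model"], "extraction",
                                     user_n_ctx=cfg["extraction_n_ctx"])
    items = extract_all_windows(ext_llm, transcript, cfg)
    unload_model(ext_llm, cfg["extraction_model"])

    # Stage 2: deduplication with the dedicated embedding model
    embedder = load_embedding_model(cfg["embedding_model"])
    items = deduplicate_items(items, embedder, cfg["similarity_threshold"])
    unload_model(embedder, cfg["embedding_model"])

    # Stage 3: synthesis of the final executive summary
    syn_llm, _ = load_model_for_role(cfg["synthesis_model"], "synthesis")
    summary = synthesize_summary(syn_llm, items,
                                 language=cfg["output_language"],
                                 max_tokens=cfg["max_tokens"])
    unload_model(syn_llm, cfg["synthesis_model"])
    return summary
```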

---

## Extraction Pipeline

### Extraction System Prompt Builder (Bilingual + Reasoning)

```python
def build_extraction_system_prompt(
    output_language: str,
    supports_reasoning: bool,
    supports_toggle: bool,
    enable_reasoning: bool
) -> str:
    """
    Build extraction system prompt with optional reasoning mode.

    Args:
        output_language: "en" or "zh-TW" (auto-detected from transcript)
        supports_reasoning: Model has reasoning capability
        supports_toggle: User can toggle reasoning on/off
        enable_reasoning: User's choice (only applies if supports_toggle=True)

    Returns:
        System prompt string
    """
    # Determine reasoning mode
    if supports_toggle and enable_reasoning:
        # Hybrid model with reasoning enabled
        reasoning_instruction_en = """
Use your reasoning capabilities to analyze the content before extracting.

Your reasoning should:
1. Identify key decision points and action items
2. Distinguish explicit decisions from general discussion
3. Categorize information appropriately (action vs point vs question)

After reasoning, output ONLY valid JSON."""

        reasoning_instruction_zh = """
使用你的推理能力分析內容後再進行提取。

你的推理應該:
1. 識別關鍵決策點和行動項目
2. 區分明確決策與一般討論
3. 適當分類資訊(行動 vs 要點 vs 問題)

推理後,僅輸出 JSON。"""
    else:
        reasoning_instruction_en = ""
        reasoning_instruction_zh = ""

    # Build full prompt
    if output_language == "zh-TW":
        return f"""你是會議分析助手。從逐字稿中提取結構化資訊。
{reasoning_instruction_zh}

僅輸出有效的 JSON,使用此精確架構:
{{
  "action_items": ["包含負責人和截止日期的任務", ...],
  "decisions": ["包含理由的決策", ...],
  "key_points": ["重要討論要點", ...],
  "open_questions": ["未解決的問題或疑慮", ...]
}}

規則:
- 每個項目必須是完整、獨立的句子
- 在每個項目中包含上下文(誰、什麼、何時)
- 如果類別沒有項目,使用空陣列 []
- 僅輸出 JSON,無 markdown,無解釋"""

    else:  # English
        return f"""You are a meeting analysis assistant. Extract structured information from transcript.
{reasoning_instruction_en}

Output ONLY valid JSON with this exact schema:
{{
  "action_items": ["Task with owner and deadline", ...],
  "decisions": ["Decision made with rationale", ...],
  "key_points": ["Important discussion point", ...],
  "open_questions": ["Unresolved question or concern", ...]
}}

Rules:
- Each item must be a complete, standalone sentence
- Include context (who, what, when) in each item
- If a category has no items, use empty array []
- Output ONLY JSON, no markdown, no explanations"""
```

---

### Extraction Streaming with Reasoning Parsing (Q8: Option A - Show in "MODEL THINKING PROCESS")

```python
def stream_extract_from_window(
    extraction_llm: Llama,
    window: Window,
    window_id: int,
    total_windows: int,
    tracer: Tracer,
    tokenizer: NativeTokenizer,
    enable_reasoning: bool = False
) -> Generator[Tuple[str, str, Dict[str, List[str]], bool], None, None]:
    """
    Stream extraction from a single window with live progress + optional reasoning.

    Yields:
        (ticker_text, thinking_text, partial_items, is_complete)
        - ticker_text: Progress ticker for UI
        - thinking_text: Reasoning/thinking blocks (if extraction model supports it)
        - partial_items: Current extracted items
        - is_complete: True on final yield
    """
    # Auto-detect language from window content
    has_cjk = bool(re.search(r'[\u4e00-\u9fff]', window.content))
    output_language = "zh-TW" if has_cjk else "en"

    # Build system prompt with reasoning support
    config = EXTRACTION_MODELS[window.model_key]  # Assuming we pass model_key in Window
    system_prompt = build_extraction_system_prompt(
        output_language=output_language,
        supports_reasoning=config.get("supports_reasoning", False),
        supports_toggle=config.get("supports_toggle", False),
        enable_reasoning=enable_reasoning
    )

    user_prompt = f"Transcript:\n\n{window.content}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Stream extraction
    full_response = ""
    thinking_content = ""
    start_time = time.time()
    first_token_time = None
    token_count = 0
    input_tokens = tokenizer.count(window.content)  # computed once, reused by every ticker

    try:
        stream = extraction_llm.create_chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=config["inference_settings"]["temperature"],
            top_p=config["inference_settings"]["top_p"],
            top_k=config["inference_settings"]["top_k"],
            repeat_penalty=config["inference_settings"]["repeat_penalty"],
            stream=True,
        )

        for chunk in stream:
            if 'choices' in chunk and len(chunk['choices']) > 0:
                delta = chunk['choices'][0].get('delta', {})
                content = delta.get('content', '')

                if content:
                    if first_token_time is None:
                        first_token_time = time.time()

                    token_count += 1
                    full_response += content

                    # Parse thinking blocks if reasoning enabled
                    if enable_reasoning and config.get("supports_reasoning"):
                        thinking, remaining = parse_thinking_blocks(full_response, streaming=True)
                        thinking_content = thinking or ""
                        json_text = remaining
                    else:
                        json_text = full_response

                    # Try to parse JSON
                    partial_items = _try_parse_extraction_json(json_text)

                    # Calculate progress metrics
                    elapsed = time.time() - start_time
                    tps = token_count / elapsed if elapsed > 0 else 0
                    remaining_tokens = 1024 - token_count
                    eta = int(remaining_tokens / tps) if tps > 0 else 0

                    # Get item counts for ticker
                    items_count = {
                        "action_items": len(partial_items.get("action_items", [])),
                        "decisions": len(partial_items.get("decisions", [])),
                        "key_points": len(partial_items.get("key_points", [])),
                        "open_questions": len(partial_items.get("open_questions", []))
                    }

                    # Get last extracted item as snippet
                    last_item = ""
                    for category in ["action_items", "decisions", "key_points", "open_questions"]:
                        if partial_items.get(category):
                            last_item = partial_items[category][-1]
                            break

                    # Format progress ticker
                    ticker = format_progress_ticker(
                        current_window=window_id,
                        total_windows=total_windows,
                        window_tokens=input_tokens,
                        max_tokens=4096,  # Reference max for percentage
                        items_found=items_count,
                        tokens_per_sec=tps,
                        eta_seconds=eta,
                        current_snippet=last_item
                    )

                    # Q8: Option A - Show in "MODEL THINKING PROCESS" field
                    yield (ticker, thinking_content, partial_items, False)

        # Final parse
        if enable_reasoning and config.get("supports_reasoning"):
            thinking, remaining = parse_thinking_blocks(full_response)
            thinking_content = thinking or ""
            json_text = remaining
        else:
            json_text = full_response

        final_items = _try_parse_extraction_json(json_text)

        if not final_items:
            # JSON parsing failed - FAIL ENTIRE PIPELINE (strict mode)
            error_msg = f"Failed to parse JSON from window {window_id}. Response: {json_text[:200]}"
            tracer.log_extraction(
                window_id=window_id,
                extraction=None,
                llm_response=_sample_llm_response(full_response),
                error=error_msg
            )
            raise ValueError(error_msg)

        # Log successful extraction
        tracer.log_extraction(
            window_id=window_id,
            extraction=final_items,
            llm_response=_sample_llm_response(full_response),
            thinking=_sample_llm_response(thinking_content) if thinking_content else None,
            error=None
        )

        # Final ticker
        elapsed = time.time() - start_time
        tps = token_count / elapsed if elapsed > 0 else 0
        items_count = {k: len(v) for k, v in final_items.items()}

        ticker = format_progress_ticker(
            current_window=window_id,
            total_windows=total_windows,
            window_tokens=input_tokens,
            max_tokens=4096,
            items_found=items_count,
            tokens_per_sec=tps,
            eta_seconds=0,
            current_snippet="✅ Extraction complete"
        )

        yield (ticker, thinking_content, final_items, True)

    except Exception as e:
        # Log error and re-raise to fail entire pipeline
        tracer.log_extraction(
            window_id=window_id,
            extraction=None,
            llm_response=_sample_llm_response(full_response) if full_response else "",
            error=str(e)
        )
        raise
```
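
The streaming generator above leans on three helpers that this plan references but never defines. Minimal sketches of plausible implementations follow; the `<think>...</think>` convention is an assumption based on Qwen3-style chat templates, and the real `extraction.py` versions may differ:

```python
import json
import re
from typing import Dict, List, Optional, Tuple

_CATEGORIES = ("action_items", "decisions", "key_points", "open_questions")


def parse_thinking_blocks(text: str, streaming: bool = False) -> Tuple[Optional[str], str]:
    """Split <think>...</think> reasoning from the answer text.

    In streaming mode, a still-open <think> block is treated as all-thinking
    so partial reasoning can be shown live in "MODEL THINKING PROCESS".
    """
    if "<think>" not in text:
        return None, text
    if "</think>" in text:
        thinking = "\n".join(re.findall(r"<think>(.*?)</think>", text, re.DOTALL)).strip()
        remaining = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
        return (thinking or None), remaining
    if streaming:
        head, _, tail = text.partition("<think>")
        return (tail.strip() or None), head.strip()
    return None, text


def _try_parse_extraction_json(text: str) -> Dict[str, List[str]]:
    """Best-effort parse of the model's JSON output; {} means 'no parse yet'."""
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end <= start:
        return {}
    try:
        data = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return {}
    if not isinstance(data, dict):
        return {}
    out: Dict[str, List[str]] = {}
    for cat in _CATEGORIES:
        vals = data.get(cat, [])
        if not isinstance(vals, list):
            vals = []  # tolerate schema drift in model output
        out[cat] = [str(x) for x in vals if str(x).strip()]
    return out


def _sample_llm_response(text: str, limit: int = 2000) -> str:
    """Truncate raw LLM output before it is written into the JSONL trace."""
    if len(text) <= limit:
        return text
    return text[:limit] + f"... [{len(text) - limit} chars truncated]"
```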

---

## Implementation Checklist

### Files to Create

- [ ] `/home/luigi/tiny-scribe/meeting_summarizer/extraction.py` (~900 lines)
  - [ ] `NativeTokenizer` class
  - [ ] `EmbeddingModel` class + `EMBEDDING_MODELS` registry
  - [ ] `format_progress_ticker()` function
  - [ ] `stream_extract_from_window()` function (with reasoning support)
  - [ ] `deduplicate_items()` function
  - [ ] `stream_synthesize_executive_summary()` function

### Files to Modify

- [ ] `/home/luigi/tiny-scribe/meeting_summarizer/__init__.py`
  - [ ] Remove `filter_validated_items` import/export

- [ ] `/home/luigi/tiny-scribe/meeting_summarizer/trace.py` (see the sketch after this checklist)
  - [ ] Add `log_extraction()` method
  - [ ] Add `log_deduplication()` method
  - [ ] Add `log_synthesis()` method

- [ ] `/home/luigi/tiny-scribe/app.py` (~800 lines added/modified)
  - [ ] Add `EXTRACTION_MODELS` registry (13 models)
  - [ ] Add `SYNTHESIS_MODELS` reference
  - [ ] Add `get_model_config()` function
  - [ ] Add `load_model_for_role()` function
  - [ ] Add `unload_model()` function
  - [ ] Add `build_extraction_system_prompt()` function
  - [ ] Add `summarize_advanced()` generator function
  - [ ] Add Advanced mode UI controls
  - [ ] Add reasoning visibility logic
  - [ ] Add model info display functions
  - [ ] Update `download_summary_json()` for trace embedding
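
A minimal sketch of the three new `trace.py` methods, appending one JSONL record per pipeline event. The record field names here are assumptions layered on a simplified `Tracer`; the real class in `meeting_summarizer/trace.py` defines the actual interface:

```python
import json
import time


class Tracer:
    """Sketch only: one JSONL line per logged pipeline event."""

    def __init__(self, path: str):
        self.path = path

    def _write(self, record: dict) -> None:
        record["ts"] = time.time()
        with open(self.path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def log_extraction(self, window_id, extraction, llm_response,
                       thinking=None, error=None):
        self._write({"stage": "extraction", "window_id": window_id,
                     "items": extraction, "response_sample": llm_response,
                     "thinking_sample": thinking, "error": error})

    def log_deduplication(self, before_counts, after_counts, threshold):
        self._write({"stage": "deduplication", "before": before_counts,
                     "after": after_counts, "threshold": threshold})

    def log_synthesis(self, model_key, summary_sample, error=None):
        self._write({"stage": "synthesis", "model": model_key,
                     "summary_sample": summary_sample, "error": error})
```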
|
| 1149 |
+
|
| 1150 |
+
### Code Statistics
|
| 1151 |
+
|
| 1152 |
+
| Metric | Count |
|
| 1153 |
+
|--------|-------|
|
| 1154 |
+
| **New Lines** | ~1,800 |
|
| 1155 |
+
| **Modified Lines** | ~60 |
|
| 1156 |
+
| **Removed Lines** | ~2 |
|
| 1157 |
+
| **New Functions** | 12 |
|
| 1158 |
+
| **New Classes** | 2 |
|
| 1159 |
+
| **UI Controls** | 11 |
|
| 1160 |
+
|
| 1161 |
+
---
|
| 1162 |
+
|
| 1163 |
+
## Testing Strategy
|
| 1164 |
+
|
| 1165 |
+
### Phase 1: Model Registry Validation
|
| 1166 |
+
|
| 1167 |
+
```bash
|
| 1168 |
+
python -c "
|
| 1169 |
+
from app import EXTRACTION_MODELS, SYNTHESIS_MODELS
|
| 1170 |
+
from meeting_summarizer.extraction import EMBEDDING_MODELS
|
| 1171 |
+
|
| 1172 |
+
assert len(EXTRACTION_MODELS) == 13, 'Extraction models count mismatch'
|
| 1173 |
+
assert len(EMBEDDING_MODELS) == 4, 'Embedding models count mismatch'
|
| 1174 |
+
assert len(SYNTHESIS_MODELS) == 16, 'Synthesis models count mismatch'
|
| 1175 |
+
|
| 1176 |
+
# Verify independent settings
|
| 1177 |
+
ext_qwen = EXTRACTION_MODELS['qwen3_1.7b_q4']['inference_settings']['temperature']
|
| 1178 |
+
syn_qwen = SYNTHESIS_MODELS['qwen3_1.7b_q4']['inference_settings']['temperature']
|
| 1179 |
+
assert ext_qwen == 0.3, f'Extraction temp wrong: {ext_qwen}'
|
| 1180 |
+
assert syn_qwen == 0.6, f'Synthesis temp wrong: {syn_qwen}'
|
| 1181 |
+
|
| 1182 |
+
print('✅ All model registries validated!')
|
| 1183 |
+
"
|
| 1184 |
+
```
|
| 1185 |
+
|
| 1186 |
+
### Phase 2: UI Control Validation
|
| 1187 |
+
|
| 1188 |
+
**Manual Checks:**
|
| 1189 |
+
1. Select "Advanced" mode
|
| 1190 |
+
2. Verify 3 dropdowns show correct counts (13, 4, 16)
|
| 1191 |
+
3. Verify default models selected
|
| 1192 |
+
4. Adjust extraction_n_ctx slider (2K → 8K)
|
| 1193 |
+
5. Select qwen3_600m_q4 for extraction → reasoning checkbox appears
|
| 1194 |
+
6. Select lfm2_extract_1.2b for extraction → reasoning checkbox hidden
|
| 1195 |
+
7. Select qwen3_4b_thinking_q3 for synthesis → reasoning locked ON
|
| 1196 |
+
8. Verify model info panels update on selection
|
| 1197 |
+
|
| 1198 |
+
### Phase 3: Pipeline Test - min.txt (Quick)
|
| 1199 |
+
|
| 1200 |
+
**Configuration:**
|
| 1201 |
+
- Extraction: `lfm2_extract_1.2b` (default)
|
| 1202 |
+
- Extraction n_ctx: 4096 (default)
|
| 1203 |
+
- Embedding: `granite-107m` (default)
|
| 1204 |
+
- Synthesis: `qwen3_1.7b_q4` (default)
|
| 1205 |
+
- Similarity threshold: 0.85 (default)
|
| 1206 |
+
|
| 1207 |
+
**Expected:**
|
| 1208 |
+
- 1 window created
|
| 1209 |
+
- ~2-4 items extracted
|
| 1210 |
+
- 0-1 duplicates removed
|
| 1211 |
+
- Final summary generated
|
| 1212 |
+
- Total time: ~30-60s
|
| 1213 |
+
- Download JSON contains trace
|
| 1214 |
+
|
| 1215 |
+
### Phase 4: Pipeline Test - Reasoning Models

**Configuration:**
- Extraction: `qwen3_600m_q4`
- ☑ Enable Reasoning for Extraction (tests a hybrid model)
- Extraction n_ctx: 2048 (smaller windows)
- Embedding: `granite-278m` (tests the balanced embedding)
- Synthesis: `qwen3_1.7b_q4`
- ☑ Enable Reasoning for Synthesis

**Expected:**
- More windows (~4-6 with 2K context)
- "MODEL THINKING PROCESS" shows extraction thinking + ticker (thinking-block parsing sketched below)
- ~10-15 items extracted
- ~2-4 duplicates removed
- Final summary with thinking blocks
- Total time: ~2-3 min

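The real logic belongs in `stream_extract_from_window()`; as a minimal non-streaming sketch of separating thinking from the final answer, assuming Qwen3-style `<think>...</think>` delimiters:

```python
import re

THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)

def split_thinking(raw: str) -> tuple[str, str]:
    """Split model output into (thinking, answer).

    Tolerates an unterminated <think> block, which can happen when
    generation is cut off mid-thought.
    """
    match = THINK_RE.search(raw)
    if match:
        thinking = match.group(1).strip()
        answer = THINK_RE.sub("", raw, count=1).strip()
        return thinking, answer
    if "<think>" in raw:  # opened but never closed
        head, _, tail = raw.partition("<think>")
        return tail.strip(), head.strip()
    return "", raw.strip()
```
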
### Phase 5: Pipeline Test - full.txt (Production)

**Configuration:**
- Extraction: `lfm2_extract_1.2b` (high quality)
- Extraction n_ctx: 4096 (default)
- Embedding: `qwen-600m` (highest quality)
- Synthesis: `qwen3_4b_thinking_q3` (4B thinking model)
- Output language: zh-TW (tests Chinese output)

**Expected:**
- ~3-5 windows (4K context)
- ~40-60 items extracted
- ~10-15 duplicates removed
- Final summary in Traditional Chinese
- Total time: ~5-8 min
- Download JSON with embedded trace (~1-2MB)

### Phase 6: Error Handling Test (Q10: Option C)

**Scenarios:**
1. Disconnect internet during model download
2. Manually corrupt the model cache
3. Use an invalid model repo_id in EXTRACTION_MODELS

**Expected behavior:**
- Error message displayed in UI: "❌ Failed to load lfm2_extract_1.2b..."
- Pipeline stops (no fallback attempt; see the fail-fast sketch below)
- User can select a different model and retry
- Trace file saved with error details

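A minimal sketch of that fail-fast contract; `loader` and `log_error` stand in for `load_model_for_role()` and one of the new `trace.py` logging methods, whose exact signatures are assumptions:

```python
import gradio as gr

def load_or_abort(loader, role: str, model_key: str, log_error):
    """Attempt one model load; on failure, report it and stop (no fallback)."""
    try:
        return loader(role, model_key)
    except Exception as exc:
        # No silent fallback: record the failure so it lands in the trace file
        log_error(role=role, model=model_key, error=str(exc))
        # gr.Error aborts the event handler and surfaces the message in the UI
        raise gr.Error(f"❌ Failed to load {model_key}: {exc}") from exc
```
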
---

## Implementation Priority

### Suggested Implementation Sequence (13-19 hours total)

**1. Model Registries (1-2 hours)**
- [ ] Add `EXTRACTION_MODELS` to `app.py` (entry shape sketched below)
- [ ] Add `SYNTHESIS_MODELS` reference
- [ ] Add `EMBEDDING_MODELS` to `extraction.py`
- [ ] Validate with smoke test

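For reference, the minimal entry shape the Phase 1 smoke test exercises; this is an illustrative fragment, not the authoritative registry definition given earlier in this plan:

```python
# Illustrative fragment, not the full 13-entry registry.
EXTRACTION_MODELS = {
    "qwen3_1.7b_q4": {
        "repo_id": "...",  # real repo ID specified in the registry section
        "inference_settings": {"temperature": 0.3},
    },
    # ...12 more entries
}
```
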
**2. Core Infrastructure (2-3 hours)**
- [ ] Implement `get_model_config()`
- [ ] Implement `load_model_for_role()` with user_n_ctx support (sketched below)
- [ ] Implement `unload_model()`
- [ ] Implement `build_extraction_system_prompt()` with reasoning support
- [ ] Update `trace.py` with 3 new logging methods
- [ ] Update `__init__.py`

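A minimal sketch of `load_model_for_role()`, assuming llama-cpp-python's `Llama.from_pretrained()` and registry entries carrying `repo_id`, a GGUF `filename`, and a default `n_ctx` (the `filename` and `n_ctx` keys are assumptions about the schema):

```python
from llama_cpp import Llama

MAX_USABLE_CTX = 32_768  # cap from the risk table below

def load_model_for_role(registry: dict, model_key: str,
                        user_n_ctx: int | None = None) -> Llama:
    """Load one GGUF model for a pipeline role, honoring the n_ctx slider."""
    cfg = registry[model_key]
    n_ctx = cfg.get("n_ctx", 4096)
    if user_n_ctx is not None:
        n_ctx = min(user_n_ctx, MAX_USABLE_CTX)  # OOM guard from the risk table
    return Llama.from_pretrained(
        repo_id=cfg["repo_id"],
        filename=cfg["filename"],  # assumed field: GGUF file within the repo
        n_ctx=n_ctx,
        verbose=False,
    )
```
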
**3. Extraction Module (3-4 hours)**
- [ ] Implement `NativeTokenizer` class
- [ ] Implement `EmbeddingModel` class
- [ ] Implement `format_progress_ticker()`
- [ ] Implement `stream_extract_from_window()` with reasoning parsing
- [ ] Implement `deduplicate_items()` (sketched below)
- [ ] Implement `stream_synthesize_executive_summary()`

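A minimal sketch of `deduplicate_items()` as greedy cosine-similarity filtering at the 0.85 default threshold; `embed` stands in for whichever `EmbeddingModel` the user selected and is assumed to return one vector per text:

```python
import numpy as np

def deduplicate_items(items: list[str], embed,
                      threshold: float = 0.85) -> list[str]:
    """Drop items whose embedding nearly duplicates an earlier kept item.

    `embed(texts) -> np.ndarray` is assumed to return one row per text.
    """
    if not items:
        return []
    vecs = embed(items)
    # L2-normalize so the dot product equals cosine similarity
    vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
    kept: list[int] = []
    for i, v in enumerate(vecs):
        if all(float(v @ vecs[j]) < threshold for j in kept):
            kept.append(i)
    return [items[i] for i in kept]
```
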
**4. UI Integration (2-3 hours)**
- [ ] Add Advanced mode controls to Gradio interface
- [ ] Implement reasoning checkbox visibility logic
- [ ] Implement model info display functions
- [ ] Wire up all event handlers
- [ ] Test UI responsiveness

**5. Pipeline Orchestration (3-4 hours)**
- [ ] Implement `summarize_advanced()` generator function (skeleton sketched below)
- [ ] Sequential model loading/unloading logic
- [ ] Error handling with graceful failures
- [ ] Progress ticker updates
- [ ] Trace embedding in download JSON

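A skeleton of the `summarize_advanced()` generator showing the intended shape: load, run, unload, and yield after every stage. Helper names follow the checklists above, but their signatures are assumptions, and `split_into_windows()`, `extract_from_window()`, `load_embedding_model()`, and `synthesize_executive_summary()` are hypothetical non-streaming stand-ins for the `stream_*` generators:

```python
def summarize_advanced(transcript: str, ext_key: str, emb_key: str,
                       syn_key: str, user_n_ctx: int):
    """Skeleton generator: yields (status, summary_so_far) for Gradio streaming."""
    # --- Stage 1: extraction (load -> run -> unload) ---
    yield f"Loading extraction model {ext_key}...", ""
    ext = load_model_for_role(EXTRACTION_MODELS, ext_key, user_n_ctx)
    items = []
    for i, window in enumerate(split_into_windows(transcript, user_n_ctx), 1):
        items.extend(extract_from_window(ext, window))
        yield f"Extracting... window {i}", ""
    unload_model(ext)  # sequential load/unload keeps peak RAM low

    # --- Stage 2: deduplication with the selected embedding model ---
    yield f"Deduplicating {len(items)} items...", ""
    embed = load_embedding_model(emb_key)
    items = deduplicate_items(items, embed)
    unload_model(embed)

    # --- Stage 3: synthesis ---
    yield f"Synthesizing with {syn_key}...", ""
    syn = load_model_for_role(SYNTHESIS_MODELS, syn_key, None)
    summary = synthesize_executive_summary(syn, items)
    unload_model(syn)
    yield "Done ✅", summary
```
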
**6. Testing & Validation (2-3 hours)**
|
| 1306 |
+
- [ ] Run all test phases (min.txt → full.txt)
|
| 1307 |
+
- [ ] Validate reasoning models behavior
|
| 1308 |
+
- [ ] Test error handling scenarios
|
| 1309 |
+
- [ ] Performance optimization (if needed)
|
| 1310 |
+
|
| 1311 |
+
---

## Risk Assessment

| Risk | Probability | Impact | Mitigation |
|------|-------------|--------|------------|
| **LFM2-Extract models don't exist on HuggingFace** | Medium | High | Verify repo availability before implementation; prepare fallback to qwen3_600m_q4 |
| **Memory overflow on HF Spaces Free Tier** | Low | High | Sequential loading/unloading tested; add memory monitoring |
| **Reasoning output breaks JSON parsing** | Medium | Medium | Robust thinking-block parsing with fallback; strict error handling |
| **User n_ctx slider causes OOM** | Low | Medium | Cap at MAX_USABLE_CTX (32K); show warning if user sets it too high |
| **Embedding models slow down pipeline** | Medium | Low | Default to granite-107m (fastest); user can upgrade if needed |
| **Trace file too large** | Low | Low | Response sampling (400 chars) already implemented; compress if >5MB |

---

## Appendix: Model Comparison Tables

### Extraction Models (13)

| Model | Size | Context | Reasoning | Settings |
|-------|------|---------|-----------|----------|
| falcon_h1_100m | 100M | 32K | No | temp=0.2 |
| gemma3_270m | 270M | 32K | No | temp=0.3 |
| ernie_300m | 300M | 131K | No | temp=0.2 |
| granite_350m | 350M | 32K | No | temp=0.1 |
| lfm2_350m | 350M | 32K | No | temp=0.2 |
| bitcpm4_500m | 500M | 128K | No | temp=0.2 |
| hunyuan_500m | 500M | 256K | No | temp=0.2 |
| qwen3_600m_q4 | 600M | 32K | **Hybrid** | temp=0.3 |
| granite_3_1_1b_q8 | 1B | 128K | No | temp=0.3 |
| falcon_h1_1.5b_q4 | 1.5B | 32K | No | temp=0.2 |
| qwen3_1.7b_q4 | 1.7B | 32K | **Hybrid** | temp=0.3 |
| lfm2_extract_350m | 350M | 32K | No | temp=0.2 |
| lfm2_extract_1.2b | 1.2B | 32K | No | temp=0.2 |

### Synthesis Models (16)

| Model | Size | Context | Reasoning | Settings |
|-------|------|---------|-----------|----------|
| granite_3_1_1b_q8 | 1B | 128K | No | temp=0.7 |
| falcon_h1_1.5b_q4 | 1.5B | 32K | No | temp=0.1 |
| qwen3_1.7b_q4 | 1.7B | 32K | **Hybrid** | temp=0.6 |
| granite_3_3_2b_q4 | 2B | 128K | No | temp=0.7 |
| youtu_llm_2b_q8 | 2B | 128K | **Hybrid** | temp=0.7 |
| lfm2_2_6b_transcript | 2.6B | 32K | No | temp=0.6 |
| breeze_3b_q4 | 3B | 32K | No | temp=0.6 |
| granite_3_1_3b_q4 | 3B | 128K | No | temp=0.7 |
| qwen3_4b_thinking_q3 | 4B | 256K | **Thinking-only** | temp=0.6 |
| granite4_tiny_q3 | 7B | 128K | No | temp=0.7 |
| ernie_21b_pt_q1 | 21B | 128K | No | temp=0.7 |
| ernie_21b_thinking_q1 | 21B | 128K | **Thinking-only** | temp=0.8 |
| glm_4_7_flash_reap_30b | 30B | 128K | **Thinking-only** | temp=0.6 |
| glm_4_7_flash_30b_iq2 | 30B | 128K | No | temp=0.6 |
| qwen3_30b_thinking_q1 | 30B | 256K | **Thinking-only** | temp=0.6 |
| qwen3_30b_instruct_q1 | 30B | 256K | No | temp=0.6 |

### Embedding Models (4)

| Model | Size | Dimension | Speed | Quality |
|-------|------|-----------|-------|---------|
| granite-107m | 107M | 384 | Fastest | Good |
| granite-278m | 278M | 768 | Balanced | Better |
| gemma-300m | 300M | 768 | Fast | Good |
| qwen-600m | 600M | 1024 | Slower | Best |

---

## Next Steps

Once approved, implementation will proceed in the order outlined in the Implementation Priority section above. All code will be committed with descriptive messages referencing this plan document.

**Ready for implementation approval.**

---

**Document Version:** 1.0
**Last Updated:** 2026-02-04
**Author:** Claude (Anthropic)
**Reviewer:** [Pending]