Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,1372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
import openai
|
| 5 |
+
from google.colab import userdata
|
| 6 |
+
import PyPDF2
|
| 7 |
+
import docx
|
| 8 |
+
from pptx import Presentation
|
| 9 |
+
import json
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
import numpy as np
|
| 12 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 13 |
+
import requests
|
| 14 |
+
from bs4 import BeautifulSoup
|
| 15 |
+
from duckduckgo_search import DDGS
|
| 16 |
+
import re
|
| 17 |
+
from urllib.parse import urlparse
|
| 18 |
+
|
| 19 |
+
# 全域變數儲存處理後的資料
|
| 20 |
+
processed_data_store = None
|
| 21 |
+
|
| 22 |
+
# 檢查檔案大小(Whisper API 限制 25MB)
|
| 23 |
+
def check_file_size(file_path, max_size_mb=25):
    """Return the size of ``file_path`` in megabytes, enforcing an upper bound.

    Args:
        file_path: Path of the file to measure.
        max_size_mb: Largest accepted size in MB. Defaults to 25, the limit
            the original code hard-coded for the transcription upload.

    Returns:
        The file size in MB as a float.

    Raises:
        ValueError: If the file is larger than ``max_size_mb``.
        OSError: Propagated from ``os.path.getsize`` if the path cannot be
            inspected.
    """
    size_mb = os.path.getsize(file_path) / (1024 * 1024)  # bytes -> MB
    if size_mb > max_size_mb:
        # With the default limit this is the exact message the callers
        # already surface to the user.
        raise ValueError(f"檔案大小超過 {max_size_mb}MB,請上傳較小的檔案。")
    return size_mb
|
| 28 |
+
|
| 29 |
+
# 語音轉文字主函數
|
| 30 |
+
def transcribe_audio(audio_file, language="zh"):
    """Transcribe a .wav/.mp3 file and save the transcript to a temp .txt file.

    Args:
        audio_file: Path to the audio file.
        language: Language code forwarded to the transcription request.

    Returns:
        ``(transcript_text, temp_file_path)`` on success. On failure the
        first element is an error message and the second is ``None``: this
        covers an unsupported extension, a ``ValueError`` (e.g. from the size
        check) and any other exception.
    """
    supported_suffixes = ('.wav', '.mp3')
    try:
        suffix = os.path.splitext(audio_file)[1].lower()
        if suffix not in supported_suffixes:
            return "錯誤:僅支援 .wav 或 .mp3 檔案格式!", None

        check_file_size(audio_file)
        api_client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        with open(audio_file, "rb") as audio_stream:
            response = api_client.audio.transcriptions.create(
                model="gpt-4o-transcribe",
                file=audio_stream,
                language=language,
            )
        transcript_text = response.text

        # delete=False: the file must outlive this function because its path
        # is handed back to the caller.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as transcript_file:
            transcript_file.write(transcript_text.encode('utf-8'))
            saved_path = transcript_file.name

        return transcript_text, saved_path

    except ValueError as value_error:
        return f"錯誤:{value_error}", None
    except Exception as error:
        return f"轉錄失敗:{error}", None
|
| 57 |
+
|
| 58 |
+
# 文檔內容提取函數
|
| 59 |
+
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The page texts, each followed by a newline, concatenated and stripped
        of surrounding whitespace. If anything raises, a string starting with
        "PDF 讀取錯誤:" is returned instead of propagating the exception.
    """
    try:
        with open(file_path, 'rb') as pdf_stream:
            pdf_reader = PyPDF2.PdfReader(pdf_stream)
            page_texts = [page.extract_text() + "\n" for page in pdf_reader.pages]
            return "".join(page_texts).strip()
    except Exception as error:
        return f"PDF 讀取錯誤:{error}"
|
| 69 |
+
|
| 70 |
+
def extract_text_from_docx(file_path):
    """Extract the paragraph text of a .docx document.

    Args:
        file_path: Path to the .docx file.

    Returns:
        Every paragraph's text, each followed by a newline, concatenated and
        stripped. If anything raises, a string starting with "DOCX 讀取錯誤:"
        is returned instead of propagating the exception.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = [para.text + "\n" for para in document.paragraphs]
        return "".join(paragraph_texts).strip()
    except Exception as error:
        return f"DOCX 讀取錯誤:{error}"
|
| 79 |
+
|
| 80 |
+
def extract_text_from_pptx(file_path):
    """Extract the text of every shape on every slide of a presentation.

    Args:
        file_path: Path to the presentation file.

    Returns:
        The ``text`` of each shape that has such an attribute, each followed
        by a newline, concatenated in slide order and stripped. If anything
        raises, a string starting with "PPTX 讀取錯誤:" is returned instead
        of propagating the exception.
    """
    try:
        deck = Presentation(file_path)
        shape_texts = [
            shape.text + "\n"
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "".join(shape_texts).strip()
    except Exception as error:
        return f"PPTX 讀取錯誤:{error}"
|
| 91 |
+
|
| 92 |
+
def extract_document_content(file_path):
    """Extract text from a document, choosing the extractor by file extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.ppt``/``.pptx`` are routed to their respective extractors.

    Args:
        file_path: Path to the document.

    Returns:
        Whatever the selected extractor returns, or a message starting with
        "不支援的檔案格式:" when the extension is not recognised.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        return extract_text_from_pdf(file_path)
    if extension == '.docx':
        return extract_text_from_docx(file_path)
    if extension in ('.ppt', '.pptx'):
        return extract_text_from_pptx(file_path)
    return f"不支援的檔案格式:{extension}"
|
| 103 |
+
|
| 104 |
+
# 文字分塊處理
|
| 105 |
+
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split ``text`` into overlapping chunks, preferring sentence boundaries.

    Each window starts as ``chunk_size`` characters. If more text follows,
    the end of the window is moved back to just after the nearest
    sentence-ending character found in the second half of the window. The
    next window then starts ``overlap`` characters before that end.

    Sentence-ending characters are the newline, the CJK full stop, and both
    the ASCII and full-width forms of ``.``/``!``/``?``.

    Args:
        text: The text to split.
        chunk_size: Target window length in characters; must be >= 1 when the
            text is longer than ``chunk_size``.
        overlap: Number of characters shared between consecutive chunks. If
            the overlap would prevent the window from advancing, it is
            dropped for that step so the loop always terminates.

    Returns:
        A list of stripped, non-empty chunks. Text no longer than
        ``chunk_size`` is returned unchanged as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 and the text needs
            splitting.
    """
    if len(text) <= chunk_size:
        return [text]
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # ASCII and full-width sentence terminators, plus the CJK full stop.
    sentence_breaks = frozenset('。!?\n.!?')
    text_length = len(text)
    chunks = []
    start = 0

    while start < text_length:
        end = start + chunk_size

        if end < text_length:
            # Walk back from the hard limit, but never past the midpoint of
            # the window, looking for a natural place to cut.
            for i in range(end, start + chunk_size // 2, -1):
                if text[i] in sentence_breaks:
                    end = i + 1
                    break

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            break

        next_start = end - overlap
        # A large overlap would move the window backwards (or not at all) and
        # loop forever; fall back to a non-overlapping step in that case.
        start = next_start if next_start > start else end

    return chunks
|
| 128 |
+
|
| 129 |
+
# 生成嵌入向量
|
| 130 |
+
def generate_embedding(text, client):
    """Request an embedding vector for ``text`` through ``client``.

    Args:
        text: The text to embed.
        client: Object exposing ``embeddings.create(model=..., input=...)``;
            the callers in this file pass an ``openai.OpenAI`` instance.

    Returns:
        The ``embedding`` of the first item in the response, or ``None`` if
        anything raises (the error is printed, not propagated).
    """
    try:
        embedding_response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002",
        )
        first_item = embedding_response.data[0]
        return first_item.embedding
    except Exception as error:
        print(f"生成嵌入向量失敗:{error}")
        return None
|
| 141 |
+
|
| 142 |
+
# 新增:生成摘要功能
|
| 143 |
+
def generate_summary(text, summary_type="教材", max_tokens=400):
    """Ask the chat model for a Traditional-Chinese summary of ``text``.

    Args:
        text: Content to summarise. Only the first 4000 characters are sent.
        summary_type: "教材" selects the teaching-material prompt; any other
            value selects the transcript prompt.
        max_tokens: Completion token cap forwarded to the API.

    Returns:
        The model's summary text. Returns "無內容可摘要。" when ``text`` is
        empty or whitespace, and a message starting with "摘要生成失敗:" when
        the request raises.
    """
    if not text or not text.strip():
        return "無內容可摘要。"

    try:
        client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        # Cap the excerpt so the request stays within the model's token
        # limit. This note used to sit inside the prompt string itself and
        # was therefore sent to the model as part of the material.
        excerpt = text[:4000]

        if summary_type == "教材":
            prompt = (
                "請用繁體中文將以下教材內容摘要成300字的概述,包含:\n"
                "1. 主要學科領域\n"
                "2. 核心概念和主題\n"
                "3. 學習重點\n"
                "4. 內容結構\n"
                "\n"
                "教材內容:\n"
                f"{excerpt}\n"
                "\n"
                "請提供簡潔且全面的概述:"
            )
        else:  # transcript
            prompt = (
                "請用繁體中文將以下錄音逐字稿摘要成300字,包含:\n"
                "1. 主要討論主題\n"
                "2. 重要觀點和概念\n"
                "3. 關鍵資訊摘要\n"
                "\n"
                "逐字稿內容:\n"
                f"{excerpt}\n"
                "\n"
                "請提供簡潔且重點突出的摘要:"
            )

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content

    except Exception as e:
        return f"摘要生成失敗:{str(e)}"
|
| 185 |
+
|
| 186 |
+
# 新增:生成逐字稿摘要
|
| 187 |
+
def generate_transcript_summary(transcript_content):
    """Summarise a transcript via ``generate_summary``.

    Args:
        transcript_content: Transcript text, possibly empty or ``None``.

    Returns:
        The result of ``generate_summary`` with ``summary_type="逐字稿"``, or
        "尚未有逐字稿內容" when the transcript is empty or whitespace.
    """
    has_content = bool(transcript_content and transcript_content.strip())
    if has_content:
        return generate_summary(transcript_content, summary_type="逐字稿")
    return "尚未有逐字稿內容"
|
| 193 |
+
|
| 194 |
+
# 智能領域分析函數
|
| 195 |
+
def extract_domain_keywords_from_materials(material_context):
    """Ask the chat model which subject domain the materials belong to.

    Args:
        material_context: Free-text description of the materials (file names
            and content samples, as assembled by the callers in this file).

    Returns:
        The dict produced by ``parse_domain_analysis`` for the model's reply.
        An empty dict is returned when ``material_context`` is falsy or when
        anything raises (the error is printed, not propagated).
    """
    if not material_context:
        return {}

    try:
        llm_client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        prompt = f"""分析以下教材檔案名稱和內容,提取出主要的學科領域和相關關鍵詞:

教材內容:{material_context}

請提供:
1. 主要學科領域(例如:計算機科學、物理學、化學、生物學、經濟學等)
2. 5-10個該領域的核心英文關鍵詞
3. 可能產生歧義的詞彙(如果有的話)

格式:
領域:[學科名稱]
關鍵詞:[keyword1, keyword2, keyword3, ...]
歧義詞:[ambiguous_term1, ambiguous_term2, ...]"""

        completion = llm_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=300,
        )
        reply_text = completion.choices[0].message.content
        return parse_domain_analysis(reply_text)

    except Exception as error:
        print(f"領域分析失敗:{error}")
        return {}
|
| 230 |
+
|
| 231 |
+
def parse_domain_analysis(analysis):
    """Parse the model's domain-analysis reply into a structured dict.

    The reply is scanned line by line for the labels ``領域`` (domain),
    ``關鍵詞`` (keywords) and ``歧義詞`` (ambiguous terms), each followed by a
    colon and a value. List values are comma separated and may be wrapped in
    square brackets. Both ASCII and full-width colons and commas are
    accepted, since a Chinese-language reply may use either form.

    Args:
        analysis: Raw reply text. ``None`` or an empty string yields the
            empty result.

    Returns:
        A dict with keys ``'domain'`` (str), ``'keywords'`` (list of str)
        and ``'ambiguous_terms'`` (list of str). Fields missing from the
        reply keep their empty defaults. Surrounding square brackets are
        removed from the domain as well as from list items.
    """
    result = {
        'domain': '',
        'keywords': [],
        'ambiguous_terms': []
    }
    if not analysis:
        return result

    def value_after_label(line, label):
        """Return the text after ``label`` and its colon, else ``None``."""
        if not line.startswith(label):
            return None
        remainder = line[len(label):].lstrip()
        if remainder[:1] not in (':', ':'):
            return None
        return remainder[1:].strip()

    def split_items(raw):
        """Split a comma-separated value into clean, non-empty items."""
        pieces = raw.replace(',', ',').split(',')
        cleaned = (piece.strip().strip('[]').strip() for piece in pieces)
        return [item for item in cleaned if item]

    for raw_line in analysis.splitlines():
        line = raw_line.strip()

        domain = value_after_label(line, '領域')
        if domain is not None:
            result['domain'] = domain.strip('[]').strip()
            continue

        keywords = value_after_label(line, '關鍵詞')
        if keywords is not None:
            result['keywords'] = split_items(keywords)
            continue

        ambiguous = value_after_label(line, '歧義詞')
        if ambiguous is not None:
            result['ambiguous_terms'] = split_items(ambiguous)

    return result
|
| 254 |
+
|
| 255 |
+
def get_alternative_meanings(term, domain):
    """Ask the chat model for words tied to other meanings of ``term``.

    The reply is lower-cased and split on whitespace; lines that are blank
    or start with "例如" are ignored, and only words longer than three
    characters are kept.

    Args:
        term: The ambiguous term.
        domain: The domain in which ``term`` has its intended meaning.

    Returns:
        At most five words to use as exclusion terms, or an empty list if
        anything raises (the error is printed, not propagated).
    """
    try:
        llm_client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        prompt = f"""詞彙 "{term}" 在 "{domain}" 領域有特定含義。
請列出這個詞彙在其他領域可能的含義或相關詞彙,用於排除不相關的搜尋結果。

請提供3-5個可能需要排除的相關詞彙:"""

        completion = llm_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=150,
        )

        reply_text = completion.choices[0].message.content
        candidate_words = [
            word
            for reply_line in reply_text.split('\n')
            if reply_line.strip() and not reply_line.startswith('例如')
            for word in reply_line.lower().split()
            if len(word) > 3
        ]
        return candidate_words[:5]

    except Exception as error:
        print(f"生成排除詞彙失敗:{error}")
        return []
|
| 284 |
+
|
| 285 |
+
# 網路搜尋功能
|
| 286 |
+
def extract_web_content(url, max_chars=2000):
    """Download a web page and return its visible text, whitespace-normalised.

    ``script`` and ``style`` elements are removed before the text is taken.
    The text is then split into fragments on line breaks and spaces, empty
    fragments are dropped, and the rest are joined with single spaces.

    Args:
        url: Address of the page to fetch (10 second timeout).
        max_chars: Maximum number of characters kept; longer text is cut to
            this length and "..." is appended.

    Returns:
        The cleaned text, or an empty string if anything raises (the error
        is printed, not propagated).
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()

        document = BeautifulSoup(page.content, 'html.parser')
        for tag in document(["script", "style"]):
            tag.decompose()

        raw_text = document.get_text()
        fragments = [
            fragment.strip()
            for raw_line in raw_text.splitlines()
            for fragment in raw_line.strip().split(" ")
        ]
        cleaned = ' '.join(fragment for fragment in fragments if fragment)

        if len(cleaned) <= max_chars:
            return cleaned
        return cleaned[:max_chars] + "..."
    except Exception as error:
        print(f"提取網頁內容失敗 {url}: {error}")
        return ""
|
| 313 |
+
|
| 314 |
+
def intelligent_web_search(query, material_context=""):
    """Run a web search steered by the domain of the uploaded materials.

    Steps:
      1. Derive domain info from ``material_context``.
      2. Append up to three domain keywords to the query.
      3. For every ambiguous term that occurs in the query (compared
         case-insensitively), collect exclusion words.
      4. Search (at most 8 raw results). Drop results whose title or body
         contains an exclusion word, or that mention none of the domain
         keywords when keywords are available.

    Args:
        query: The user's search query.
        material_context: Description of the materials, used for step 1.

    Returns:
        Up to three dicts with keys ``title``, ``url``, ``snippet`` and
        ``source`` (always ``'web_search'``). An empty list is returned if
        anything raises (the error is printed, not propagated).
    """
    try:
        # Step 1: analyse the domain of the materials.
        domain_info = extract_domain_keywords_from_materials(material_context)
        domain_keywords = domain_info.get('keywords')

        # Step 2: enrich the query with the leading domain keywords.
        enhanced_query = query
        if domain_keywords:
            enhanced_query = f"{query} {' '.join(domain_keywords[:3])}"

        # Step 3: gather exclusion words for ambiguous terms in the query.
        exclude_terms = []
        for ambiguous_term in domain_info.get('ambiguous_terms', []):
            if ambiguous_term.upper() in query.upper():
                exclude_terms.extend(
                    get_alternative_meanings(ambiguous_term, domain_info.get('domain', ''))
                )

        # Step 4: search and filter.
        hits = []
        for item in DDGS().text(enhanced_query, max_results=8):
            haystack = (item.get('title', '') + ' ' + item.get('body', '')).lower()

            # Skip results that mention one of the exclusion words.
            if any(term.lower() in haystack for term in exclude_terms):
                continue

            # Skip results that mention none of the domain keywords.
            if domain_keywords and not any(
                keyword.lower() in haystack for keyword in domain_keywords
            ):
                continue

            hits.append({
                'title': item.get('title', ''),
                'url': item.get('href', ''),
                'snippet': item.get('body', ''),
                'source': 'web_search'
            })
            if len(hits) >= 3:
                break

        return hits
    except Exception as error:
        print(f"智能網路搜尋失敗:{error}")
        return []
|
| 364 |
+
|
| 365 |
+
def enhanced_web_search_with_content(query, num_results=3):
    """Search the web with material-aware context and attach page text.

    The context string is built from the module-level
    ``processed_data_store``: the material file names, plus the first 200
    characters of each of the first three stored chunks.

    Args:
        query: The user's search query.
        num_results: Maximum number of search results to fetch content for.

    Returns:
        The search result dicts whose page content could be extracted, each
        extended with a ``content`` key. Results with empty content are
        dropped.
    """
    global processed_data_store

    # Assemble the material context from whatever has been processed so far.
    context_parts = []
    if processed_data_store:
        materials = processed_data_store.get("materials")
        if materials:
            file_names = ', '.join(material["filename"] for material in materials)
            context_parts.append(f"檔案:{file_names}. ")

        stored_chunks = processed_data_store.get("chunks")
        if stored_chunks:
            samples = ' '.join(chunk["content"][:200] for chunk in stored_chunks[:3])
            context_parts.append(f"內容範例:{samples}")
    material_context = "".join(context_parts)

    enriched = []
    for hit in intelligent_web_search(query, material_context)[:num_results]:
        page_text = extract_web_content(hit['url'])
        if page_text:
            hit['content'] = page_text
            enriched.append(hit)

    return enriched
|
| 393 |
+
|
| 394 |
+
# 檢查搜尋狀態變化
|
| 395 |
+
def check_search_status_change(current_message, history, current_search_enabled):
    """Detect a repeated question asked under a different web-search setting.

    Only the last three exchanges are inspected, newest first. A previous
    answer counts as web-backed when it contains the marker "網路來源:".

    Args:
        current_message: The question being asked now.
        history: Sequence of ``(user_message, bot_message)`` pairs.
        current_search_enabled: Whether web search is enabled for this turn.

    Returns:
        ``True`` if one of the inspected exchanges has the same question
        (ignoring case and surrounding whitespace) and its answer's
        web-backed status differs from ``current_search_enabled``; otherwise
        ``False``.
    """
    if not history:
        return False

    for past_question, past_answer in reversed(history[-3:]):
        if not past_question:
            continue
        if past_question.strip().lower() != current_message.strip().lower():
            continue
        if not past_answer:
            continue

        answered_with_web = "網路來源:" in past_answer
        # True when exactly one of "answered with web sources" and
        # "search enabled now" holds.
        if answered_with_web != bool(current_search_enabled):
            return True

    return False
|
| 411 |
+
|
| 412 |
+
# 修改:RAG 資料前處理主函數(新增摘要功能)
|
| 413 |
+
def process_rag_data(material_files, transcript_content):
    """Build the RAG data set from uploaded materials and a transcript.

    For every material file the text is extracted, chunked and embedded; the
    transcript (if any) is chunked and embedded the same way. A summary is
    generated for the combined materials and for the transcript. The result
    is stored in the module-level ``processed_data_store`` and also dumped
    to a temporary JSON file.

    Files whose extraction failed (the extractors return an error string
    instead of raising) or produced no text are skipped and listed in the
    report, so that error messages are never indexed as material content.

    Args:
        material_files: Iterable of uploaded file objects exposing ``.name``
            (the path on disk), or ``None``.
        transcript_content: Transcript text, possibly empty or ``None``.

    Returns:
        ``(report, json_path, material_summary, transcript_summary)``. On
        failure the report starts with "❌ RAG 前處理失敗:", ``json_path`` is
        ``None`` and both summaries are empty strings.
    """
    global processed_data_store

    # Prefixes of the strings the extract_* helpers return on failure.
    # "錯誤" is kept from the original check for backward compatibility.
    extraction_error_prefixes = (
        "錯誤",
        "不支援",
        "PDF 讀取錯誤",
        "DOCX 讀取錯誤",
        "PPTX 讀取錯誤",
    )

    try:
        client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        processed_data = {
            "timestamp": datetime.now().isoformat(),
            "materials": [],
            "transcript": None,
            "chunks": [],
            "material_summary": "",
            "transcript_summary": ""
        }

        # Accumulates the text of every material, used for the summary.
        all_material_content = ""
        # Human-readable notes about files that could not be used.
        skipped_files = []

        # Process the material files.
        for file in material_files or []:
            file_name = os.path.basename(file.name)
            file_content = extract_document_content(file.name)

            if file_content.startswith(extraction_error_prefixes):
                skipped_files.append(f"{file_name}({file_content})")
                continue
            if not file_content.strip():
                # Nothing to chunk or embed (e.g. no extractable text).
                skipped_files.append(f"{file_name}(未擷取到文字內容)")
                continue

            all_material_content += f"\n\n檔案:{file_name}\n{file_content}"
            material_chunks = chunk_text(file_content)

            processed_data["materials"].append({
                "filename": file_name,
                "content": file_content,
                "chunks": len(material_chunks),
                "type": "material"
            })

            for i, chunk in enumerate(material_chunks):
                embedding = generate_embedding(chunk, client)
                if embedding:
                    processed_data["chunks"].append({
                        "content": chunk,
                        "source": file_name,
                        "type": "material",
                        "chunk_id": f"{file_name}_chunk_{i+1}",
                        "embedding": embedding
                    })

        # Summarise the materials.
        if all_material_content.strip():
            processed_data["material_summary"] = generate_summary(all_material_content, "教材")

        # Process the transcript.
        if transcript_content and transcript_content.strip():
            transcript_chunks = chunk_text(transcript_content)

            processed_data["transcript"] = {
                "content": transcript_content,
                "chunks": len(transcript_chunks),
                "type": "transcript"
            }

            processed_data["transcript_summary"] = generate_summary(transcript_content, "逐字稿")

            for i, chunk in enumerate(transcript_chunks):
                embedding = generate_embedding(chunk, client)
                if embedding:
                    processed_data["chunks"].append({
                        "content": chunk,
                        "source": "錄音逐字稿",
                        "type": "transcript",
                        "chunk_id": f"transcript_chunk_{i+1}",
                        "embedding": embedding
                    })

        # Publish the result through the module-level store.
        processed_data_store = processed_data

        # delete=False: the file must outlive this function because its path
        # is returned to the caller.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json", encoding='utf-8') as temp_file:
            json.dump(processed_data, temp_file, ensure_ascii=False, indent=2)
            temp_file_path = temp_file.name

        # Build the processing report.
        total_materials = len(processed_data["materials"])
        total_chunks = len(processed_data["chunks"])
        has_transcript = processed_data["transcript"] is not None

        report = f"""✅ RAG 資料前處理完成!

📊 處理統計:
• 教材檔案數量:{total_materials} 個
• 逐字稿:{'已處理' if has_transcript else '無'}
• 總文字塊數:{total_chunks} 個
• 嵌入向量:已生成
• 摘要:已生成

📋 處理詳情:"""

        if processed_data["materials"]:
            report += "\n\n📚 教材檔案:"
            for material in processed_data["materials"]:
                report += f"\n • {material['filename']} ({material['chunks']} 個文字塊)"

        if skipped_files:
            report += "\n\n⚠️ 未處理的檔案:"
            for note in skipped_files:
                report += f"\n • {note}"

        if has_transcript:
            report += f"\n\n🎤 錄音逐字稿:{processed_data['transcript']['chunks']} 個文字塊"

        report += "\n\n💾 資料已準備完成,可前往 AI ChatBot 頁面進行問答!"

        return report, temp_file_path, processed_data["material_summary"], processed_data["transcript_summary"]

    except Exception as e:
        return f"❌ RAG 前處理失敗:{str(e)}", None, "", ""
|
| 526 |
+
|
| 527 |
+
# 處理函數
|
| 528 |
+
def handle_material_upload(files):
    """Describe the uploaded material files for display.

    Args:
        files: Sequence of uploaded file objects exposing ``.name`` (the
            path on disk), or a falsy value when nothing was uploaded.

    Returns:
        A message listing each file's base name and size in MB (two
        decimals), or "尚未上傳任何教材檔案" when ``files`` is falsy.
    """
    if not files:
        return "尚未上傳任何教材檔案"

    listing = [
        f"📄 {os.path.basename(entry.name)} ({os.path.getsize(entry.name) / (1024 * 1024):.2f} MB)"
        for entry in files
    ]
    header = f"已上傳 {len(files)} 個教材檔案:\n"
    return header + "\n".join(listing)
|
| 539 |
+
|
| 540 |
+
def handle_audio_transcription(audio_file, language):
    """Transcribe an uploaded recording and summarise the transcript.

    Args:
        audio_file: Path of the uploaded audio file, or a falsy value.
        language: Language code passed to ``transcribe_audio``.

    Returns:
        ``(status_message, transcript, transcript_file_path, summary)``.
        When no file was given, or ``transcribe_audio`` returns no file
        path, the first element carries the explanation and the remaining
        three are ``""``, ``None`` and ``""``.
    """
    if not audio_file:
        return "請先上傳錄音檔案", "", None, ""

    display_names = {
        "zh": "中文", "en": "英文", "ja": "日文", "ko": "韓文",
        "fr": "法文", "de": "德文", "es": "西班牙文"
    }
    # Unknown codes are shown as-is.
    language_label = display_names.get(language, language)

    transcript, transcript_path = transcribe_audio(audio_file, language)

    # No file path means transcription failed; ``transcript`` then holds the
    # error message returned by transcribe_audio.
    if not transcript_path:
        return transcript, "", None, ""

    status = f"✅ 轉錄完成!使用語言:{language_label}"
    summary = generate_transcript_summary(transcript)
    return status, transcript, transcript_path, summary
|
| 559 |
+
|
| 560 |
+
def handle_rag_processing(material_files, transcript_content):
    """Validate that there is something to process, then run the RAG pipeline.

    Args:
        material_files: Uploaded material files, possibly empty or ``None``.
        transcript_content: Transcript text, possibly empty or ``None``.

    Returns:
        The 4-tuple returned by ``process_rag_data``. When both inputs are
        falsy, an error message followed by ``None``, ``""`` and ``""``.
    """
    if material_files or transcript_content:
        return process_rag_data(material_files, transcript_content)
    return "❌ 請先上傳教材檔案或完成錄音轉錄", None, "", ""
|
| 565 |
+
|
| 566 |
+
# ChatBot 核心功能(保持不變)
|
| 567 |
+
def search_relevant_chunks(query, top_k=5):
    """Return the stored chunks most similar to ``query``.

    The query is embedded, compared by cosine similarity with every stored
    chunk that carries an ``embedding``, and the chunks are ranked from most
    to least similar.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk dicts from ``processed_data_store["chunks"]``.
        An empty list is returned when no data has been processed, when the
        query cannot be embedded, or when anything raises inside the search
        (the error is printed, not propagated).
    """
    global processed_data_store

    if processed_data_store is None or not processed_data_store["chunks"]:
        return []

    try:
        client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
        query_embedding = generate_embedding(query, client)
        if query_embedding is None:
            return []

        scored = [
            (chunk, cosine_similarity([query_embedding], [chunk["embedding"]])[0][0])
            for chunk in processed_data_store["chunks"]
            if "embedding" in chunk
        ]
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [chunk for chunk, _score in ranked[:top_k]]

    except Exception as error:
        print(f"搜尋相關內容失敗:{error}")
        return []
|
| 595 |
+
|
| 596 |
+
def enhanced_chatbot_with_web_search(message, history, enable_web_search=False):
    """Answer ``message`` from RAG chunks, optional web results and chat history.

    Args:
        message: The user's current question.
        history: Earlier turns as ``(user_message, bot_message)`` pairs. Only
            the last three pairs are considered, and only complete pairs are
            quoted in the prompt.
        enable_web_search: When true, web results are fetched and offered to
            the model alongside the material chunks.

    Returns:
        The model's answer followed by a search-status note and the footnote
        list when sources were found, or by a disclaimer when none were.
        Any exception is reported as an apology string instead of being
        raised.
    """
    try:
        client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        # Detect a repeated question asked under a different search setting.
        search_status_changed = check_search_status_change(message, history, enable_web_search)

        # Quote the most recent complete turns so the model can stay coherent.
        conversation_context = ""
        for user_msg, bot_msg in (history[-3:] if history else []):
            if user_msg and bot_msg:
                conversation_context += f"用戶:{user_msg}\n助理:{bot_msg}\n\n"

        # 1. Retrieve material/transcript chunks.
        rag_chunks = search_relevant_chunks(message, top_k=5)

        # 2. Name the loaded material files for the prompt.
        material_context = ""
        if processed_data_store and processed_data_store.get("materials"):
            material_names = [material["filename"] for material in processed_data_store["materials"]]
            material_context = f"教材檔案:{', '.join(material_names)}"

        # 3. Optional web search.
        web_results = []
        if enable_web_search:
            web_results = enhanced_web_search_with_content(message, num_results=3)

        # 4. Number every source once; the same number is used in the prompt
        #    context and in the footnote list appended to the answer.
        all_context_parts = []
        footnotes = []

        for chunk in rag_chunks or []:
            number = len(footnotes) + 1
            footnotes.append(f"[{number}] 教材來源:{chunk['source']}")
            all_context_parts.append(f"教材內容[{number}]:{chunk['content']}")

        for result in web_results or []:
            number = len(footnotes) + 1
            footnotes.append(f"[{number}] 網路來源:{result['title']} - {result['url']}")
            all_context_parts.append(f"網路內容[{number}]:{result['content'][:800]}...")

        # 5a. No source at all: fall back to general knowledge.
        if not all_context_parts:
            # The wording must match what actually happened: web search may
            # have been enabled and simply returned nothing, in which case
            # telling the user to enable it would be wrong.
            if enable_web_search:
                missing_info_rule = "1. 說明在用戶提供的教材中沒有找到相關資訊,且網路搜尋沒有找到可用的結果"
                next_step_rule = "4. 建議用戶可以換個方式提問或補充相關教材"
                disclaimer = "\n\n💡 **說明:** 在您的教材中沒有找到相關資訊,網路搜尋也沒有找到可用的結果。以上回答基於一般知識提供。建議換個方式提問或補充相關教材。"
            else:
                missing_info_rule = "1. 說明在用戶提供的教材中沒有找到相關資訊,且未進行網路搜尋"
                next_step_rule = "4. 建議用戶可以啟用網路搜尋獲得更多資訊"
                disclaimer = "\n\n💡 **說明:** 在您的教材中沒有找到相關資訊,且未啟用網路搜尋。以上回答基於一般知識提供。建議啟用「聯網搜尋」獲得更完整和最新的資訊。"

            system_prompt = f"""你是一個智能學習助理。用戶的問題在提供的教材中找不到相關內容,且未啟用網路搜尋或搜尋無結果。

教材上下文:{material_context}

對話歷史:
{conversation_context}

回答規則:
{missing_info_rule}
2. 基於一般知識提供有用的回答,但要說明這不是基於用戶的教材
3. 結合對話歷史,保持對話的連貫性
{next_step_rule}
5. 用繁體中文回答

請回答用戶的問題。"""

            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": message}
                ],
                temperature=0.8,
                max_tokens=1000
            )

            # content may be None; keep the concatenation below safe.
            answer = response.choices[0].message.content or ""
            return answer + disclaimer

        # 5b. At least one source is available.
        all_context = "\n\n".join(all_context_parts)

        if rag_chunks:
            search_status_info = f"\n當前搜尋設定:{'已啟用網路搜尋' if enable_web_search else '未啟用網路搜尋'}\n"

            # Extra instruction only when the user just flipped the toggle.
            status_change_instruction = ""
            if search_status_changed:
                if enable_web_search:
                    status_change_instruction = "\n重要:用戶剛剛啟用了網路搜尋功能,請提供包含網路搜尋結果的更全面回答,即使之前已經回答過類似問題。\n"
                else:
                    status_change_instruction = "\n重要:用戶剛剛關閉了網路搜尋功能,請僅基於教材內容回答,不要參考之前可能包含網路搜尋的回答。\n"

            system_prompt = f"""你是一個智能學習助理。請根據提供的教材內容、逐字稿、網路搜尋結果以及對話歷史來回答用戶的問題。

{search_status_info}
{status_change_instruction}

教材上下文:{material_context}

對話歷史:
{conversation_context}

可用資料來源:
{all_context}

重要回答規則:
1. **以教材為核心**:優先使用教材和逐字稿的內容作為回答基礎
2. **正確使用註腳**:在回答中使用對應的註腳編號 [1], [2], [3] 等來標註具體的資料來源
3. **教材優先原則**:當教材有相關內容時,必須以教材內容為主要回答依據
4. **網路資料處理**:
- 如果啟用網路搜尋:網路搜尋結果用於補充教材中沒有的細節或例子
- 如果未啟用網路搜尋:僅使用教材和逐字稿內容,不要參考可能的網路資訊
5. **避免歧義**:根據教材的領域和上下文來理解問題,不回答無關領域的內容
6. **具體例子**:提供具體例子時,優先使用教材中的例子,再補充網路資料(如果啟用)
7. **繁體中文回答**:使用繁體中文進行回答
8. **保持連貫性**:結合對話歷史,但要根據當前的搜尋設定調整回答內容
9. **重複問題處理**:如果是重複問題但搜尋設定改變,請提供符合當前設定的新回答

註腳使用說明:
- 每當引用特定資料來源時,必須在該句末尾加上對應的註腳編號
- 如果一個句子引用多個來源,可以使用多個註腳 [1][2]
- 確保註腳編號與實際提供的資料來源對應

請根據以上原則回答用戶的問題,並正確使用註腳標註。"""

        else:
            # Only web results are available.
            system_prompt = f"""你是一個智能學習助理。用戶的問題在教材中沒有找到相關內容,但有網路搜尋結果可供參考。

當前搜尋設定:已啟用網路搜尋

對話歷史:
{conversation_context}

網路搜尋資訊:
{all_context}

回答規則:
1. 說明在教材中沒有找到相關資訊
2. 基於網路搜尋結果提供有用的回答
3. 正確使用註腳標註網路來源
4. 結合對話歷史,保持對話的連貫性
5. 用繁體中文回答

請根據以上資訊回答用戶的問題。"""

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message}
            ],
            temperature=0.7,
            max_tokens=1500
        )

        # content may be None; keep the concatenations below safe.
        answer = response.choices[0].message.content or ""

        search_status_note = f"\n\n🔍 **搜尋狀態:** {'已啟用網路搜尋' if enable_web_search else '僅使用教材資料'}"

        if footnotes:
            footnote_section = "\n\n**參考資料:**\n" + "\n".join(footnotes)
            return answer + search_status_note + footnote_section
        return answer + search_status_note

    except Exception as e:
        return f"抱歉,處理您的問題時發生錯誤:{str(e)}"
|
| 782 |
+
|
| 783 |
+
# 修改:建立資料處理頁面(新增摘要欄位)
|
| 784 |
+
def create_data_processing_interface():
    """Build the data-processing page of the app.

    The page has a teaching-material upload column, an audio
    upload/transcription column, a RAG preprocessing section and usage notes.
    The transcript text is kept in a ``gr.State`` so the RAG step can read it.

    Returns:
        The ``gr.Blocks`` instance that contains the page.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as data_demo:
        gr.Markdown("# 🎓 AI 學習助理 - 資料處理")
        gr.Markdown("上傳教材檔案和錄音檔,進行 RAG 前處理後可前往 ChatBot 頁面進行問答!")

        # Carries the latest transcript from the transcription step to the RAG step.
        transcript_state = gr.State("")

        with gr.Row():
            # Left: teaching-material upload area
            with gr.Column(scale=1):
                gr.Markdown("## 📚 教材上傳區")

                material_files = gr.File(
                    label="上傳教材檔案",
                    file_count="multiple",
                    file_types=[".pdf", ".docx", ".ppt", ".pptx"],
                    height=200
                )

                material_status = gr.Textbox(
                    label="教材上傳狀態",
                    value="尚未上傳任何教材檔案",
                    interactive=False,
                    lines=5
                )

                material_files.change(
                    fn=handle_material_upload,
                    inputs=[material_files],
                    outputs=[material_status]
                )

            # Right: audio upload and transcription area
            with gr.Column(scale=1):
                gr.Markdown("## 🎤 錄音檔轉錄區")

                audio_file = gr.File(
                    label="上傳錄音檔案",
                    file_types=[".wav", ".mp3"],
                    height=100
                )

                language = gr.Dropdown(
                    label="選擇轉錄語言",
                    choices=[
                        ("中文", "zh"), ("英文", "en"), ("日文", "ja"), ("韓文", "ko"),
                        ("法文", "fr"), ("德文", "de"), ("西班牙文", "es")
                    ],
                    value="zh",
                    info="選擇錄音檔案的主要語言"
                )

                transcribe_btn = gr.Button("🔄 開始轉錄", variant="primary", size="lg")

                transcription_status = gr.Textbox(
                    label="轉錄狀態",
                    value="請上傳錄音檔案並選擇語言後點擊轉錄按鈕",
                    interactive=False,
                    lines=2
                )

                transcript_output = gr.Textbox(
                    label="逐字稿內容",
                    placeholder="轉錄完成後,逐字稿內容將顯示在這裡...",
                    interactive=False,
                    lines=6,
                    max_lines=10
                )

                # Transcript summary field
                transcript_summary = gr.Textbox(
                    label="📝 逐字稿摘要",
                    placeholder="轉錄完成後,AI 將自動生成逐字稿摘要...",
                    interactive=False,
                    lines=4,
                    max_lines=6
                )

                download_file = gr.File(label="下載逐字稿", visible=False)

                def transcribe_and_show_download(audio_file, language):
                    """Transcribe the upload and reveal the download link on success."""
                    status, content, file_path, summary = handle_audio_transcription(audio_file, language)

                    if file_path:
                        # The transcript is returned twice: once for display,
                        # once for the state that feeds the RAG step.
                        return status, content, content, summary, gr.update(visible=True, value=file_path)
                    return status, content, "", "", gr.update(visible=False, value=None)

                transcribe_btn.click(
                    fn=transcribe_and_show_download,
                    inputs=[audio_file, language],
                    outputs=[transcription_status, transcript_output, transcript_state, transcript_summary, download_file]
                )

        # RAG preprocessing area
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🔄 RAG 資料前處理")
                gr.Markdown("將上傳的教材和轉錄的逐字稿進行前處理,準備用於 AI ChatBot")

                with gr.Row():
                    rag_process_btn = gr.Button(
                        "🚀 開始 RAG 前處理",
                        variant="secondary",
                        size="lg",
                        scale=2
                    )

                rag_status = gr.Textbox(
                    label="RAG 處理狀態",
                    value="準備就緒,點擊按鈕開始處理教材和逐字稿",
                    interactive=False,
                    lines=6
                )

                # Material overview field
                material_overview = gr.Textbox(
                    label="📖 教材概述",
                    placeholder="RAG 前處理完成後,AI 將自動生成教材概述...",
                    interactive=False,
                    lines=6,
                    max_lines=8
                )

                rag_download = gr.File(label="下載處理結果 (JSON)", visible=False)

                def process_and_show_result(material_files, transcript_content):
                    """Run RAG preprocessing and reveal the JSON download on success."""
                    # The fourth value (transcript summary) is not shown on
                    # this section, so it is deliberately discarded.
                    status, file_path, material_summary, _ = handle_rag_processing(material_files, transcript_content)

                    if file_path:
                        return status, material_summary, gr.update(visible=True, value=file_path)
                    return status, "", gr.update(visible=False, value=None)

                rag_process_btn.click(
                    fn=process_and_show_result,
                    inputs=[material_files, transcript_state],
                    outputs=[rag_status, material_overview, rag_download]
                )

        with gr.Row():
            gr.Markdown("""
### 📋 使用說明
- **教材檔案**:支援 PDF、DOCX、PPT、PPTX 格式
- **錄音檔案**:支援 WAV、MP3 格式,檔案大小限制 25MB
- **轉錄語言**:支援中文、英文、日文、韓文、法文、德文、西班牙文
- **智能摘要**:AI 會自動生成逐字稿摘要和教材概述
- **RAG 前處理**:將教材和逐字稿分塊處理並生成嵌入向量
- **完成處理後**:前往 **AI ChatBot** 頁面進行智能問答
""")

    return data_demo
|
| 936 |
+
|
| 937 |
+
# 建立增強版 ChatBot 頁面(保持不變)
|
| 938 |
+
def create_enhanced_chatbot_interface():
    """Build the chatbot page: data status, web-search toggle and chat window.

    Returns:
        The ``gr.Blocks`` instance that contains the page.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as chatbot_demo:
        gr.Markdown("# 🤖 AI ChatBot")
        gr.Markdown("整合 RAG 資料和網路搜尋的智能學習助理,具備連貫對話能力")

        def check_data_status():
            """Describe what is currently held in ``processed_data_store``."""
            store = processed_data_store
            if store is None:
                return "❌ 尚未處理任何資料,請先前往「資料處理」頁面上傳並處理教材或錄音檔"

            chunk_total = len(store["chunks"])
            material_total = len(store["materials"])
            transcript_flag = '含' if store["transcript"] is not None else '不含'
            return f"✅ 資料已載入!共 {chunk_total} 個文字塊({material_total} 個教材檔案,{transcript_flag}逐字稿)"

        # Status box, filled once at build time and again on demand.
        data_status = gr.Textbox(
            label="資料狀態",
            value=check_data_status(),
            interactive=False,
            lines=2,
        )

        refresh_btn = gr.Button("🔄 重新檢查資料狀態", variant="secondary")
        refresh_btn.click(fn=check_data_status, outputs=[data_status])

        # Web-search switch
        with gr.Row():
            web_search_toggle = gr.Checkbox(
                label="🌐 啟用聯網搜尋",
                value=False,
                info="啟用後會搜尋網路資料並整合到回答中(狀態變化會影響重複問題的回答)",
            )

        # Chat window and input
        chatbot = gr.Chatbot(
            label="增強版 AI 學習助理",
            height=500,
            placeholder="請輸入您的問題,我會根據教材、逐字稿和網路搜尋(如啟用)來回答...",
        )

        msg = gr.Textbox(
            label="輸入問題",
            placeholder="例如:這份教材的主要重點是什麼?或者:最新的相關發展有哪些?",
            lines=2,
        )

        with gr.Row():
            send_btn = gr.Button("💬 發送", variant="primary", scale=2)
            clear_btn = gr.Button("🗑️ 清除對話", variant="secondary", scale=1)

        def user_message(message, history):
            """Clear the input box and queue the question as an unanswered turn."""
            is_blank = not message.strip()
            updated_history = history if is_blank else history + [[message, None]]
            return "", updated_history

        def bot_message(history, web_search_enabled):
            """Fill in the reply for the last turn when it is still unanswered."""
            awaiting_reply = history and history[-1][1] is None
            if not awaiting_reply:
                return history

            pending_question = history[-1][0]
            earlier_turns = history[:-1]
            history[-1][1] = enhanced_chatbot_with_web_search(
                pending_question,
                earlier_turns,
                enable_web_search=web_search_enabled,
            )
            return history

        # Enter in the textbox and the send button trigger the same two-step
        # chain: record the question, then generate the reply.
        for trigger in (msg.submit, send_btn.click):
            trigger(
                user_message,
                [msg, chatbot],
                [msg, chatbot],
                queue=False,
            ).then(
                bot_message,
                [chatbot, web_search_toggle],
                chatbot,
            )

        clear_btn.click(lambda: [], None, chatbot, queue=False)

        # Usage notes
        with gr.Row():
            gr.Markdown("""
### 💡 功能特色
- **多重資料來源**:整合您的教材、逐字稿和網路搜尋結果
- **智能領域分析**:自動識別教材領域,進行相關搜尋
- **正確註腳標註**:清楚標明每個回答的資料來源和網址
- **連貫對話記憶**:記住對話歷史,支援連續問答
- **動態搜尋切換**:可隨時切換網路搜尋,重複問題會給出不同回答
- **教材優先策略**:以教材內容為核心,網路資料作補充

### 📝 使用建議
- **基礎問題**:關於教材內容的問題,可不啟用網路搜尋
- **延伸問題**:需要最新資訊或更多例子時,建議啟用網路搜尋
- **連續對話**:可以說「剛才你提到...」來參考之前的回答
- **搜尋切換**:同一問題可切換搜尋狀態獲得不同深度的回答

### 🔍 搜尋策略
- 🏠 **僅教材資料**:快速回答,基於您的專屬內容
- 🌐 **教材 + 網路搜尋**:全面回答,整合多重資料來源
- 📚 **智能註腳**:每個回答都會標明具體的資料來源
- 🎯 **領域相關**:自動過濾無關領域的搜尋結果
- 🔄 **狀態感知**:系統會檢測搜尋設定變化並調整回答
""")

    return chatbot_demo
|
| 1059 |
+
|
| 1060 |
+
# 新增播客生成相關函數
|
| 1061 |
+
def generate_podcast_script(materials, transcript):
    """Ask the chat model for a podcast script based on materials and transcript.

    Args:
        materials: Iterable of dicts with ``filename`` and ``content`` keys
            (may be empty/None).
        transcript: Transcript text (may be empty/None).

    Returns:
        The generated script text, or a message starting with
        ``Podcast文稿生成失敗`` when anything goes wrong.
    """
    try:
        client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        # Collect every source as its own labelled section.
        sections = []
        material_titles = []

        for material in (materials if materials else ()):
            title = material['filename']
            material_titles.append(title)
            sections.append(f"\n教材檔案:{title}\n{material['content']}\n")

        if transcript:
            sections.append(f"\n錄音逐字稿:\n{transcript}\n")

        # Cap the input length to stay clear of the token limit.
        content_for_prompt = "".join(sections)[:6000]

        titles_line = ', '.join(material_titles) if material_titles else '無'
        transcript_flag = '是' if transcript else '否'

        prompt = f"""請根據以下教材和逐字稿內容,撰寫一篇約1500字的播客節目文稿,目標播放時間約10分鐘。

教材檔案:{titles_line}
是否包含逐字稿:{transcript_flag}

內容:
{content_for_prompt}

請按照以下格式撰寫播客文稿:

1. **開場白**:簡潔有趣的開場,介紹本期主題
2. **主要內容**:
- 將教材內容轉化為口語化、易懂的說明
- 適當加入提問和思考點
- 使用生活化的比喻和例子
- 保持邏輯清晰的結構
3. **重點總結**:歸納核心概念和要點
4. **結尾**:簡短的總結和下期預告

要求:
- 使用繁體中文
- 語調親切自然,適合口語播報
- 內容深入淺出,適合學習者理解
- 約1500字,播放時間約10分鐘
- 適當加入停頓提示(用...表示)

請開始撰寫播客文稿:"""

        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=2000,
        )
        return completion.choices[0].message.content

    except Exception as e:
        return f"Podcast文稿生成失敗:{str(e)}"
|
| 1121 |
+
|
| 1122 |
+
def generate_podcast_audio(script_text, voice="alloy", speed=0.9):
    """Convert a podcast script to an MP3 file with the OpenAI TTS API.

    Args:
        script_text: Text to synthesize.
        voice: Voice name passed to the API. Defaults to ``"alloy"``, the
            value this function previously hard-coded.
        speed: Speed passed to the API. Defaults to ``0.9``, the value this
            function previously hard-coded.

    Returns:
        Path of a temporary ``.mp3`` file holding the audio, or ``None`` when
        the script is blank or generation fails. The file is created with
        ``delete=False``, so removing it is left to the caller.
    """
    # Nothing to read aloud: skip the API call entirely.
    if not script_text or not script_text.strip():
        return None

    try:
        client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

        response = client.audio.speech.create(
            model="gpt-4o-mini-tts",
            voice=voice,
            input=script_text,
            speed=speed
        )

        # Persist the audio so the UI can serve it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            temp_audio.write(response.content)
            temp_audio_path = temp_audio.name

        return temp_audio_path

    except Exception as e:
        print(f"音頻生成失敗:{str(e)}")
        return None
|
| 1145 |
+
|
| 1146 |
+
def estimate_reading_time(text, chars_per_minute=175):
    """Estimate how long ``text`` takes to read aloud.

    Args:
        text: Script text. ``None`` is treated as empty.
        chars_per_minute: Assumed speaking rate in characters per minute.
            Defaults to 175, the midpoint of the 150-200 range used for
            Chinese by the original implementation.

    Returns:
        ``(character_count, estimated_minutes)``. The count is ``len(text)``,
        so whitespace and punctuation are included.

    Raises:
        ValueError: If ``chars_per_minute`` is not positive.
    """
    if chars_per_minute <= 0:
        raise ValueError("chars_per_minute must be positive")

    char_count = len(text) if text else 0
    return char_count, char_count / chars_per_minute
|
| 1152 |
+
|
| 1153 |
+
# 建立播客生成頁面
|
| 1154 |
+
def create_podcast_page():
    """Build the podcast page: script generation on the left, audio on the right.

    The script is generated from the materials and transcript held in
    ``processed_data_store`` and can be edited before it is converted to
    audio.

    Returns:
        The ``gr.Blocks`` instance that contains the page.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as podcast_demo:
        gr.Markdown("# 🎙️ AI Podcast生成器")
        gr.Markdown("根據您上傳的教材和錄音逐字稿生成約10分鐘的Podcast,包含文稿和音頻")

        def check_podcast_data_status():
            """Describe what is currently held in ``processed_data_store``."""
            if processed_data_store is None:
                return "❌ 尚未處理任何資料,請先前往「資料處理」頁面完成前處理"

            materials_count = len(processed_data_store["materials"])
            has_transcript = processed_data_store["transcript"] is not None
            return f"✅ 資料已準備!{materials_count} 個教材檔案,{'含' if has_transcript else '不含'}逐字稿"

        # Data status display
        data_status = gr.Textbox(
            label="資料狀態",
            value=check_podcast_data_status(),
            interactive=False,
            lines=2
        )

        refresh_data_btn = gr.Button("🔄 重新檢查資料狀態", variant="secondary")
        refresh_data_btn.click(fn=check_podcast_data_status, outputs=[data_status])

        with gr.Row():
            # Left: script generation
            with gr.Column(scale=1):
                gr.Markdown("## 📝 Podcast文稿生成")

                generate_script_btn = gr.Button(
                    "🚀 生成文稿",
                    variant="primary",
                    size="lg"
                )

                script_status = gr.Textbox(
                    label="生成狀態",
                    value="點擊按鈕開始生成文稿",
                    interactive=False,
                    lines=2
                )

                podcast_script = gr.Textbox(
                    label="文稿",
                    placeholder="文稿將在這裡顯示...",
                    interactive=True,  # the user may edit the script before audio generation
                    lines=15,
                    max_lines=20
                )

                script_info = gr.Textbox(
                    label="文稿資訊",
                    interactive=False,
                    lines=2
                )

                download_script = gr.File(label="下載文稿 (TXT)", visible=False)

            # Right: audio generation
            with gr.Column(scale=1):
                gr.Markdown("## 🎵 音頻生成")

                voice_selection = gr.Dropdown(
                    label="選擇語音",
                    choices=[
                        ("Alloy - 中性聲音", "alloy"),
                        ("Echo - 男性聲音", "echo"),
                        ("Fable - 英式男性", "fable"),
                        ("Onyx - 深沉男性", "onyx"),
                        ("Nova - 年輕女性", "nova"),
                        ("Shimmer - 溫和女性", "shimmer")
                    ],
                    value="alloy",
                    info="選擇適合的聲音"
                )

                speed_control = gr.Slider(
                    label="播放速度",
                    minimum=0.5,
                    maximum=2.0,
                    value=0.9,
                    step=0.1,
                    info="調整播放速度(0.9倍適合學習)"
                )

                generate_audio_btn = gr.Button(
                    "🎤 生成音頻",
                    variant="secondary",
                    size="lg"
                )

                audio_status = gr.Textbox(
                    label="音頻生成狀態",
                    value="請先生成文稿,然後點擊生成音頻",
                    interactive=False,
                    lines=2
                )

                podcast_audio = gr.Audio(
                    label="播客音頻",
                    visible=False
                )

                download_audio = gr.File(label="下載音頻 (MP3)", visible=False)

        def generate_script():
            """Generate the script and return (status, script, info, download update)."""
            if processed_data_store is None:
                return "❌ 請先完成資料前處理", "", "", gr.update(visible=False)

            try:
                materials = processed_data_store.get("materials", [])
                transcript_content = ""
                if processed_data_store.get("transcript"):
                    transcript_content = processed_data_store["transcript"]["content"]

                if not materials and not transcript_content:
                    return "❌ 沒有可用的教材或逐字稿內容", "", "", gr.update(visible=False)

                script = generate_podcast_script(materials, transcript_content)

                # The generator signals failure by returning a message rather
                # than raising. Its message starts with "Podcast文稿生成失敗",
                # so checking only for "播客文稿生成失敗" let error text through
                # as if it were a finished script. Both prefixes are accepted.
                if not script:
                    return "❌ 生成失敗:未取得文稿內容", "", "", gr.update(visible=False)
                if script.startswith(("播客文稿生成失敗", "Podcast文稿生成失敗")):
                    return script, "", "", gr.update(visible=False)

                # Script statistics
                word_count, estimated_time = estimate_reading_time(script)
                info_text = f"字數:{word_count} 字 | 預估播放時間:{estimated_time:.1f} 分鐘"

                # Save the script so it can be offered as a download.
                with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".txt", encoding='utf-8') as temp_file:
                    temp_file.write(script)
                    temp_file_path = temp_file.name

                return "✅ 播客文稿生成完成!", script, info_text, gr.update(visible=True, value=temp_file_path)

            except Exception as e:
                return f"❌ 生成失敗:{str(e)}", "", "", gr.update(visible=False)

        def generate_audio(script, voice, speed):
            """Synthesize the (possibly edited) script and return (status, player update, download update)."""
            if not script or not script.strip():
                return "❌ 請先生成播客文稿", gr.update(visible=False), gr.update(visible=False)

            try:
                client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

                response = client.audio.speech.create(
                    model="gpt-4o-mini-tts",
                    voice=voice,
                    input=script,
                    speed=speed
                )

                # Save the audio; the same file backs the player and the download.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
                    temp_audio.write(response.content)
                    temp_audio_path = temp_audio.name

                return "✅ 音頻生成完成!可以播放和下載", gr.update(visible=True, value=temp_audio_path), gr.update(visible=True, value=temp_audio_path)

            except Exception as e:
                return f"❌ 音頻生成失敗:{str(e)}", gr.update(visible=False), gr.update(visible=False)

        # Event wiring
        generate_script_btn.click(
            fn=generate_script,
            outputs=[script_status, podcast_script, script_info, download_script]
        )

        generate_audio_btn.click(
            fn=generate_audio,
            inputs=[podcast_script, voice_selection, speed_control],
            outputs=[audio_status, podcast_audio, download_audio]
        )

        # Usage notes
        with gr.Row():
            gr.Markdown("""
### 📋 使用說明
- **資料準備**:確保已在「資料處理」頁面完成教材和逐字稿的前處理
- **文稿生成**:AI 會根據您的教材內容生成約1500字的播客文稿
- **文稿編輯**:生成後可以直接在文稿框中編輯內容
- **音頻生成**:選擇合適的聲音和速度,將文稿轉換為音頻
- **下載功能**:可分別下載文稿(TXT)和音頻(MP3)檔案

### 🎯 播客特色
- **口語化表達**:適合播客的自然語調和節奏
- **結構清晰**:包含開場、主要內容、總結和結尾
- **學習導向**:深入淺出,適合教育用途
- **時長適中**:約10分鐘,適合碎片化學習
""")

    return podcast_demo
|
| 1352 |
+
|
| 1353 |
+
# 修改主應用建立函數,新增播客頁面
|
| 1354 |
+
def create_complete_app():
    """Assemble the three pages into one tabbed Gradio app and return it."""
    # (tab label, page builder) pairs, in display order.
    tabs = [
        ("📚 資料處理", create_data_processing_interface),
        ("🤖 AI ChatBot", create_enhanced_chatbot_interface),
        ("🎙️ AI Podcast生成器", create_podcast_page),
    ]

    pages = [build_page() for _, build_page in tabs]
    labels = [label for label, _ in tabs]

    return gr.TabbedInterface(pages, labels, title="🎓 AI Learning Hub")
|
| 1368 |
+
|
| 1369 |
+
# 啟動應用程式
|
| 1370 |
+
if __name__ == "__main__":
    # Build the tabbed app and serve it with a share link and debug output.
    launch_options = {"share": True, "debug": True}
    app = create_complete_app()
    app.launch(**launch_options)
|